# Dataset creation for both binary and multiclass TCGA classification
This notebook creates the dataset necessary to perform the classification tasks

In [1]:
import pandas as pd
import os

  from pandas.core import (


### Function to load the txt files

In [2]:
def read_txt_to_df(file_path):
    """
    Reads a text file and converts it into a pandas DataFrame.
    
    Args:
        file_path (str): Path to the text file.
    
    Returns:
        pd.DataFrame: DataFrame containing the data from the text file.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Extract header
    header = lines[0].strip().split('\t')
    
    # Extract data
    data = [line.strip().split('\t') for line in lines[1:]]
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=header)
    
    return df

In [3]:
mutations_folder_path='/Users/filippofocaccia/Desktop/MLLab-Bio/mutations'
mrna_folder_path='/Users/filippofocaccia/Desktop/MLLab-Bio/mrna'
mrna_bladder=read_txt_to_df(os.path.join(mrna_folder_path,'data_mrna_seq_v2_rsem_bladder.txt'))
mrna_breast=read_txt_to_df(os.path.join(mrna_folder_path,'data_mrna_seq_v2_rsem_breast.txt'))
mrna_ovarian=read_txt_to_df(os.path.join(mrna_folder_path,'data_mrna_seq_v2_rsem_ovarian.txt'))
mrna_stomach=read_txt_to_df(os.path.join(mrna_folder_path,'data_mrna_seq_v2_rsem_stomach.txt'))
mrna_head_neck=read_txt_to_df(os.path.join(mrna_folder_path,'data_mrna_seq_v2_rsem_head_neck.txt'))
mrna_brain=read_txt_to_df(os.path.join(mrna_folder_path,'data_mrna_seq_v2_rsem_brain.txt'))
mutations_bladder=read_txt_to_df(os.path.join(mutations_folder_path,'data_mutations_bladder.txt'))
mutations_breast=read_txt_to_df(os.path.join(mutations_folder_path,'data_mutations_breast.txt'))
mutations_ovarian=read_txt_to_df(os.path.join(mutations_folder_path,'data_mutations_ovarian.txt'))
mutations_stomach=read_txt_to_df(os.path.join(mutations_folder_path,'data_mutations_stomach.txt'))
mutations_head_neck=read_txt_to_df(os.path.join(mutations_folder_path,'data_mutations_head_neck.txt'))
mutations_brain=read_txt_to_df(os.path.join(mutations_folder_path,'data_mutations_brain.txt'))


### The datasets all have different columns therefore let's consider only the ones we care about

In [117]:
mutations_bladder= mutations_bladder[['Hugo_Symbol','Variant_Type','Tumor_Sample_Barcode','Variant_Classification']]
mutations_brain= mutations_brain[['Hugo_Symbol','Variant_Type','Tumor_Sample_Barcode','Variant_Classification']]
mutations_stomach= mutations_stomach[['Hugo_Symbol','Variant_Type','Tumor_Sample_Barcode','Variant_Classification']]
mutations_breast= mutations_breast[['Hugo_Symbol','Variant_Type','Tumor_Sample_Barcode','Variant_Classification']]
mutations_ovarian= mutations_ovarian[['Hugo_Symbol','Variant_Type','Tumor_Sample_Barcode','Variant_Classification']]
mutations_head_neck= mutations_head_neck[['Hugo_Symbol','Variant_Type','Tumor_Sample_Barcode','Variant_Classification']]

### Let's create some auxiliary functions to process mutations and mrnas, let's also delete duplicate patients from the mutations' datasets

In [118]:
def process_mutations(mutations_df):

    processed_rows = []
    #we create the binary column
    mutations_df['is_tp53'] = mutations_df['Hugo_Symbol'].apply(lambda x: 1 if x == 'TP53' else 0)

    #each patient has a unique Tumor_Sample_Barcode however in each original dataset we have multiple rows for each patient
    #we need to keep only one row for each patient and if the patient has TP53 mutations we need to keep only one of them
    #if the patient has more than one TP53 mutation we keep only one of them
    #if the patient has no TP53 mutations we keep the first row
    
    unique_patients = list(mutations_df['Tumor_Sample_Barcode'].unique())
    for patient in unique_patients:
        patient_genes = mutations_df.loc[mutations_df['Tumor_Sample_Barcode'] == patient]
        if 'TP53' in patient_genes['Hugo_Symbol'].values:
            patient_genes = patient_genes.loc[patient_genes['Hugo_Symbol'] == 'TP53']
            if len(patient_genes) > 1:
                patient_genes = patient_genes.iloc[0:1]
        else:
            # keep the first row and drop the rest
            patient_genes = patient_genes.iloc[0:1]
        processed_rows.append(patient_genes)
    
    processed_df = pd.concat(processed_rows, ignore_index=True)
    return processed_df

In [119]:
# Check for duplicates in each mutations dataset
datasets = [mutations_bladder, mutations_breast, mutations_ovarian, mutations_stomach, mutations_head_neck, mutations_brain]
for i, dataset in enumerate(datasets, start=1):
    if dataset['Tumor_Sample_Barcode'].duplicated().any():
        mutations_bladder = mutations_bladder.drop_duplicates('Tumor_Sample_Barcode')

In [120]:
def process_mrna(df):
    " This function processes the mRNA data by removing a useless column and "
    "transposing the DataFrame to have genes as columns and patients as rows. "
    "This allows for merge with the mutations data and standardization"
    
    df.drop(columns=['Entrez_Gene_Id'])
    df = df.transpose()
    df.columns = df.iloc[0]
    df = df[1:]
    df = df.reset_index()
    df = df.rename(columns={'index': 'Tumor_Sample_Barcode'})
    return df

In [121]:
def concatenate(df1, df2):
    "This function concatenates two DataFrames, ensuring they have the same columns and "
    "the Tumor_Sample_Barcode column is preserved."

    # Ensure both DataFrames have the same columns
    df1_tumor_sample= df1['Tumor_Sample_Barcode']
    df2_tumor_sample= df2['Tumor_Sample_Barcode']

    common_columns = set(df1.columns).intersection(set(df2.columns))
    common_columns = list(common_columns) 

    df1 = df1[common_columns]
    df2 = df2[common_columns]

    df1['Tumor_Sample_Barcode']= df1_tumor_sample
    df2['Tumor_Sample_Barcode']= df2_tumor_sample
    
    # Concatenate the DataFrames
    concatenated_df = pd.concat([df1, df2], axis=0, ignore_index=True)
    return concatenated_df

### We can finally create the whole dataset containing all cancer data for each patient and the mutations

In [122]:
mutations = [mutations_bladder, mutations_breast, mutations_ovarian, mutations_stomach, mutations_head_neck, mutations_brain]
mrna = [mrna_bladder, mrna_breast, mrna_ovarian, mrna_stomach, mrna_head_neck, mrna_brain]

# Initialize the final concatenated dataset
final_conc = None

for i in range(len(mutations)):
    if i == 0:
        mutation_processed_1 = process_mutations(mutations[i])
        mrna_processed_1 = process_mrna(mrna[i])
        combined = pd.merge(mutation_processed_1, mrna_processed_1, on='Tumor_Sample_Barcode', how='inner')
        mutation_processed_2 = process_mutations(mutations[i+1])
        mrna_processed_2 = process_mrna(mrna[i+1])
        combined2= pd.merge(mutation_processed_2, mrna_processed_2, on='Tumor_Sample_Barcode', how='inner')
        first_concat= concatenate(combined, combined2)
    else:
        mutation_processed = process_mutations(mutations[i])
        mrna_processed = process_mrna(mrna[i])
        combined3 = pd.merge(mutation_processed, mrna_processed, on='Tumor_Sample_Barcode', how='inner')
        concat= concatenate(first_concat, combined3)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Tumor_Sample_Barcode']= df1_tumor_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Tumor_Sample_Barcode']= df2_tumor_sample
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Tumor_Sample_Barcode']= df1_tumor_sample
A value is trying to be set on a copy of a slice from a DataFrame.


In [123]:
concat.to_csv('data/final_dataset_TCGA.csv', index=False)