In [3]:
# Imports

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Data Frame 1

In [6]:
# Read the tab-separated clinical table for the first data frame
glioma_df1 = pd.read_csv(
    'data/patient_data/difg_glass_2019_clinical_data.tsv',
    sep='\t',
)

# Pair every column name with its positional index for reference
[(position, name) for position, name in enumerate(glioma_df1.columns)]

[(0, 'Study ID'),
 (1, 'Patient ID'),
 (2, 'Sample ID'),
 (3, '1p19q Status'),
 (4, 'Age at First Diagnosis'),
 (5, 'Aliquot Barcode'),
 (6, 'Alkylating Agent Treatment'),
 (7, 'Aneuploidy Amplification Score'),
 (8, 'Aneuploidy Deletion Score'),
 (9, 'Aneuploidy Score'),
 (10, 'Cancer Type'),
 (11, 'Cancer Type Detailed'),
 (12, 'Project Type'),
 (13, 'Chemotherapy Other'),
 (14, 'Chemotherapy Other Cycles'),
 (15, 'Comments'),
 (16, 'Concurrent Chemoradiotherapy'),
 (17, 'Extent of Resection'),
 (18, 'Fraction Genome Altered'),
 (19, 'Gene Panel'),
 (20, 'Grade'),
 (21, 'Histology'),
 (22, 'Recurrent Tumor Hypermutation Status'),
 (23, 'IDH-codel Subtype'),
 (24, 'IDH Status'),
 (25, 'Initial Grade'),
 (26, 'Initial Tumor Histology'),
 (27, 'MGMT Methylation Status'),
 (28, 'MGMT Methylation Method'),
 (29, 'MNP Classification'),
 (30, 'Mutation Count'),
 (31, 'Oncotree Code'),
 (32, 'Overall Survival (Months)'),
 (33, 'Overall Survival Status'),
 (34, 'Post Recurrence Survival(month

In [11]:
# Extract valuable features
#
# Columns are picked by label instead of by position: the selection then
# documents itself and cannot silently grab a different field if the column
# order of the source file ever changes. The order of the labels below is the
# column order of the resulting frame.
df_1 = glioma_df1[
    [
        'Cancer Type',
        'Cancer Type Detailed',
        'Grade',
        'Tumor Location',
        'Sex',
        'Age at First Diagnosis',
    ]
].copy()  # independent copy, so later edits to df_1 never touch glioma_df1
df_1.head(3)

Unnamed: 0,Cancer Type,Cancer Type Detailed,Grade,Tumor Location,Sex,Age at First Diagnosis
0,Glioma,Glioblastoma,IV,,Male,71.0
1,Glioma,Glioblastoma,IV,Occipital lobe,Male,71.0
2,Glioma,Glioblastoma,IV,,Male,61.0


In [37]:
# Rename columns where necessary so the frame uses short, shared names
df_1 = df_1.rename(
    columns={
        'Age at First Diagnosis': 'Age',
        'Grade': 'Tumor Grade',
    }
)
df_1

Unnamed: 0,Cancer Type,Cancer Type Detailed,Tumor Grade,Tumor Location,Sex,Age
0,Glioma,Glioblastoma,IV,,Male,71.0
1,Glioma,Glioblastoma,IV,Occipital lobe,Male,71.0
2,Glioma,Glioblastoma,IV,,Male,61.0
3,Glioma,Glioblastoma,IV,Temporal lobe,Male,61.0
4,Glioma,Glioblastoma,IV,,Male,57.0
...,...,...,...,...,...,...
439,Glioma,Diffuse Astrocytoma,II,,Male,29.0
440,Glioma,Diffuse Astrocytoma,II,,Male,27.0
441,Glioma,Diffuse Astrocytoma,II,,Male,27.0
442,Glioma,Anaplastic Astrocytoma,III,,Female,42.0


# Data Frame 2

In [43]:
# Read the tab-separated clinical table for the second data frame
glioma_df2 = pd.read_csv(
    'data/patient_data/difg_glass_clinical_data.tsv',
    sep='\t',
)

# Pair every column name with its positional index for reference
[(position, name) for position, name in enumerate(glioma_df2.columns)]

[(0, 'Study ID'),
 (1, 'Patient ID'),
 (2, 'Sample ID'),
 (3, 'Age'),
 (4, 'Analysis Type'),
 (5, 'Aklylating Agents Treatment'),
 (6, 'Cancer Type'),
 (7, 'Cancer Type Detailed'),
 (8, 'Project ID'),
 (9, 'Codeletion Status'),
 (10, 'DNA Aliquot Barcode'),
 (11, 'ESTIMATE Score'),
 (12, 'Fraction Genome Altered'),
 (13, 'Histology'),
 (14, 'IDH and Codeletion Status'),
 (15, 'IDH Status'),
 (16, 'Immune Score'),
 (17, 'MGMT Methylation'),
 (18, 'MGMT Methylation Method'),
 (19, 'Mutation Count'),
 (20, 'Oncotree Code'),
 (21, 'Overall Survival (months)'),
 (22, 'Overall Survival'),
 (23, 'Purity'),
 (24, 'RNA Aliquot Barcode'),
 (25, 'Number of Samples Per Patient'),
 (26, 'Sample Type'),
 (27, 'Sex'),
 (28, 'Stromal Score'),
 (29, 'Surgery Extent of Resection'),
 (30, 'Surgery Indication'),
 (31, 'Surgery Laterality'),
 (32, 'Surgery Location'),
 (33, 'Surgery Type'),
 (34, 'Tissue Source'),
 (35, 'Concurrent TMZ Treatment'),
 (36, 'Radiation Dose'),
 (37, 'Radiotherapy Treatment'),


In [44]:
# Extract the features of interest from the second data frame.
#
# Columns are picked by label instead of by position so the selection is
# readable and does not depend on the column order of the source file.
#
# NOTE(review): glioma_df2 also has an 'Age' column that is not selected
# here, so rows coming from df_2 will carry no age value once they are
# concatenated with df_1 -- confirm whether that is intended.
df_2 = glioma_df2[
    [
        'Cancer Type',
        'Cancer Type Detailed',
        'Tumor Grade',
        'Surgery Location',
        'Sex',
    ]
].copy()  # independent copy, so later edits to df_2 never touch glioma_df2
df_2.head(5)

Unnamed: 0,Cancer Type,Cancer Type Detailed,Tumor Grade,Surgery Location,Sex
0,Glioma,Glioblastoma,IV,,Male
1,Glioma,Glioblastoma,IV,Occipital lobe,Male
2,Glioma,Glioblastoma,IV,,Male
3,Glioma,Glioblastoma,IV,Temporal lobe,Male
4,Glioma,Glioblastoma,IV,,Male


In [45]:
# Align the location column name with the one used in df_1
df_2 = df_2.rename(columns={'Surgery Location': 'Tumor Location'})

In [46]:
# Preview the first five rows of df_2
df_2.head(n=5)

Unnamed: 0,Cancer Type,Cancer Type Detailed,Tumor Grade,Tumor Location,Sex
0,Glioma,Glioblastoma,IV,,Male
1,Glioma,Glioblastoma,IV,Occipital lobe,Male
2,Glioma,Glioblastoma,IV,,Male
3,Glioma,Glioblastoma,IV,Temporal lobe,Male
4,Glioma,Glioblastoma,IV,,Male


# Concatenate df_1 and df_2

In [48]:
# Stack the rows of df_1 and df_2 into one frame and renumber the index.
#
# NOTE(review): df_2 has no 'Age' column, so its rows get NaN in 'Age' here.
# NOTE(review): the first rows displayed for df_1 and df_2 look identical in
# their shared columns; if both source files cover the same patients this
# concat duplicates them -- verify (e.g. on 'Patient ID') before analysis.
main_df = pd.concat(objs=(df_1, df_2), axis=0, ignore_index=True)
main_df.head(n=5)

Unnamed: 0,Cancer Type,Cancer Type Detailed,Tumor Grade,Tumor Location,Sex,Age
0,Glioma,Glioblastoma,IV,,Male,71.0
1,Glioma,Glioblastoma,IV,Occipital lobe,Male,71.0
2,Glioma,Glioblastoma,IV,,Male,61.0
3,Glioma,Glioblastoma,IV,Temporal lobe,Male,61.0
4,Glioma,Glioblastoma,IV,,Male,57.0


# Data Frame 3

In [51]:
# Read the tab-separated clinical table for the third data frame
glioma_df3 = pd.read_csv(
    'data/patient_data/difg_msk_2023_clinical_data.tsv',
    sep='\t',
)

# Pair every column name with its positional index for reference
[(position, name) for position, name in enumerate(glioma_df3.columns)]

[(0, 'Study ID'),
 (1, 'Patient ID'),
 (2, 'Sample ID'),
 (3, '1p19q Status'),
 (4, 'Diagnosis Age'),
 (5, 'Age at Which Sequencing was Reported (Years)'),
 (6, 'Cancer Type'),
 (7, 'Cancer Type Detailed'),
 (8, 'Clinical or Research Impact'),
 (9, 'Disease Free (Months)'),
 (10, 'Disease Free Status'),
 (11, 'Fraction Genome Altered'),
 (12, 'Gene Panel'),
 (13, 'Neoplasm Histologic Type Name'),
 (14, 'Institute Source'),
 (15, 'Metastatic Site'),
 (16, 'MGMT Status'),
 (17, 'MSI Score'),
 (18, 'MSI Type'),
 (19, 'Mutation Count'),
 (20, 'Oncotree Code'),
 (21, 'Overall Survival (Months)'),
 (22, 'Overall Survival Status'),
 (23, 'Period1 Growth Rate'),
 (24, 'Period2 Growth Rate'),
 (25, 'Primary Tumor Site'),
 (26, 'Quartile Period1 Growth Rate'),
 (27, 'Quartile Period2 Growth Rate'),
 (28, 'Race Category'),
 (29, 'Sample Class'),
 (30, 'Number of Samples Per Patient'),
 (31, 'Sample coverage'),
 (32, 'Sample Type'),
 (33, 'Sex'),
 (34, 'Somatic Status'),
 (35, 'TMB (nonsynonymous)

In [56]:
# Inspect the values of the 'WHO Grade' column in the third data frame
glioma_df3.loc[:, 'WHO Grade']

0      G3
1      G3
2      G3
3      G2
4      G3
     ... 
68    NaN
69    NaN
70    NaN
71    NaN
72    NaN
Name: WHO Grade, Length: 73, dtype: object

In [55]:
# Inspect the values of the age-at-sequencing column in the third data frame
glioma_df3.loc[:, 'Age at Which Sequencing was Reported (Years)']

0     41.0
1     30.0
2     37.0
3     56.0
4     42.0
      ... 
68     NaN
69     NaN
70     NaN
71     NaN
72     NaN
Name: Age at Which Sequencing was Reported (Years), Length: 73, dtype: float64

In [59]:
# Extract the features of interest from the third data frame.
#
# Columns are picked by label instead of by position so the selection is
# readable and does not depend on the column order of the source file.
#
# NOTE(review): the age taken here is the age at which sequencing was
# reported, whereas df_1 uses 'Age at First Diagnosis'. glioma_df3 also has a
# 'Diagnosis Age' column -- confirm which one should feed the shared 'Age'.
df_3 = glioma_df3[
    [
        'Cancer Type',
        'Cancer Type Detailed',
        'WHO Grade',
        'Primary Tumor Site',
        'Sex',
        'Age at Which Sequencing was Reported (Years)',
        'Race Category',
    ]
].copy()  # explicit copy (as for df_1 / df_2) so later renames and edits
          # operate on an independent frame, not a slice of glioma_df3
df_3.head()

Unnamed: 0,Cancer Type,Cancer Type Detailed,WHO Grade,Primary Tumor Site,Sex,Age at Which Sequencing was Reported (Years),Race Category
0,Glioma,Anaplastic Oligodendroglioma,G3,Brain,Female,41.0,ASIAN-FAR EAST/INDIAN SUBCONT
1,Glioma,Astrocytoma,G3,Brain,Male,30.0,WHITE
2,Glioma,Anaplastic Oligodendroglioma,G3,Brain,Male,37.0,WHITE
3,Glioma,Oligodendroglioma,G2,Brain,Male,56.0,ASIAN-FAR EAST/INDIAN SUBCONT
4,Glioma,Anaplastic Oligoastrocytoma,G3,Brain,Female,42.0,WHITE


In [None]:
# Adjust the feature names to match primary df

