# Database construction

In [32]:
import pandas as pd
import sns
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency

In [33]:
# Load every table of the pan_origimed_2020 study. All files are tab-separated.
# data_cna_log2 and data_cna are loaded here but not used by the cells visible in this notebook section.
data_clinical_patient = pd.read_csv(
    'pan_origimed_2020/data_clinical_patient.txt',
    sep="\t",
)
data_clinical_sample = pd.read_csv(
    'pan_origimed_2020/data_clinical_sample.txt',
    sep="\t",
)
data_cna_log2 = pd.read_csv(
    'pan_origimed_2020/data_cna_log2.txt',
    sep="\t",
)
data_cna = pd.read_csv(
    'pan_origimed_2020/data_cna.txt',
    sep="\t",
)
# header=2: the column names are taken from the third line of the mutations file.
# Exon_Number is forced to the pandas string dtype so its values are kept as text.
data_mutations = pd.read_csv(
    'pan_origimed_2020/data_mutations.txt',
    sep="\t",
    header=2,
    dtype={"Exon_Number": "string"},
)
data_sv = pd.read_csv(
    'pan_origimed_2020/data_sv.txt',
    sep="\t",
)

In [34]:
# Drop the first four rows of both clinical tables (positional slice).
# NOTE(review): presumably these are extra header/description lines of the clinical
# files that were loaded as data rows — confirm against the raw files.
# Re-running this cell removes four more rows each time, so run it exactly once per load.
data_clinical_sample = data_clinical_sample.iloc[4:]
data_clinical_patient = data_clinical_patient.iloc[4:]

In [35]:
# Rename the patient-identifier column (its header starts with '#') to PATIENT_ID.
data_clinical_patient = data_clinical_patient.rename(columns={'#Patient Identifier': 'PATIENT_ID'})

In [36]:
# Preview the first rows of the patient table after the clean-up above.
data_clinical_patient.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment
4,Patient0001,Female,67,Unknown,Other_Treatments
5,Patient0002,Male,75,Unknown,Treatment-naive
6,Patient0003,Female,45,Unknown,Treatment-naive
7,Patient0004,Male,70,Unknown,Treatment-naive
8,Patient0005,Male,53,Unknown,Treatment-naive


In [37]:
# Derive SAMPLE_ID from PATIENT_ID ("Patient0001" -> "P-0001") so that the patient
# table carries the same key as the other tables.
# The "Patient" prefix is cut by its length instead of a bare 7, and the vectorised
# .str slice leaves a missing PATIENT_ID as a missing SAMPLE_ID rather than raising
# TypeError the way a per-row lambda does.
data_clinical_patient["SAMPLE_ID"] = "P-" + data_clinical_patient["PATIENT_ID"].str[len("Patient"):]

In [38]:
# Give the sample-identifier column the same name, "SAMPLE_ID", in every table
# so that all of them can be merged on one key.
data_clinical_sample = data_clinical_sample.rename(columns={"Sample Identifier": 'SAMPLE_ID'})
data_mutations = data_mutations.rename(columns={"Tumor_Sample_Barcode": 'SAMPLE_ID'})
data_sv = data_sv.rename(columns={"Sample_Id": 'SAMPLE_ID'})

In [39]:
# Combine the tables on SAMPLE_ID with outer joins: patients + samples, then
# mutations, then structural variants. An outer join keeps a sample that is missing
# from one table and fills that table's columns with NaN.
# NOTE(review): mutations and SVs are both joined per sample, so a sample with several
# SV rows repeats each of its mutation rows once per SV row — confirm this is intended
# before interpreting row counts.
merged_clinical_data = pd.merge(data_clinical_patient, data_clinical_sample, on="SAMPLE_ID", how='outer')
merged_mutations_data = pd.merge(merged_clinical_data, data_mutations, on="SAMPLE_ID", how='outer')
merged_all_data = pd.merge(merged_mutations_data, data_sv, on="SAMPLE_ID", how='outer')

In [40]:
# Save the merged table. The DataFrame index is written too (to_csv default), as the
# first, unnamed column. SNP_event is only added in a later cell, so it is not in this file.
merged_all_data.to_csv("pan_cancer_db_merged.csv")

In [41]:
# SNP_event is "<Reference_Allele>><Tumor_Seq_Allele2>", e.g. "C>T".
# Missing alleles are replaced by empty strings first, so a row without mutation
# data ends up with the bare separator ">".
merged_all_data["SNP_event"] = (
    merged_all_data["Reference_Allele"]
    .fillna("")
    .astype(str)
    .str.cat(merged_all_data["Tumor_Seq_Allele2"].fillna("").astype(str), sep=">")
)


In [42]:
# Preview the merged table, including the new SNP_event column.
merged_all_data.head()

Unnamed: 0,PATIENT_ID,Sex,Diagnosis Age,Smoke Status,Treatment,SAMPLE_ID,#Patient Identifier,Cancer Type,Cancer Type Detailed,Tumor Stage,...,Site2_Hugo_Symbol,Center_y,Event_Info,DNA_support,RNA_support,Method,Connection_Type,SV_Status,Group,SNP_event
0,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>T
1,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>A
2,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>-
3,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,C>T
4,Patient0001,Female,67,Unknown,Other_Treatments,P-0001,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,...,,,,,,,,,,G>T


In [43]:
# Keep only the clinical, mutation and structural-variant columns used for modelling.
# The explicit .copy() makes data_for_model an independent frame: the column
# assignments in later cells then cannot be mistaken for writes into a slice of
# merged_all_data (the situation SettingWithCopyWarning is about).
data_for_model = merged_all_data[["PATIENT_ID", "Cancer Type", 'Cancer Type Detailed', 'Tumor Stage',
                                'Sample Type', "Sex", "Diagnosis Age", "Smoke Status", "TMB (nonsynonymous)",
                                "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position",
                                "Consequence", "Variant_Type", "SNP_event", "Protein_position", "Codons",
                                "Exon_Number","VAR_TYPE_SX", "Site1_Hugo_Symbol", "Site2_Hugo_Symbol","Event_Info"]].copy()

In [44]:
# Save the column subset as selected, before any of the recoding done in later cells.
# The DataFrame index is written as the first, unnamed column (to_csv default).
data_for_model.to_csv("pan_cancer_db_for_model.csv")

In [45]:
# (number of rows, number of columns) of the modelling subset.
data_for_model.shape

(105906, 23)

In [46]:
# Look at the first 20 rows; Exon_Number contains date-like values such as "5-Feb" next to plain "57/59".
data_for_model.head(20)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,5-Feb,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,3-Mar,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,16-Jul,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,8-Jan,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,12-Oct,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,12-Nov,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [47]:
# Checking whether the date-like values in Exon_Number should be read as dd/mm or mm/dd,
# by comparing the date-like and the plain values for gene APC:
# data_for_model[data_for_model["Hugo_Symbol"].str.contains("APC", na=False)]
# Result: a date-like value reads as "<total exons>-<exon number>" (e.g. "16-Jul" is exon 7 of 16).

In [48]:
# Conversion of date-like Exon_Number values back to "exon/total" text.

# Month abbreviation -> (month number, highest day number accepted for that month).
# February accepts 29 so that a leap-day value such as '29-Feb' is converted as well.
_EXON_MONTHS = {
    "jan": (1, 31), "feb": (2, 29), "mar": (3, 31), "apr": (4, 30),
    "may": (5, 31), "jun": (6, 30), "jul": (7, 31), "aug": (8, 31),
    "sep": (9, 30), "oct": (10, 31), "nov": (11, 30), "dec": (12, 31),
}


def _is_ascii_number(text, min_len, max_len):
    """Return True when `text` consists of `min_len`..`max_len` ASCII digits."""
    return min_len <= len(text) <= max_len and text.isascii() and text.isdigit()


def convert_exon_number(val):
    """Turn a date-like exon value back into zero-padded 'exon/total' text.

    Two date-like shapes are recognised (month abbreviations are matched
    case-insensitively):

    * 'Mon-YY' (e.g. 'Sep-89') -> 'MM/YY' (e.g. '09/89')
    * 'D-Mon'  (e.g. '14-Sep') -> 'MM/DD' (e.g. '09/14')

    Parameters
    ----------
    val : object
        A single cell of the Exon_Number column.

    Returns
    -------
    object
        The converted string for the two shapes above. Anything else is
        returned unchanged: plain values such as '57/59', day numbers that
        do not exist in the month (e.g. '31-Feb'), and non-string values
        such as NaN, None or pd.NA.

    Notes
    -----
    '29-Feb' is converted to '02/29'. A parse that assumes a non-leap year
    rejects that value and would leave it unconverted.
    The output stays zero-padded so that it matches values exported earlier.
    """
    # Missing values and other non-text cells pass through explicitly.
    if not isinstance(val, str):
        return val

    left, dash, right = val.partition("-")
    if not dash:
        return val

    # 'Mon-YY': month abbreviation followed by exactly two digits.
    month = _EXON_MONTHS.get(left.lower())
    if month is not None and _is_ascii_number(right, 2, 2):
        return f"{month[0]:02d}/{right}"

    # 'D-Mon': one or two digits followed by a month abbreviation.
    month = _EXON_MONTHS.get(right.lower())
    if month is not None and _is_ascii_number(left, 1, 2):
        day = int(left)
        if 1 <= day <= month[1]:
            return f"{month[0]:02d}/{day:02d}"

    # Not date-like: keep the value as it is.
    return val

In [49]:
# Apply the function to the column: every Exon_Number cell is passed through
# convert_exon_number and the column is overwritten with the results.
# The write goes through .loc[:, ...] rather than data_for_model['Exon_Number'] = ...;
# keep that form unless the resulting dtype has been checked.
data_for_model.loc[:, 'Exon_Number'] = data_for_model['Exon_Number'].apply(convert_exon_number)

In [50]:
# Check the conversion: date-like Exon_Number values should now read like "02/05".
data_for_model.head(10)

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,Smoke Status,TMB (nonsynonymous),Hugo_Symbol,...,Consequence,Variant_Type,SNP_event,Protein_position,Codons,Exon_Number,VAR_TYPE_SX,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Event_Info
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KMT2C,...,missense_variant,SNP,C>T,4822.0,cGt/cAt,57/59,Substitution/Indel,,,
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,KRAS,...,missense_variant,SNP,C>A,12.0,Ggt/Tgt,02/05,Substitution/Indel,,,
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,SOX9,...,frameshift_variant,DEL,C>-,236.0,aCc/ac,03/03,Truncation,,,
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,"stop_gained,splice_region_variant",SNP,C>T,216.0,Cga/Tga,07/16,Truncation,,,
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,APC,...,stop_gained,SNP,G>T,1286.0,Gaa/Taa,16/16,Truncation,,,
5,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,QKI,...,missense_variant,SNP,G>A,47.0,Gaa/Aaa,01/08,Substitution/Indel,,,
6,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,FBXW7,...,missense_variant,SNP,G>A,505.0,Cgc/Tgc,10/12,Substitution/Indel,,,
7,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,PTPN11,...,intron_variant,DEL,TTTC>-,,,,Substitution/Indel,,,
8,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,GLI1,...,missense_variant,SNP,G>T,488.0,Gct/Tct,11/12,Substitution/Indel,,,
9,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,67,Unknown,0.333333333,ATM,...,"splice_region_variant,intron_variant",SNP,G>A,1026.0,,,Substitution/Indel,,,


In [51]:
# Number of rows per gene symbol (row counts of the merged table, not patient counts).
data_for_model["Hugo_Symbol"].value_counts()

Hugo_Symbol
TP53            6719
LRP1B           2138
APC             1882
KRAS            1781
EGFR            1402
                ... 
STK24-AS1          1
EGFR-AS1           1
DPYD-AS1           1
MIR4466            1
RP11-770J1.3       1
Name: count, Length: 479, dtype: int64

In [52]:
# Number of rows per chromosome.
data_for_model["Chromosome"].value_counts()

Chromosome
17    11798
2      8468
1      7888
7      7764
3      7694
12     7669
5      6356
4      5822
8      4713
19     4491
X      4341
11     4282
9      3657
16     3449
6      3194
13     2753
10     2587
20     1711
22     1663
14     1648
15     1583
18     1516
21      446
Name: count, dtype: int64

In [53]:
# Number of rows per cancer type (row counts of the merged table, not patient counts).
data_for_model["Cancer Type"].value_counts()


Cancer Type
Colorectal Carcinoma                     28396
Non Small Cell Lung Cancer               19526
Gastric Cancer                           10832
Liver Hepatocellular Carcinoma            9633
Esophageal Carcinoma                      6298
Intrahepatic Cholangiocarcinoma           3939
Small Cell Lung Cancer                    3025
Pancreatic Cancer                         2957
Extrahepatic Cholangiocarcinoma           2883
Breast Carcinoma                          2350
Soft Tissue Sarcoma                       2270
Gallbladder Carcinoma                     2192
Ovarian Carcinoma                         1851
Urothelial Carcinoma                      1631
Kidney Renal Cell Carcinoma               1500
Uterine Corpus Endometrial Carcinoma      1358
Cancer of Unknown Primary                 1068
Head and Neck Carcinoma                    997
Carcinoma of Uterine Cervix                787
Small Bowel Carcinoma                      755
Bone Sarcoma                               601
G

In [54]:
# Replace the comma-separated Consequence column by one count column per term.
# Each row's list is exploded to one term per line, turned into indicator columns,
# and summed back per original row (groupby on the index level). Rows without a
# Consequence value get 0 in every term column.
# The stray, discarded .str.split(',') statement was removed, and the drop is no longer in place.
# This cell can only run once per kernel state: afterwards Consequence no longer exists.
dummy_vars = (
    data_for_model['Consequence']
    .str.split(',')
    .explode()
    .str.get_dummies()
    .groupby(level=0)
    .sum()
)
data_for_model = data_for_model.join(dummy_vars).drop(columns='Consequence')



In [57]:
# One-hot encode Smoke Status, then discard the "Unknown" indicator so that an
# unknown status is represented by all remaining Smoke Status_* columns being False.
data_for_model = (
    pd.get_dummies(data_for_model, columns=['Smoke Status'], drop_first=False)
    .drop(columns='Smoke Status_Unknown')
)

In [58]:
# Columns holding labels rather than quantities; each is stored with the pandas category dtype.
columns_to_convert = [
    'Sex', 'Hugo_Symbol', 'Chromosome', 'Variant_Type', 'SNP_event', 'Exon_Number',
    'Codons', 'VAR_TYPE_SX', 'Site1_Hugo_Symbol', 'Site2_Hugo_Symbol', 'Event_Info',
    'Cancer Type', 'Cancer Type Detailed', 'Tumor Stage', 'Sample Type',
]
data_for_model = data_for_model.astype({column: 'category' for column in columns_to_convert})

In [60]:
def create_age_range(x):
    """Map an age in years to the label of its ten-year bucket.

    Parameters
    ----------
    x : int or float
        Age in years.

    Returns
    -------
    str or None
        "0-10" for x <= 10, "11-20" for x <= 20, and so on up to "71-80"
        for x <= 80; "80+" for anything larger. Upper bounds are inclusive,
        so 80 falls into "71-80" and "80+" starts above 80.
        None when x is missing (None or NaN). NaN fails every `<=`
        comparison, so without this guard it would fall through to "80+".
    """
    # NaN is the only value that is not equal to itself.
    if x is None or x != x:
        return None

    buckets = (
        (10, "0-10"),
        (20, "11-20"),
        (30, "21-30"),
        (40, "31-40"),
        (50, "41-50"),
        (60, "51-60"),
        (70, "61-70"),
        (80, "71-80"),
    )
    for upper_bound, label in buckets:
        if x <= upper_bound:
            return label
    return "80+"

In [61]:
# Recode Diagnosis Age: cast to int, map every age to its bucket label with
# create_age_range, and store the labels with the category dtype.
# The int cast raises if any age is missing or not a whole number.
data_for_model['Diagnosis Age'] = (
    data_for_model['Diagnosis Age']
    .astype(int)
    .apply(create_age_range)
    .astype("category")
)

In [62]:
# Final export of the recoded table, this time without the index column.
# The file name differs from the earlier, un-recoded export "pan_cancer_db_for_model.csv".
data_for_model.to_csv("pan_cancer_data_for_model.csv", index=False)

In [63]:
# Preview the final table: consequence term columns, smoke-status indicators and binned ages.
data_for_model.head()

Unnamed: 0,PATIENT_ID,Cancer Type,Cancer Type Detailed,Tumor Stage,Sample Type,Sex,Diagnosis Age,TMB (nonsynonymous),Hugo_Symbol,Chromosome,...,splice_region_variant,start_lost,start_retained_variant,stop_gained,stop_lost,stop_retained_variant,synonymous_variant,upstream_gene_variant,Smoke Status_Nonsmoker,Smoke Status_Smoker
0,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,0.333333333,KMT2C,7,...,0,0,0,0,0,0,0,0,False,False
1,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,0.333333333,KRAS,12,...,0,0,0,0,0,0,0,0,False,False
2,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,0.333333333,SOX9,17,...,0,0,0,0,0,0,0,0,False,False
3,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,0.333333333,APC,5,...,1,0,0,1,0,0,0,0,False,False
4,Patient0001,Colorectal Carcinoma,Colorectal Adenocarcinoma,IV,Primary,Female,61-70,0.333333333,APC,5,...,0,0,0,1,0,0,0,0,False,False
