## Mutation Search For SL in Lung Cancer

installs and imports

In [2]:
import pandas as pd

### Data Loading

All analyses were performed using summary-level data derived from TCGA and associated public cancer genomics repositories.
Raw mutation-level data were accessed under institutional authorization; therefore the original datasets ("LUAD_mutations.csv", "LUSC_mutations.csv") cannot be shared publicly.

In [3]:
# Read the LUAD mutation table, restricting the load to the columns listed below.
file_path = "LUAD_mutations.csv"
columns_to_load = [
    "Hugo_Symbol",
    "mut_id",
    "Tumor_Sample_Barcode",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "Variant_Classification",
    "Variant_Type",
    "HGVSp_Short",
    "case_id",
    "PolyPhen",
]
luad_df = pd.read_csv(file_path, usecols=columns_to_load)

In [4]:
# Read the LUSC mutation table, restricting the load to the columns listed below.
file_path = "LUSC_mutations.csv"
columns_to_load = [
    "Hugo_Symbol",
    "mut_id",
    "Tumor_Sample_Barcode",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "Variant_Classification",
    "Variant_Type",
    "HGVSp_Short",
    "case_id",
    "PolyPhen",
]
lusc_df = pd.read_csv(file_path, usecols=columns_to_load)

In [5]:
# Known cancer driver genes (Nico's table); later cells read its
# 'Gene_name' and 'tsg' columns.
file_name = 'known_cancer_drivers.csv'
known_cancer_drivers_df = pd.read_csv(filepath_or_buffer=file_name)

In [6]:
# Tab-separated table downloaded from https://bioinfo.uth.edu/TSGene/download.cgi
tsg_df = pd.read_table('tsg.txt')

# Collapse the 'GeneSymbol' column to its distinct values and export them
# as a one-column CSV.
tsg_gene_symbols = tsg_df['GeneSymbol'].unique()
tsg_gene_symbols_df = pd.DataFrame({'GeneSymbol': tsg_gene_symbols})
tsg_gene_symbols_df.to_csv(
    'tsg_gene_symbols.csv',
    index=False,
    encoding='utf-8',
)

In [7]:
# Additional gene list; a later cell reads its
# 'more tsg genes from bioinfo' column.
file_name = 'more_tsg_from_bioinfo.xlsx'
tsg_genes_bioinfo = pd.read_excel(io=file_name)

In [8]:
# SLIdr table; a later cell reads its 'Driver gene' column.
file_name = 'slidr_pan_lung.xlsx'
slidr_df = pd.read_excel(io=file_name)

In [9]:
# SynLeth table; a later cell reads its 'r.statistic_score', 'n1.name'
# and 'n2.name' columns.
file_name = 'SynLeth_lung&null_df.csv'
SynLeth_df = pd.read_csv(filepath_or_buffer=file_name)

### Filter #1: top genes from SynLethDB + SLIdr

In [10]:
# Distinct values of the SLIdr 'Driver gene' column, as a plain Python list.
driver_gene_slidr = pd.unique(slidr_df['Driver gene']).tolist()

In [11]:
# Keep only the pairs whose r score is strictly above the 0.5 threshold.
filtered_SynLet_df = SynLeth_df.loc[SynLeth_df['r.statistic_score'].gt(0.5)]

# Order the surviving pairs from highest to lowest r score.
sorted_SynLet_df = filtered_SynLet_df.sort_values('r.statistic_score', ascending=False)

# Stack the two gene-name columns (all of 'n1.name' first, then all of
# 'n2.name') and reduce them to distinct values in first-seen order.
top_genes = pd.concat([sorted_SynLet_df[col] for col in ('n1.name', 'n2.name')])
unique_top_genes = top_genes.unique().tolist()

# Number of distinct genes (shown as the cell output).
len(unique_top_genes)

655

In [12]:
# Union of the SLIdr driver genes and the thresholded SynLeth genes.
# Going through a set removes duplicates; the resulting list order is arbitrary.
top_SL_genes = list(set(driver_gene_slidr + unique_top_genes))

In [13]:
# Filter #1 (LUAD): keep mutation rows whose gene symbol is in top_SL_genes.
filt1_luad_df = luad_df.loc[luad_df["Hugo_Symbol"].isin(top_SL_genes)]

In [14]:
# Filter #1 (LUSC): keep mutation rows whose gene symbol is in top_SL_genes.
filt1_lusc_df = lusc_df.loc[lusc_df["Hugo_Symbol"].isin(top_SL_genes)]

### Filter #2 only TSG

First, we add the `tsg` column so that genes not flagged as TSG can be checked manually.

In [15]:
# Attach the 'tsg' flag from the known-drivers table to every LUAD row.
# A left join keeps all LUAD rows; genes absent from the drivers table get a
# missing 'tsg' value.
filt1_luad_df_tsg = filt1_luad_df.merge(
    known_cancer_drivers_df[['Gene_name', 'tsg']],
    how='left',
    left_on="Hugo_Symbol",
    right_on='Gene_name',
)

# One row per distinct (gene, tsg) combination.
unique_genes = filt1_luad_df_tsg[['Hugo_Symbol', 'tsg']].drop_duplicates()

# Everything not explicitly flagged True (False or missing) is exported for
# manual checking.
unique_genes_not_tsg_laud = unique_genes.loc[unique_genes['tsg'].ne(True)]
unique_genes_not_tsg_laud.to_csv('check_not_tsg_laud.csv', index=False, encoding='utf-8')

In [16]:
# Attach the 'tsg' flag from the known-drivers table to every LUSC row.
# A left join keeps all LUSC rows; genes absent from the drivers table get a
# missing 'tsg' value.
filt1_lusc_df_tsg = pd.merge(
    filt1_lusc_df,
    known_cancer_drivers_df[['Gene_name', 'tsg']],
    left_on="Hugo_Symbol",
    right_on='Gene_name',
    how='left',
)

# One row per distinct (gene, tsg) combination.
unique_genes = filt1_lusc_df_tsg[['Hugo_Symbol', 'tsg']].drop_duplicates()

# Everything not explicitly flagged True (False or missing) is exported for
# manual checking.
unique_genes_not_tsg_lusc = unique_genes[unique_genes['tsg'] != True]

# Write the LUSC list to its own file: the previous target,
# 'check_not_tsg_laud.csv', is the path used for the LUAD export, so the LUAD
# list was being overwritten.
unique_genes_not_tsg_lusc.to_csv('check_not_tsg_lusc.csv', index=False, encoding='utf-8')

In [17]:
# Gene names flagged tsg == True in the known-drivers table (from Nico).
tsg_1 = known_cancer_drivers_df.loc[
    known_cancer_drivers_df['tsg'].eq(True), 'Gene_name'
].tolist()

In [18]:
# Second gene list: every value of the 'more tsg genes from bioinfo' column.
bioinfo_column = 'more tsg genes from bioinfo'
tsg_2 = tsg_genes_bioinfo[bioinfo_column].tolist()

In [19]:
# Combine both lists and keep unique values only; going through a set means
# the resulting list order is arbitrary.
tsg = list(set(tsg_1 + tsg_2))

In [20]:
# Filter #2 (LUAD): of the filter-#1 rows, keep those whose gene symbol is in
# the combined tsg list.
filt2_luad_df = filt1_luad_df.loc[filt1_luad_df["Hugo_Symbol"].isin(tsg)]

In [21]:
# Export the LUAD rows that passed both filters.
# NOTE(review): the filename spells the cohort 'laud' rather than 'luad'; left
# unchanged because other code may read this exact path. Confirm before renaming.
filt2_luad_df.to_csv(
    'laud_tcga_mutations_after_filter.csv',
    index=False,
    encoding='utf-8',
)

In [22]:
# Filter #2 (LUSC): of the filter-#1 rows, keep those whose gene symbol is in
# the combined tsg list.
filt2_lusc_df = filt1_lusc_df.loc[filt1_lusc_df["Hugo_Symbol"].isin(tsg)]

In [23]:
# Export the LUSC rows that passed both filters.
# NOTE(review): the filename spells the cohort 'lasc' rather than 'lusc'; left
# unchanged because other code may read this exact path. Confirm before renaming.
filt2_lusc_df.to_csv(
    'lasc_tcga_mutations_after_filter.csv',
    index=False,
    encoding='utf-8',
)

List of the genes that appear in either of the two filtered tables (the union of the LUAD and LUSC gene sets)

In [24]:
# Stack the gene symbols of both filtered cohorts (LUAD first, then LUSC).
combined_gene_tsg = pd.concat(
    [cohort_df['Hugo_Symbol'] for cohort_df in (filt2_luad_df, filt2_lusc_df)]
)

# Distinct symbols in first-seen order, wrapped in a one-column frame and
# exported as CSV.
tsg_genes_lung_cancer = combined_gene_tsg.unique()
tsg_genes_lung_cancer_df = pd.DataFrame({'GeneSymbol': tsg_genes_lung_cancer})
tsg_genes_lung_cancer_df.to_csv(
    'tsg_genes_lung_cancer.csv',
    index=False,
    encoding='utf-8',
)