### About this notebook: 
<font color='grey'>This notebook integrates mutation data with drug response data to identify drugs whose response (sensitivity or resistance) is associated with a gene mutation. <br/>
Because the variants in the GDSC dataset were called without matched normal samples, TCGA somatic mutations and the COSMIC dataset are used to filter the mutation sites in GDSC.
<br/></font>

In [73]:
import sys
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from scipy import stats 
import numpy as np
import json
import statsmodels.stats.multitest as multi
import matplotlib.pyplot as plt
import math
sys.path.append('../scripts/')
import Docket_integration

### Define the input directory and the output directory: 
<font color='grey'> 
    "input_dir": directory of input data<br/>
    "output_dir":directory of output data<br/>
</font>

In [74]:
# Locations of the input tables and of the files this notebook writes.
directories = dict(
    input_dir="../Data/Data_input_for_LUAD",
    output_dir="../Data/Data_output_for_LUAD/",
)


### About the input file and the output files: 
<font color='grey'> 
    <font color='green'> input_data;<br/></font>
        "input_file_1": The mutation matrix;<br/>
        "input_file_2": The drug response matrix;<br/>
        "output_file": The merged data table ;<br/>
</font>

<font color='grey'> 
    <font color='green'> input_data (analysis settings, stored in the same dictionary);<br/></font>
        "Genelist": The list of genes selected for analysis;<br/>
        "Druglist": The list of drugs selected for analysis, if it is empty, all drugs from the input data will be used;<br/>
        "Output": The association between drug response and mutations;<br/>
</font>

In [76]:
# Single configuration dictionary: file names, column labels and feature selections.
input_data = {
    # Tables read from directories["input_dir"]
    "input_file_1": "Mut_site_GDSC.csv",
    "input_file_2": "Drug_GDSC_matirx.csv",

    # Spreadsheet used to filter the GDSC mutation sites; this path is read as given
    "input_file_TCGA_cosmic": "../Dataset/variants.xlsx",

    # Column labels handed to the mutation-matrix builder
    "Sample_label": "COSMIC_identifier",
    "Gene_label": "HGNC_gene_symbol",

    # Merged mutation/drug table written to the output directory
    "output_file": "GDSC_mut_TCGA_cosmic_drug_merge.csv",

    # Gene mutations selected for analysis
    "Genelist": ['PIK3CA', 'TP53', 'KRAS', 'EGFR'],

    # If no drug is selected, all drugs will be used!
    "Druglist": [],

    # Association results written to the output directory
    "Output": "mut_cancerRelated_drug_pair.csv",
}


### Data processing --  Step 1: 
<font color='grey'> 
    Read tables;<br/>
    Compare tables;<br/>
    Merge tables;<br/>
    
</font>

In [77]:
# Read the GDSC mutation-site table and the drug response matrix from the input directory.
mut_GDSC = pd.read_csv("/".join([directories['input_dir'], input_data['input_file_1']]))
drug_matrix = pd.read_csv(
    "/".join([directories['input_dir'], input_data['input_file_2']]),
    index_col="Unnamed: 0",
)

# Read the TCGA/COSMIC variant spreadsheet and build a gene + amino-acid-change key.
Mut_CEF = pd.read_excel(input_data['input_file_TCGA_cosmic'])
Mut_CEF['new_id'] = Mut_CEF['Gene'] + Mut_CEF['AA']

# Keep only the GDSC rows whose 'new_id' also occurs in the TCGA/COSMIC table.
# NOTE(review): assumes the GDSC CSV already carries a 'new_id' column in the same format — confirm.
known_ids = Mut_CEF['new_id'].tolist()
mut_GDSC_sele = mut_GDSC[mut_GDSC['new_id'].isin(known_ids)]


In [78]:
# Generate the gene mutation matrix from the filtered GDSC mutation sites.
# NOTE(review): the meaning of the trailing '' argument is defined by
# Docket_integration.generate_mutation_matrix — confirm against that helper.
Mut_TCGA_matrix = Docket_integration.generate_mutation_matrix(
    mut_GDSC_sele,
    input_data['Sample_label'],
    input_data['Gene_label'],
    '',
)

In [79]:
# Compare the mutation matrix with the drug response matrix; the returned value
# is indexed at [0] and [1] by the merge step to decide which axis to merge on.
# NOTE(review): exact contents of the result are defined by Docket_integration.matrix_comp.
label_for_merge = Docket_integration.matrix_comp(
    Mut_TCGA_matrix,
    drug_matrix,
)

Row names are identical!


In [80]:
# Merge the mutation matrix with the drug response matrix on the axis selected
# by the comparison result, then write the merged table to the output directory.
# NOTE(review): label_for_merge[0] / [1] look like row- and column-name match
# scores from Docket_integration.matrix_comp — confirm against that helper.
if label_for_merge[0] > 0.9:
    Merged_mat = Docket_integration.merge_matrix(Mut_TCGA_matrix, drug_matrix, 'Row', 'Row')
elif label_for_merge[1] == 1:
    Merged_mat = Docket_integration.merge_matrix(Mut_TCGA_matrix, drug_matrix, 'Col', 'Col')
else:
    # Without this branch Merged_mat would stay unassigned: a NameError on a
    # fresh kernel, or a stale table from an earlier run being written silently.
    raise ValueError(
        "Cannot merge the mutation and drug matrices: comparison result {!r} "
        "satisfies neither the 'Row' criterion ([0] > 0.9) nor the 'Col' "
        "criterion ([1] == 1).".format(label_for_merge)
    )

Merged_mat.to_csv(directories['output_dir']+'/'+input_data['output_file'])


### Data processing --  Step 2: 
<font color='grey'> 
    Integrating two datasets through statistical modeling;<br/>
</font>

In [81]:
# Group A: the configured gene list, or every column of the mutation matrix
# when the list is empty.
Features_groupA = (
    input_data['Genelist']
    if len(input_data['Genelist']) > 0
    else list(Mut_TCGA_matrix.columns.values)
)

# Group B: the configured drug list, or every column of the drug matrix
# when the list is empty.
Features_groupB = (
    input_data['Druglist']
    if len(input_data['Druglist']) > 0
    else list(drug_matrix.columns.values)
)

# Run the category-vs-numeric integration on the merged table for the two feature groups.
result = Docket_integration.Integration_category_numeric(
    Merged_mat,
    Features_groupA,
    Features_groupB,
)

In [82]:
# Write the association table to the output directory. The explicit '/' matches
# how the merged table's path is built, so the file still lands inside
# output_dir when that setting is given without a trailing slash.
result.to_csv(directories['output_dir']+'/'+input_data['Output'])


In [83]:
# Display the rows with FDR below 0.05, ordered by ascending FDR.
result[result['FDR'].lt(0.05)].sort_values(by='FDR')


Unnamed: 0,F1,F2,p,SE,FDR,-logP
320,KRAS,1526,2e-06,-1.217725,0.001154,13.329999
305,KRAS,1498,7e-06,-1.148194,0.002446,11.885543
278,KRAS,1372,9.3e-05,-1.024762,0.021894,9.288189
