<a href="https://colab.research.google.com/github/SivalayaG/BioInf/blob/main/Preprocess_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import halfnorm
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt   
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data files
# read by the later cells are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

## Reading the Gene Information data - downloaded from NCBI Database
## The file is read as tab-separated text without a header row, so the nine
## column names are supplied explicitly.

rawdata = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Homo_sapiens.GRCh38.103.gtf.gz',
    sep='\t',
    header=None,
    names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'],
    # seqid holds numeric chromosome names next to labels such as 'X', 'MT' or
    # 'KI270728.1'. Left to dtype inference the column ends up with a mix of
    # ints and strings (e.g. both 7 and '7'), so it is pinned to str.
    dtype={'seqid': str},
    # The first five lines are skipped - presumably the file's metadata
    # header; confirm against the downloaded file.
    skiprows=[0, 1, 2, 3, 4])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
## Preview the first five rows of the raw annotation table

rawdata.head(n=5)

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,1,havana,gene,11869,14409,.,+,.,"gene_id ""ENSG00000223972""; gene_version ""5""; g..."
12,1,havana,gene,14404,29570,.,-,.,"gene_id ""ENSG00000227232""; gene_version ""5""; g..."
25,1,mirbase,gene,17369,17436,.,-,.,"gene_id ""ENSG00000278267""; gene_version ""1""; g..."
28,1,havana,gene,29554,31109,.,+,.,"gene_id ""ENSG00000243485""; gene_version ""5""; g..."
36,1,mirbase,gene,30366,30503,.,+,.,"gene_id ""ENSG00000284332""; gene_version ""1""; g..."


In [None]:
## List the distinct values found in every column of the raw table

for column_name, column_values in rawdata.items():
    print(column_name, column_values.unique())

seqid [1 2 3 4 5 6 7 '7' 'X' '8' 8 9 11 10 12 13 14 15 16 17 18 20 19 '19' 'Y'
 '22' '21' 'MT' 'KI270728.1' 'KI270727.1' 'KI270442.1' 'GL000225.1'
 'GL000009.2' 'GL000194.1' 'GL000205.2' 'GL000195.1' 'KI270733.1'
 'GL000219.1' 'GL000216.2' 'KI270744.1' 'KI270734.1' 'GL000213.1'
 'GL000220.1' 'GL000218.1' 'KI270731.1' 'KI270750.1' 'KI270721.1'
 'KI270726.1' 'KI270711.1' 'KI270713.1']
source ['havana' 'mirbase' 'ensembl_havana' 'ensembl' 'havana_tagene'
 'ensembl_havana_tagene' 'insdc']
type ['gene']
start [11869 14404 17369 ... 30437 31698 35407]
end [14409 29570 17436 ... 30580 32528 35916]
score ['.']
strand ['+' '-']
phase ['.']
attributes ['gene_id "ENSG00000223972"; gene_version "5"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene";'
 'gene_id "ENSG00000227232"; gene_version "5"; gene_name "WASH7P"; gene_source "havana"; gene_biotype "unprocessed_pseudogene";'
 'gene_id "ENSG00000278267"; gene_version "1"; gene_name "MIR6859-1"; gene_sour

In [None]:
## Since we are only concerned about gene id, name and biotype contained in the attributes column, we first filter
## all rows to retain only genes (from type), and then pull those three attributes into a new dataframe

rawdata = rawdata[rawdata['type'] == 'gene']

## Each attribute is stored as `key "value";`. The value is looked up by its key rather than by its
## position in the string, so a record that omits or reorders an attribute yields NaN for the missing
## field instead of raising or silently picking up a different attribute.
## The `(?:^|;\s*)` prefix anchors the key at the start of an attribute so it cannot match the tail of a longer key.
genedata = pd.DataFrame({
    field: rawdata['attributes'].str.extract(rf'(?:^|;\s*){field} "([^"]*)"', expand=False)
    for field in ['gene_id', 'gene_name', 'gene_biotype']
})



In [None]:
## Preview the first five rows of the extracted gene table

genedata.head(n=5)

Unnamed: 0,gene_id,gene_name,gene_biotype
0,ENSG00000223972,DDX11L1,transcribed_unprocessed_pseudogene
12,ENSG00000227232,WASH7P,unprocessed_pseudogene
25,ENSG00000278267,MIR6859-1,miRNA
28,ENSG00000243485,MIR1302-2HG,lncRNA
36,ENSG00000284332,MIR1302-2,miRNA


In [None]:
## List the distinct values found in every column of the gene table

for column_name, column_values in genedata.items():
    print(column_name, column_values.unique())

gene_id ['ENSG00000223972' 'ENSG00000227232' 'ENSG00000278267' ...
 'ENSG00000275987' 'ENSG00000277475' 'ENSG00000268674']
gene_name ['DDX11L1' 'WASH7P' 'MIR6859-1' ... 'AC240274.1' 'AC213203.2' 'AC213203.1']
gene_biotype ['transcribed_unprocessed_pseudogene' 'unprocessed_pseudogene' 'miRNA'
 'lncRNA' 'protein_coding' 'processed_pseudogene' 'snRNA'
 'transcribed_processed_pseudogene' 'misc_RNA' 'TEC'
 'transcribed_unitary_pseudogene' 'snoRNA' 'scaRNA' 'rRNA_pseudogene'
 'unitary_pseudogene' 'polymorphic_pseudogene' 'pseudogene' 'rRNA'
 'IG_V_pseudogene' 'scRNA' 'IG_V_gene' 'IG_C_gene' 'IG_J_gene' 'sRNA'
 'ribozyme' 'translated_processed_pseudogene' 'vault_RNA' 'TR_C_gene'
 'TR_J_gene' 'TR_V_gene' 'TR_V_pseudogene'
 'translated_unprocessed_pseudogene' 'TR_D_gene' 'IG_C_pseudogene'
 'TR_J_pseudogene' 'IG_J_pseudogene' 'IG_D_gene' 'IG_pseudogene' 'Mt_tRNA'
 'Mt_rRNA']


In [None]:
## Read TCGA Lung Adenocarcinoma Gene data for various cancerous and non-cancerous genes
## Every file in the directory is read as a two-column, tab-separated table (gene id, value) and
## becomes one column named after the file.

tcga_dir = '/content/drive/MyDrive/TCGA_Data'
sample_counts = []
for file in os.listdir(tcga_dir):
    print("Reading " + file)
    tcgadata = pd.read_csv(os.path.join(tcga_dir, file), sep='\t', header=None, names=['gene_id', file])
    # Keep only the part of the id before the first '.' so it can be matched against genedata.gene_id
    tcgadata['gene_id'] = tcgadata['gene_id'].str.split('.').str[0].str.strip()
    sample_counts.append(tcgadata.set_index('gene_id')[file])

## Combine all samples in a single pass instead of re-merging the growing table once per file.
## The inner joins keep only gene ids present in the gene table and in every sample file, in gene-table order.
## Merging onto the three annotation columns (rather than onto whatever genedata currently holds) makes this
## cell safe to re-run: the sample columns are rebuilt instead of being merged in a second time.
## NOTE(review): assumes gene ids are unique within each file once the '.' suffix is removed - confirm.
if sample_counts:
    all_counts = pd.concat(sample_counts, axis=1, join='inner').rename_axis('gene_id').reset_index()
    genedata = pd.merge(genedata[['gene_id', 'gene_name', 'gene_biotype']], all_counts, on='gene_id')

Reading TCGA-05-4249-01A-01R-1107-07
Reading TCGA-05-4403-01A-01R-1206-07
Reading TCGA-05-4250-01A-01R-1107-07
Reading TCGA-05-4417-01A-22R-1858-07
Reading TCGA-05-4430-01A-02R-1206-07
Reading TCGA-05-4426-01A-01R-1206-07
Reading TCGA-05-4422-01A-01R-1206-07
Reading TCGA-05-4432-01A-01R-1206-07
Reading TCGA-05-4382-01A-01R-1206-07
Reading TCGA-05-4397-01A-01R-1206-07
Reading TCGA-05-4410-01A-21R-1858-07
Reading TCGA-05-4415-01A-22R-1858-07
Reading TCGA-05-4395-01A-01R-1206-07
Reading TCGA-05-4398-01A-01R-1206-07
Reading TCGA-05-4389-01A-01R-1206-07
Reading TCGA-05-4420-01A-01R-1206-07
Reading TCGA-05-4418-01A-01R-1206-07
Reading TCGA-05-4425-01A-01R-1755-07
Reading TCGA-05-4390-01A-02R-1755-07
Reading TCGA-05-4244-01A-01R-1107-07
Reading TCGA-05-4424-01A-22R-1858-07
Reading TCGA-05-4396-01A-21R-1858-07
Reading TCGA-05-4402-01A-01R-1206-07
Reading TCGA-05-4427-01A-21R-1858-07
Reading TCGA-05-4384-01A-01R-1755-07
Reading TCGA-05-4405-01A-21R-1858-07
Reading TCGA-35-3615-01A-01R-0946-07
R

In [None]:
## Preview the first five rows of the gene table after the sample columns were merged in

genedata.head(n=5)

Unnamed: 0,gene_id,gene_name,gene_biotype,TCGA-05-4249-01A-01R-1107-07,TCGA-05-4403-01A-01R-1206-07,TCGA-05-4250-01A-01R-1107-07,TCGA-05-4417-01A-22R-1858-07,TCGA-05-4430-01A-02R-1206-07,TCGA-05-4426-01A-01R-1206-07,TCGA-05-4422-01A-01R-1206-07,...,TCGA-O1-A52J-01A-11R-A262-07,TCGA-S2-AA1A-01A-12R-A39D-07,TCGA-NJ-A7XG-01A-12R-A39D-07,TCGA-NJ-A55O-01A-11R-A262-07,TCGA-NJ-A4YF-01A-12R-A262-07,TCGA-NJ-A4YP-01A-11R-A262-07,TCGA-NJ-A4YI-01A-11R-A262-07,TCGA-NJ-A55R-01A-11R-A262-07,TCGA-MP-A5C7-01A-11R-A262-07,TCGA-NJ-A55A-01A-11R-A262-07
0,ENSG00000223972,DDX11L1,transcribed_unprocessed_pseudogene,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,ENSG00000227232,WASH7P,unprocessed_pseudogene,30,24,26,11,14,75,44,...,72,126,74,31,161,148,103,240,101,44
2,ENSG00000278267,MIR6859-1,miRNA,4,0,0,0,1,2,1,...,5,4,1,1,2,3,4,7,1,0
3,ENSG00000243485,MIR1302-2HG,lncRNA,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,ENSG00000237613,FAM138A,lncRNA,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
## Print the distinct values of every column; columns whose name contains 'TCGA'
## additionally get their summary statistics printed

for column_name in genedata.columns:
    column_values = genedata[column_name]
    print(column_name, column_values.unique())
    if 'TCGA' in column_name:
        print(column_values.describe())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TCGA-44-5645-11A-01R-1628-07 [     0     47      3 ...  96111  44824 139390]
count    5.646100e+04
mean     8.455594e+02
std      9.047590e+03
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.090000e+02
max      1.559158e+06
Name: TCGA-44-5645-11A-01R-1628-07, dtype: float64
TCGA-44-6147-01A-11R-A278-07 [    0    19     3 ... 18167  8100  1958]
count    5.646100e+04
mean     6.807278e+02
std      7.258286e+03
min      0.000000e+00
25%      0.000000e+00
50%      6.000000e+00
75%      1.350000e+02
max      1.125367e+06
Name: TCGA-44-6147-01A-11R-A278-07, dtype: float64
TCGA-44-6146-01A-11R-A278-07 [    0    10     1 ...  7169 16008  1754]
count     56461.000000
mean        483.421494
std        6586.565878
min           0.000000
25%           0.000000
50%           1.000000
75%          43.000000
max      955782.000000
Name: TCGA-44-6146-01A-11R-A278-07, dtype: float64
TCGA-44-6144-11A-01R-1755-0

In [None]:
## Keep only the protein-coding and lncRNA genes, since these are the most significant to us (as described in the report)

is_selected_biotype = genedata['gene_biotype'].isin(['protein_coding', 'lncRNA'])

## reset_index() keeps the previous row labels as an 'index' column; that column is dropped again below
genedata_filtered = genedata.loc[is_selected_biotype].reset_index()

## Only gene_name and the per-sample columns are kept in the exported table
genedata_clean = genedata_filtered.drop(columns=['index', 'gene_id', 'gene_biotype'])
genedata_clean.to_csv('cleaned_data.csv', index=False)
genedata_clean


Unnamed: 0,gene_name,TCGA-05-4249-01A-01R-1107-07,TCGA-05-4403-01A-01R-1206-07,TCGA-05-4250-01A-01R-1107-07,TCGA-05-4417-01A-22R-1858-07,TCGA-05-4430-01A-02R-1206-07,TCGA-05-4426-01A-01R-1206-07,TCGA-05-4422-01A-01R-1206-07,TCGA-05-4432-01A-01R-1206-07,TCGA-05-4382-01A-01R-1206-07,...,TCGA-O1-A52J-01A-11R-A262-07,TCGA-S2-AA1A-01A-12R-A39D-07,TCGA-NJ-A7XG-01A-12R-A39D-07,TCGA-NJ-A55O-01A-11R-A262-07,TCGA-NJ-A4YF-01A-12R-A262-07,TCGA-NJ-A4YP-01A-11R-A262-07,TCGA-NJ-A4YI-01A-11R-A262-07,TCGA-NJ-A55R-01A-11R-A262-07,TCGA-MP-A5C7-01A-11R-A262-07,TCGA-NJ-A55A-01A-11R-A262-07
0,MIR1302-2HG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,FAM138A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,OR4F5,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AL627309.1,0,2,0,5,0,2,2,1,1,...,2,2,1,1,0,3,0,7,1,0
4,AL627309.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33616,MT-ND4L,15678,25217,11405,16963,12029,15513,34383,30626,33402,...,20375,23033,15070,28612,44415,38800,10497,115624,23556,9861
33617,MT-ND4,183877,414523,148457,195953,191583,208220,419316,362349,419514,...,380846,234024,224090,501434,1082325,690291,211094,2074258,341904,169440
33618,MT-ND5,109635,208486,44292,44106,70519,97757,132960,115393,98002,...,123477,89629,35322,130672,226609,92768,79926,733766,126735,84507
33619,MT-ND6,38316,104307,15828,13820,29697,39158,43524,36684,31598,...,36842,26620,10475,36894,88380,22655,25064,198114,38872,25122


In [None]:
## Obtain and clean our final train data

## Transpose so that each row is one sample and each column is one gene
train_data = genedata_clean.set_index('gene_name').T

## Building the label, 1 -> Tumor; 0 -> Normal
## The fourth dash-separated field of each row label (e.g. '01A') is inspected:
## a field containing '11' is labelled 0, anything else is labelled 1
sample_fields = [sample_name.split('-')[3] for sample_name in train_data.index]
train_data['y'] = [int('11' not in sample_field) for sample_field in sample_fields]

## NOTE(review): the reason for excluding the ADRB2 column is not shown in this notebook - confirm
train_data = train_data.drop(columns=['ADRB2'])

## Replace the sample names with a plain 0..n-1 index before exporting
train_data = train_data.reset_index(drop=True)
train_data.to_csv('trainset1.csv', index=False)
train_data

gene_name,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.5,AL627309.4,AL732372.1,AC114498.1,...,MT-ATP8,MT-ATP6,MT-CO3,MT-ND3,MT-ND4L,MT-ND4,MT-ND5,MT-ND6,MT-CYB,y
0,0,0,0,0,0,0,21,0,0,0,...,16001,93232,152327,36652,15678,183877,109635,38316,104899,1
1,0,0,0,2,0,0,0,0,0,0,...,23848,171153,247889,82918,25217,414523,208486,104307,153226,1
2,0,0,0,0,0,0,8,0,0,0,...,12618,70918,155271,26228,11405,148457,44292,15828,61443,1
3,0,0,0,5,0,1,5,0,0,0,...,10495,64060,86559,24568,16963,195953,44106,13820,82851,1
4,0,0,0,0,0,0,6,0,0,0,...,12846,81632,193420,39533,12029,191583,70519,29697,101419,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,0,0,0,3,0,0,16,0,0,0,...,21828,166787,267334,58733,38800,690291,92768,22655,220095,1
588,0,0,0,0,0,0,10,0,0,0,...,6758,61607,121844,29198,10497,211094,79926,25064,126121,1
589,0,0,0,7,0,7,53,0,0,1,...,58792,535212,984619,202069,115624,2074258,733766,198114,856510,1
590,0,0,0,1,0,5,85,0,0,0,...,16945,120798,231400,66018,23556,341904,126735,38872,199622,1


In [None]:
from sklearn.decomposition import PCA

## Reduce the gene features (every column except the trailing label column) to two principal components.
## random_state is fixed because scikit-learn may select its randomized SVD solver for a matrix of this
## size, in which case the components would otherwise differ from run to run.
pca = PCA(n_components=2, random_state=42)
reduced_traindata = pca.fit_transform(train_data.iloc[:, :-1])

## Scale both components by the single largest value in the projection (values can still be below -1)
reduced_traindata = reduced_traindata/reduced_traindata.max()
reduced_traindata_df = pd.DataFrame(data = reduced_traindata, columns = ['component1', 'component2'])

## Append the label column; this aligns on the row index, so it relies on train_data carrying
## the default 0..n-1 index at this point
reduced_traindata_df = pd.concat([reduced_traindata_df, train_data.iloc[:, -1]], axis = 1)

In [None]:
## Export the two-component table (with its label column) and display it
reduced_traindata_df.to_csv(path_or_buf='trainset2.csv', index=False)
reduced_traindata_df

Unnamed: 0,component1,component2,y
0,0.005477,-0.001006,1
1,0.204483,0.044904,1
2,-0.087482,0.021875,1
3,0.034319,-0.122092,1
4,0.039086,0.013469,1
...,...,...,...
587,0.172334,0.213057,1
588,-0.015920,-0.062548,1
589,0.263756,0.690210,1
590,0.218608,0.029319,1


In [None]:
## Preprocessing completed: use trainset1.csv (all gene features plus label y) or trainset2.csv (two PCA components plus label y) to train models and analyze classification results