<a id='sec0'></a>
# Feature Engineering1
- Importing Data
- <a href='#sec1'>Exemplary Text Analysis</a>
  - <a href='#sec1_1'>Old way</a>
  - <a href='#sec1_2'>Some useful regex and lists</a>
  - <a href='#sec1_3'>Replace periods, commas, hyphens, brackets with a space, and then tokenize</a>

- <a href='#sec2'>Surveying the whole text. Looking at each type of feature</a>
  - <a href='#sec2_1'>Words with '-positive' and '-negative'</a>
  - <a href='#sec2_2'>Detect genes with hyphens, including fusions</a>
  - <a href='#sec2_3'>Mutation Types</a>
  - <a href='#sec2_4'>Gene Like Names</a>

- <a href='#sec3'>Functions for Processing Features</a>
- <a href='#sec4'>Process Features</a>
- <a href='#sec5'>Test with Random Forest</a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

sns.set_context("paper")
%matplotlib inline

<b>Importing train_text</b>

In [2]:
class_train = pd.read_csv('train_variants')
text_train = pd.read_csv("train_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [3]:
class_train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


<a id='sec1'></a>
# Exemplary Text Analysis  (<a href='#sec0'>Back To Top</a>)

In [5]:
txt1 = text_train.iloc[903, 1]

In [6]:
class_train.iloc[903, :]

ID                           903
Gene                      PDGFRA
Variation    KIF5B-PDGFRA Fusion
Class                          7
Name: 903, dtype: object

<a id='sec1_1'></a>
## Old way (<a href='#sec0'>Back To Top</a>)

In [7]:
word_tokens = word_tokenize(txt1)
word_tokens = np.array(word_tokens)

In [8]:
print('initial leng %d' % len(word_tokens))

initial leng 6402


In [9]:
stop_words = set(stopwords.words('english'))
txt1_words = [w for w in word_tokens if not w in stop_words]
print('After removing stop words %d' % len(txt1_words))

After removing stop words 4632


In [10]:
df1 = pd.DataFrame(txt1_words)
df1.columns = ['tokens']
df1.head()

Unnamed: 0,tokens
0,We
1,identified
2,two
3,patients
4,(


In [11]:
gene_ish_pattern = r"[A-Z]{2,7}"

In [12]:
# get gene-ish words in a simple list
gene_ish_words1 = [word for word in txt1_words if re.match(gene_ish_pattern, word)]

In [13]:
len(gene_ish_words1)

445

In [14]:
gene_ish_words1

['BCR-ABL',
 'FIP1L1-PDGFRA',
 'STRN-PDGFRA',
 'ETV6-PDGFRA',
 'PDGFRA',
 'FIP1L1-PDGFRA',
 'CEL',
 'CEL',
 'PDGFRA',
 'PDGFRB',
 'FGFR1',
 'JAK2',
 'PDGFRA',
 'PDGFRB',
 'PDGFRA',
 'BCR',
 'FIP1L1',
 'KIF5B',
 'CDK5RAP2',
 'CEL',
 'CEL',
 'MPD',
 'MPD',
 'XY',
 'XY',
 'PCR',
 'RT-PCR',
 'PDGFRA',
 'PCR',
 'STRN-PDGFRA',
 'DNA',
 'STRN.Fusion.Ex6.1F',
 'PDGFRA.Fusion.Ex12.2R',
 'ETV6-PDGFRA',
 'ETV6.Fusion.Ex6.1F',
 'PDGFRA.Fusion.Ex12.2R',
 'MRD',
 'FISH',
 'FISH',
 'BAC',
 'PDGFRA',
 'ETV6',
 'BAC',
 'DNA',
 'UK',
 'BACs',
 'RP11-24O10',
 'PDGFRA',
 'RP11-434C1',
 'ETV6',
 'PCR',
 'PDGFRA',
 'BCR-ABL',
 'FIP1L1-PDGFRA',
 'RT-PCR',
 'FISH',
 'PDGFRA',
 'PDGFRA',
 'PDGFRA',
 'PCR',
 'STRN-PDGFRA',
 'PCR',
 'PCR',
 'PCR',
 'PDAI12-R4',
 'DNA',
 'PCR',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'DNA',
 'WW-like',
 'WW',
 'WD40',
 'WW',
 'STRN-PDGFRA',
 'STRN',
 'STRN',
 'NM_003162',
 'PDGFRA',
 'DNA',
 'DNA',
 'PDGFRA-STRN',
 'PCR',
 'PDGFRA',
 'FIP1L1-PDGFRA',
 'PDGFRA',
 'AG',
 'STRN',
 'ETV6-PD

<a id='sec1_2'></a>
## Some useful regex and lists (<a href='#sec0'>Back To Top</a>)

<b>Keep on adding words that are too common in 'commoners' list!</b>

In [15]:
commoners = ['RT', 'PCR', 'RT-PCR', 'DNA', 'cDNA', 'RNA', 'mRNA', 'protein', 'cell', 'cancer', 'CHIP', 'FISH', 'UK', 'USA']

<b>Keep words with '-positive' and '-negative'</b>

In [16]:
[word for word in txt1_words if '-positive' in word.lower()]

['FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive']

In [17]:
[word for word in txt1_words if '-negative' in word.lower()]

['FIP1L1-PDGFRA-negative', 'FIP1L1-PDGFRA-negative']

<b>Detect gene fusions</b>

In [18]:
fuse_pattern = r"[A-Z]{2,7}-[A-Z]{2,7}"
[word for word in txt1_words \
 if word not in commoners \
 if re.search(fuse_pattern, word)]

['BCR-ABL',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'BCR-ABL',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'PDGFRA-STRN',
 'PDGFRA-ETV6',
 'BCR-ABL',
 'BCR-ABL',
 'BCR-ABL',
 'BCR-ABL',
 'BCR-PDGFRA',
 'BCR-ABL',
 'PDGFRA-KIF5B']

<b>Detect genes with hyphens, including fusions</b>

In [19]:
# Remove nucleotide sequences
nts = r"[ACTGU]{4,}"

In [20]:
gene_pattern1 = r"([A-Z]{2,7})([0-9]{1,4})?-([A-Z]{0,7})([0-9]{1,4})?$"
[word for word in txt1_words \
 if word not in commoners \
 if re.search(gene_pattern1, word) \
 if not re.search(nts, word)]

['BCR-ABL',
 'STRN-PDGFRA',
 'ETV6-PDGFRA',
 'STRN-PDGFRA',
 'ETV6-PDGFRA',
 'BCR-ABL',
 'STRN-PDGFRA',
 'PDAI12-R4',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'PDGFRA-STRN',
 'ETV6-PDGFRA',
 'PDGFRA-ETV6',
 'ETV6-PDGFRA',
 'ETV6-PDGFRA',
 'ETV6-PDGFRA',
 'ETV6-PDGFRA',
 'ETV6-PDGFRA',
 'ETV6-CHIC2',
 'ETV6-PDGFRA',
 'ETV6-PDGFRA',
 'BCR-ABL',
 'BCR-ABL',
 'PDA-R1',
 'PDA-R2',
 'PDAI12-R3',
 'PDAI12-R3',
 'PDAI12-R4',
 'BCR-ABL',
 'PDAI12-R3',
 'NVAMP-1',
 'PDAI12-R4',
 'NVAMP-2',
 'BCR-ABL',
 'BCR-PDGFRA',
 'MG-63',
 'MG-63',
 'BCR-ABL',
 'PDAI12-R4',
 'PDAI12-R4',
 'PDAI12-R3']

<b>Mutation Types (previously explored)</b>

In [21]:
mutation_patterns = ['truncation', 'deletion', 'promoter', 
                     'amplification', 'epigenetic', 'frame', 
                     'overexpression', 'duplication', 'insertion',
                     'subtype', 'fusion', 'splice', 'wildtype']

In [22]:
[word.lower() for word in txt1_words if word.lower() in mutation_patterns]

['fusion',
 'amplification',
 'fusion',
 'fusion',
 'amplification',
 'frame',
 'amplification',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'splice',
 'splice',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'amplification',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'truncation',
 'truncation',
 'fusion',
 'fusion',
 'truncation',
 'deletion',
 'fusion',
 'overexpression',
 'fusion',
 'overexpression',
 'amplification',
 'fusion',
 'fusion',
 'overexpression',
 'fusion',
 'deletion',
 'fusion',
 'fusion',
 'fusion',
 'overexpression',
 'fusion',
 'amplification',
 'fusion',
 'amplification',
 'amplification',
 'overexpression',
 'amplification',
 'overexpression',
 'fusion',
 'fusion',
 'fusion',
 'amplification',
 'fusion',
 'fusion',
 'amplification',
 'overexpression',
 'fusion',
 'amplification',
 'amplification',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'fusion',
 'amplificati

<a id='sec1_3'></a>
## Replace periods, commas, hyphens, brackets with a space, and then tokenize (<a href='#sec0'>Back To Top</a>)

In [23]:
tokens = word_tokenize(txt1)
tokens = np.array(tokens)
print('initial length %d' % len(tokens))

initial length 6402


In [24]:
txt1_white = txt1.encode().decode()  # copy a string?!
txt1_white = txt1_white.replace('"', ' ')
txt1_white = txt1_white.replace('.', ' ')
txt1_white = txt1_white.replace('\'', ' ')
txt1_white = txt1_white.replace('_', ' ')    # This should be done after _pos, etc checked
#txt1_white = txt1_white.replace('-', ' ')    # This should be done after -pos, etc checked
txt1_white = txt1_white.replace('=', ' ')
txt1_white = txt1_white.replace('\n', ' ')
txt1_white = txt1_white.replace('\\n', ' ')
txt1_white = txt1_white.replace('\'', ' ')
txt1_white = re.sub(' +',' ', txt1_white) 
txt1_white = txt1_white.replace('\'', ' ')
txt1_white = txt1_white.replace('(', ' ')
txt1_white = txt1_white.replace(')', ' ')
txt1_white = txt1_white.replace('[', ' ')
txt1_white = txt1_white.replace(']', ' ')
txt1_white = txt1_white.replace('{', ' ')
txt1_white = txt1_white.replace('}', ' ')

In [25]:
tokens_white = word_tokenize(txt1_white)
tokens_white = np.array(tokens_white)
print('initial length %d' % len(tokens_white))

initial length 5891


In [26]:
tokens_white = [word for word in tokens_white if not word in stop_words]
tokens_white = [word for word in tokens_white if not word in commoners]
print('After removing stop-words & commoners %d' % len(tokens_white))

After removing stop-words & commoners 3987


In [27]:
# Remove nucleotide sequences
nts = r"[ACTGU]{4,}"
tokens_white = [word for word in tokens_white if not re.search(nts, word)]
print('After removing DNA/RNA seqs %d' % len(tokens_white))

After removing DNA/RNA seqs 3967


This time switch to re.search t pickup pattern not just from the beginning of the words

In [28]:
gene_ish_words_white = [word for word in tokens_white if re.search(gene_ish_pattern, word)]
print('# of gene-ish words: %d' % len(gene_ish_words_white))

# of gene-ish words: 364


In [29]:
gene_ish_words_white

['BCR-ABL',
 'FIP1L1-PDGFRA',
 'STRN-PDGFRA',
 'ETV6-PDGFRA',
 'PDGFRA',
 'FIP1L1-PDGFRA',
 'CEL',
 'CEL',
 'PDGFRA',
 'PDGFRB',
 'FGFR1',
 'JAK2',
 'PDGFRA',
 'PDGFRB',
 'PDGFRA',
 'BCR',
 'FIP1L1',
 'KIF5B',
 'CDK5RAP2',
 'CEL',
 'CEL',
 'MPD',
 'MPD',
 'XY',
 'XY',
 'PDGFRA',
 'STRN-PDGFRA',
 'STRN',
 'PDGFRA',
 'ETV6-PDGFRA',
 'ETV6',
 'PDGFRA',
 'MRD',
 'BAC',
 'PDGFRA',
 'ETV6',
 'BAC',
 'BACs',
 'RP11-24O10',
 'PDGFRA',
 'RP11-434C1',
 'ETV6',
 'PDGFRA',
 'BCR-ABL',
 'FIP1L1-PDGFRA',
 'PDGFRA',
 'PDGFRA',
 'PDGFRA',
 'STRN-PDGFRA',
 'HaeIII',
 'PDAI12-R4',
 'HaeIII',
 'STRN-PDGFRA',
 'STRN-PDGFRA',
 'WW-like',
 'WW',
 'WD40',
 'WW',
 'ΔWW',
 'STRN-PDGFRA',
 'STRN',
 'STRN',
 'NM',
 'PDGFRA',
 'PDGFRA-STRN',
 'PDGFRA',
 'FIP1L1-PDGFRA',
 'PDGFRA',
 'AG',
 'STRN',
 'ETV6-PDGFRA',
 'ETV6',
 'NM',
 'PDGFRA',
 'PDGFRA-ETV6',
 'ETV6-PDGFRA',
 'ETV6',
 'PDGFRA',
 'ETV6-PDGFRA',
 'BACs',
 'RP11-24O10',
 'RP11-434C1',
 'ETV6',
 'PDGFRA',
 'ETV6-PDGFRA',
 'ETV6',
 'PDGFRA',
 'ETV6-PDGFRA'

In [30]:
df1 = pd.DataFrame(gene_ish_words_white)
df1.columns = ['tokens']
print(len(df1))

364


In [31]:
table1 = df1.groupby('tokens').size().reset_index()
table1.columns = ['tokens', 'appearances']  # Use rename!!!

In [32]:
table1.sort_values('appearances', ascending=False).head(15)

Unnamed: 0,tokens,appearances
58,PDGFRA,91
24,FIP1L1-PDGFRA,24
33,KIF5B,19
34,KIF5B-PDGFRA,14
30,IHES,13
16,ETV6,10
18,ETV6-PDGFRA,10
80,WW-like,9
12,CEL,9
73,STRN,8


<a id='sec2'></a>
# Surveying the whole text. Looking at each type of feature <br>(<a href='#sec0'>Back To Top</a>)

In [33]:
%%time
textome = ''
for i in range(len(text_train)):
    text = text_train.loc[i, 'Text'] + ''
    textome += text

CPU times: user 74.1 ms, sys: 160 ms, total: 234 ms
Wall time: 235 ms


In [68]:
len(textome)

3321

In [34]:
len(textome)

211268641

<b>Words with '-positive' and '-negative'</b>

In [35]:
[word for word in txt1_words if '-positive' in word.lower()]

['FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive',
 'FIP1L1-PDGFRA-positive']

In [35]:
%%time
textome_tokens = word_tokenize(textome)

CPU times: user 3min 5s, sys: 596 ms, total: 3min 5s
Wall time: 3min 6s


initial leng 37460802


In [36]:
print('initial leng %d' % len(textome_tokens))

initial leng 37460802


In [37]:
%%time
stop_words = set(stopwords.words('english'))
textome_tokens = [token for token in textome_tokens if not token in stop_words]

CPU times: user 2.41 s, sys: 50.9 ms, total: 2.46 s
Wall time: 2.46 s


In [38]:
print('After removing stop words %d' % len(textome_tokens))

After removing stop words 27557071


Remove (E)GFP tags

<b>Words with '-positive' and '-negative'</b>

In [39]:
%%time
# Remove (E)GFP taggs
for i, word in enumerate(textome_tokens):
    if 'EGFP/EYFP' in word:
        textome_tokens[i] = word.replace('EGFP/EYFP' + '-', '')
    if 'EGFP' in word:
        textome_tokens[i] = word.replace('EGFP' + '-', '')
    if 'GFP' in word:
        textome_tokens[i] = word.replace('GFP' + '-', '')
    if 'GST' in word:
        textome_tokens[i] = word.replace('GST' + '-', '')

CPU times: user 3.95 s, sys: 1e+03 ns, total: 3.95 s
Wall time: 3.95 s


<a id='sec2_1'></a>
## Words with '-positive' and '-negative' (<a href='#sec0'>Back To Top</a>)

In [138]:
%%time
pos_neg = [token.lower() for token in textome_tokens \
           if (('-positive' in token.lower()) | ('-negative' in token.lower())) \
           if not (('false' in token.lower()) | ('true' in token.lower()))]

for i, word in enumerate(pos_neg):
    if 'egfp/eyfp' in word: 
        pos_neg[i] = word.replace('egfp/eyfp-', '')
    elif 'egfp' in word:
        pos_neg[i] = word.replace('egfp-', '')
    elif 'gfp' in word:
        pos_neg[i] = word.replace('gfp'+"–", '') #This hyphen is somehow different from the other one

pos_neg = [token for token in pos_neg \
           if not ((token == '-positive') | (token == '-negative'))]

CPU times: user 4.62 s, sys: 3.01 ms, total: 4.62 s
Wall time: 4.62 s


In [116]:
gfp_pattern = r"e?gfp-"

In [None]:
textome_tokens = re.sub(gfp_pattern, '', textome_tokens) 

In [55]:
word = 'gfp-positive'

In [56]:
word.replace('gfp'+'-', '')

'positive'

In [139]:
word = 'gfp–vhl213-positive'

In [140]:
word.replace('–', '')

'gfpvhl213-positive'

In [141]:
pos_neg

['erα-positive',
 'cbl-positive',
 'mutation-positive',
 'dominant-negative',
 'dominant-negative',
 'dominant-negative',
 'phospho-stat5-positive',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'bcr-abl1-negative',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617fjak2-negative',
 'v617jak2-positive',
 'v617fjak2-positive',
 'v617fjak2-negative',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v617fjak2-positive',
 'v-positive',
 'v-positive',
 'dominant-negativ

In [142]:
df_pos_neg = pd.DataFrame(pos_neg, columns=['pos_neg'])

In [143]:
df_pos_neg.groupby('pos_neg').size().sort_values(ascending=False).head(15)

pos_neg
dominant-negative    1853
mutation-positive     588
alk-positive          566
her2-positive         277
mutation-negative     265
pan-negative          255
fusion-positive       253
dsred-positive        216
er-positive           191
triple-negative       191
her2-negative         126
fish-positive         101
ros1-positive          98
double-positive        97
cd117-positive         76
dtype: int64

In [144]:
df_pos_neg[df_pos_neg['pos_neg'].str.contains('gfp')]['pos_neg'].unique()

array([], dtype=object)

<a id='sec2_2'></a>
## Detect genes with hyphens, including fusions (<a href='#sec0'>Back To Top</a>)

In [182]:
commoners = ['RT', 'PCR', 'RT-PCR', 'DNA', 'cDNA', 'RNA', 'mRNA', 'siRNA', 'shRNA', 'protein', 
             'cell', 'cancer', 'CHIP', 'FISH', 'SDS-PAGE', 'UK', 'USA', 'GST', 'GFP', 'SDS', 'PAGE',
             'qPCR', 'PBS', 'TBS', 'DTT', 'BSA', 'HSA', 'HCl', 'NCBI', 'PBST', 'ANOVA', 'RIKEN',
             'COHORT', 'OUTCOME', 'AIRWAY', 'EMSA']

In [81]:
word = 'gfp-positive'dd

In [88]:
word.replace('gfp'+'-', '')

'positive'

In [90]:
[word for word in pos_neg if 'gfp' in word]

[]

In [146]:
%%time
# Remove nucleotide sequences
nts = r"[ACTGU]{4,}"

# fusion-like pattern
fusion_like_pattern = r"([A-Z]{2,7})([0-9]{1,4})?-([A-Z]{0,7})([0-9]{1,4})?$"

fusion_like = [word.lower() for word in textome_tokens \
               if word not in commoners \
               if re.search(fusion_like_pattern, word) \
               if not re.search(nts, word)]

for i, word in enumerate(fusion_like):
    if 'gst-' in word: 
        fusion_like[i] = word.replace('gst-', '')

CPU times: user 21 s, sys: 8.96 ms, total: 21 s
Wall time: 21 s


In [147]:
fusion_like

['pjg4-5',
 'sc-2033',
 'cdk10-',
 'cbl-3',
 'rpmi-1640',
 'exosap-it',
 'np-40',
 'pca-96',
 'flt-1',
 'cbl-3',
 'rpmi-1640',
 'exosap-it',
 'np-40',
 'pca-96',
 'flt-1',
 'bcr-abl',
 'bcr-abl',
 'pcmv-ha',
 'bcr-abl',
 'il-3',
 'il-3',
 'ns-1',
 'nup98-hoxd13',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-s',
 'mm-pbsa',
 'cbl-s',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-e2',
 'cbl-s',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'ring-tkbd',
 'cbl-e2',
 'ring-tkbd',
 'cbl-e2',
 'cbl-e3',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-s',
 'mm-pbsa',
 'cbl-s',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-e2',
 'cbl-s',
 'cbl-e2',
 'cbl-e2',
 'cbl-s

In [148]:
df_fusion_like = pd.DataFrame(fusion_like, columns=['fusion_like'])

In [149]:
df_fusion_like.groupby('fusion_like').size().sort_values(ascending=False).head(15)

fusion_like
nih-pa           3399
il-3             3297
bcr-abl          2091
flt3-itd         2031
eml4-alk         1557
cos-7            1086
src-3             994
shp-2             970
mcf-7             911
npm-alk           773
dr-gfp            745
ar-v7             728
wm278-nic-gfp     688
msi-h             674
wtb-raf           672
dtype: int64

<a id='sec2_3'></a>
## Mutation Types (<a href='#sec0'>Back To Top</a>)

In [150]:
mutation_patterns = ['truncation', 'deletion', 'promoter', 
                     'amplification', 'epigenetic', 'frame', 
                     'overexpression', 'duplication', 'insertion',
                     'subtype', 'fusion', 'splice', 'wildtype']

In [151]:
%%time
mutation_types = [word.lower() for word in textome_tokens if word.lower() in mutation_patterns]

CPU times: user 5.59 s, sys: 3.99 ms, total: 5.59 s
Wall time: 5.6 s


In [152]:
df_mutation_types = pd.DataFrame(mutation_types, columns=['mutation_types'])

In [153]:
df_mutation_types.groupby('mutation_types').size().sort_values(ascending=False)

mutation_types
fusion            17133
deletion          15717
amplification     13849
promoter          10147
overexpression     7572
splice             5974
insertion          4925
subtype            2440
truncation         2122
duplication        1925
epigenetic         1889
frame              1611
wildtype           1498
dtype: int64

<a id='sec2_4'></a>
## Gene Like Names (<a href='#sec0'>Back To Top</a>)

In [168]:
%%time
txt1_white = textome.encode().decode()  # copy a string?!
txt1_white = txt1_white.replace('"', ' ')
txt1_white = txt1_white.replace('.', ' ')
txt1_white = txt1_white.replace('/', ' ')
txt1_white = txt1_white.replace('\'', ' ')
txt1_white = txt1_white.replace('_', ' ')    # This should be done after _pos, etc checked
txt1_white = txt1_white.replace('-', ' ')    # This should be done after -pos, etc checked
txt1_white = txt1_white.replace('=', ' ')
txt1_white = txt1_white.replace('\n', ' ')
txt1_white = txt1_white.replace('\\n', ' ')
txt1_white = txt1_white.replace('\'', ' ')
txt1_white = re.sub(' +',' ', txt1_white) 
txt1_white = txt1_white.replace('\'', ' ')
txt1_white = txt1_white.replace('(', ' ')
txt1_white = txt1_white.replace(')', ' ')
txt1_white = txt1_white.replace('[', ' ')
txt1_white = txt1_white.replace(']', ' ')
txt1_white = txt1_white.replace('{', ' ')
txt1_white = txt1_white.replace('}', ' ')

CPU times: user 10.5 s, sys: 2.65 s, total: 13.1 s
Wall time: 13.1 s


In [169]:
%%time
tokens_white = word_tokenize(txt1_white)

CPU times: user 1min 48s, sys: 500 ms, total: 1min 49s
Wall time: 1min 49s


In [183]:
%%time
# Remove nucleotide sequences
nts = r"[ACTGU]{4,}"
tokens_white = [word for word in tokens_white \
                if not word in stop_words \
                if not word in commoners \
                if not re.search(nts, word)]

CPU times: user 24.6 s, sys: 43.9 ms, total: 24.7 s
Wall time: 24.7 s


In [188]:
%%time
gene_ish_pattern1 = r"[A-Z]{2,7}[0-9]{0,4}$"
gene_ish_pattern2 = r"p[0-9]{1,3}$"
gene_ish_words = [word for word in tokens_white if re.match(gene_ish_pattern1, word)]
gene_ish_words += [word for word in tokens_white if re.match(gene_ish_pattern2, word)]
print('# of gene-ish words: %d' % len(gene_ish_words_white))

# of gene-ish words: 364
CPU times: user 29.6 s, sys: 0 ns, total: 29.6 s
Wall time: 29.7 s


In [191]:
set(gene_ish_words)

{'KEGG',
 'ASS',
 'FRASER',
 'PHD',
 'FY23',
 'OHA',
 'WHSC2',
 'DFS',
 'KIAA1509',
 'FFAR2',
 'SOS1',
 'NSCPO',
 'FYVE',
 'RNF4',
 'UBI',
 'PTCH1',
 'RASU2',
 'PVALB',
 'NB',
 'RIKEN',
 'HCC2429',
 'AXIN1',
 'BT4',
 'MSN2',
 'EPHA2',
 'HM48',
 'ARV7',
 'NOX4',
 'QTR2',
 'BME',
 'FINGER',
 'NCOA6',
 'PGCTYS',
 'LAML',
 'AB1442',
 'FLVRDAS',
 'SLIDE',
 'CXCL3',
 'CYVD',
 'ROBETTA',
 'CSE1',
 'WHATIF',
 'NACC2',
 'META',
 'NXPH1',
 'CAPZB',
 'OD2',
 'CAND',
 'ZNF639',
 'DAB2',
 'IRC',
 'GAG',
 'BZS',
 'AVF',
 'LYS505',
 'TYYL',
 'EN26',
 'ZWILCH',
 'PGBM14',
 'IIA110',
 'TALL',
 'LSL',
 'SKBR',
 'VHL30',
 'HOXB8',
 'AZD2644',
 'GOPC',
 'RAS10',
 'ASPL',
 'ATHB6',
 'PRCC',
 'CUCRC040',
 'GHE0536',
 'EA4340',
 'MARCKS',
 'FOXA',
 'PD0119',
 'RCPHHER',
 'LSD1',
 'EPOCH',
 'GMS',
 'SCCOHT1',
 'TSS1',
 'ALH3000',
 'CAC',
 'RAL',
 'CLD',
 'TRAF7',
 'ML3000',
 'UMUC17',
 'WO',
 'IDH1',
 'ABI7000',
 'RENZO',
 'GMCSFR',
 'RSB2',
 'COLO205',
 'BRCAX',
 'LZTR1',
 'VAMP',
 'PR110',
 'DOGS',
 'CEN8',

<a id='sec3'></a>
# Functions for Processing Features (<a href='#sec0'>Back To Top</a>)

In [192]:
commoners = ['RT', 'PCR', 'RT-PCR', 'DNA', 'cDNA', 'RNA', 'mRNA', 'siRNA', 'shRNA', 'protein', 
             'cell', 'cancer', 'CHIP', 'FISH', 'SDS-PAGE', 'UK', 'USA', 'GST', 'GFP', 'SDS', 'PAGE',
             'qPCR', 'PBS', 'TBS', 'DTT', 'BSA', 'HSA', 'HCl', 'NCBI', 'PBST', 'ANOVA', 'RIKEN',
             'COHORT', 'OUTCOME', 'AIRWAY', 'EMSA']

In [193]:
def get_positive_and_negative_words(tokens):
    pos_neg = [token.lower() for token in tokens \
           if (('-positive' in token.lower()) | ('-negative' in token.lower())) \
           if not (('false' in token.lower()) | ('true' in token.lower()))]
    
    # Remove GFP tag
    for i, word in enumerate(pos_neg):
        if 'egfp/eyfp' in word: 
            pos_neg[i] = word.replace('egfp/eyfp-', '')
        elif 'egfp' in word:
            pos_neg[i] = word.replace('egfp-', '')
        elif 'gfp' in word:
            pos_neg[i] = word.replace('gfp'+"–", '') #This hyphen is somehow different from the other one

    pos_neg = [token for token in pos_neg \
               if not ((token == '-positive') | (token == '-negative'))]
    
    return pos_neg

In [194]:
def get_fusion_like_words(tokens, commoners):
    # Remove nucleotide sequences
    nts = r"[ACTGU]{4,}"
    
    # fusion-like pattern
    fusion_like_pattern = r"([A-Z]{2,7})([0-9]{1,4})?-([A-Z]{0,7})([0-9]{1,4})?$"

    fusion_like = [word.lower() for word in tokens \
                   if word not in commoners \
                   if re.search(fusion_like_pattern, word) \
                   if not re.search(nts, word)]
    
    # Remove GST-tagg
    for i, word in enumerate(fusion_like):
        if 'gst-' in word: 
            fusion_like[i] = word.replace('gst-', '')
    
    return fusion_like

In [195]:
def get_mutation_type_words(tokens):
    mutation_patterns = ['truncation', 'deletion', 'promoter', 
                     'amplification', 'epigenetic', 'frame', 
                     'overexpression', 'duplication', 'insertion',
                     'subtype', 'fusion', 'splice', 'wildtype']
    
    mutation_type_words = [word.lower() for word in tokens if word.lower() in mutation_patterns]
    
    return mutation_type_words

In [196]:
def get_gene_like_words(text, commoners):
    text_white = text.encode().decode()  # copy a string?!
    text_white = text_white.replace('"', ' ')
    text_white = text_white.replace('.', ' ')
    text_white = text_white.replace('/', ' ')
    text_white = text_white.replace('\'', ' ')
    text_white = text_white.replace('_', ' ')    # This should be done after _pos, etc checked
    text_white = text_white.replace('-', ' ')    # This should be done after -pos, etc checked
    text_white = text_white.replace('=', ' ')
    text_white = text_white.replace('\n', ' ')
    text_white = text_white.replace('\\n', ' ')
    text_white = text_white.replace('\'', ' ')
    text_white = re.sub(' +',' ', text_white) 
    text_white = text_white.replace('\'', ' ')
    text_white = text_white.replace('(', ' ')
    text_white = text_white.replace(')', ' ')
    text_white = text_white.replace('[', ' ')
    text_white = text_white.replace(']', ' ')
    text_white = text_white.replace('{', ' ')
    text_white = text_white.replace('}', ' ')

    tokens_white = word_tokenize(text_white)
    
    # Remove nucleotide sequences
    nts = r"[ACTGU]{4,}"
    
    tokens_white = [word for word in tokens_white \
                    if not word in stop_words \
                    if not word in commoners \
                    if not re.search(nts, word)]
    
    gene_ish_pattern1 = r"[A-Z]{2,7}[0-9]{0,4}$"
    gene_ish_pattern2 = r"p[0-9]{1,3}$"
    gene_ish_words = [word for word in tokens_white if re.match(gene_ish_pattern1, word)]
    gene_ish_words += [word for word in tokens_white if re.match(gene_ish_pattern2, word)]
    
    return gene_ish_words

In [218]:
def count_appearances(list_of_words):
    unique_words = set(list_of_words)
    appearances = {}
    for unique_word in unique_words:
        count = len([word for word in list_of_words if word == unique_word])
        appearances[unique_word] = count
    
    return appearances

In [243]:
text1 = text_train.loc[2105, 'Text']
tokens1 = word_tokenize(text1)

In [244]:
count_appearances(get_positive_and_negative_words(tokens1))

{'mhc-positive': 2}

In [245]:
count_appearances(get_fusion_like_words(tokens1, commoners))

{'crl-1772': 1,
 'ct-tc': 1,
 'hg-u133': 1,
 'mck-cat': 2,
 'mf-20': 1,
 'mlp-cat': 4,
 'p3300mck-cat': 1,
 'pax-foxo1': 3,
 'pax3-foxo1': 1,
 'pax7-foxo1': 1,
 'pmlp-cat': 3,
 'sms-ctr': 1}

In [246]:
count_appearances(get_mutation_type_words(tokens1))

{'amplification': 3,
 'duplication': 1,
 'fusion': 8,
 'promoter': 9,
 'splice': 1,
 'subtype': 1,
 'wildtype': 1}

In [247]:
count_appearances(get_gene_like_words(text1, commoners))

{'ABC': 1,
 'AD': 2,
 'AKT': 3,
 'AL': 1,
 'ALK': 1,
 'ALN': 1,
 'ALT': 1,
 'AND': 1,
 'AP4': 1,
 'ARMS': 8,
 'BAM': 1,
 'BD': 2,
 'BJ': 6,
 'BWA': 3,
 'CA': 11,
 'CANNTG': 3,
 'CASAVA': 1,
 'CAT': 21,
 'CBFJ': 1,
 'CC': 2,
 'CC1': 1,
 'CGG': 1,
 'CLN': 1,
 'CMV': 3,
 'CMYC': 1,
 'CRL': 1,
 'CT': 1,
 'CTG': 1,
 'CTR': 1,
 'CZ': 1,
 'DAPI': 1,
 'DI': 1,
 'DMEM': 6,
 'DP': 1,
 'DSHB': 1,
 'EDTA': 2,
 'EMD': 1,
 'EO': 3,
 'ERMS': 37,
 'EXOME': 1,
 'FASTQ': 1,
 'FASTX': 2,
 'FBS': 3,
 'FFPE': 4,
 'FGFR4': 2,
 'FI': 1,
 'FIG': 4,
 'FOXO1': 5,
 'GAC': 4,
 'GAPDH': 1,
 'GATK': 8,
 'GSEA': 5,
 'GTF': 1,
 'HA': 2,
 'HG': 1,
 'HLH': 4,
 'IA': 1,
 'IHC': 2,
 'II': 1,
 'IIII': 1,
 'IL': 1,
 'INDEL': 1,
 'JAF': 1,
 'JB': 1,
 'LOH': 2,
 'MA': 3,
 'MAB3878': 1,
 'MACS': 2,
 'MALDI': 3,
 'MAPQ': 1,
 'MCK': 9,
 'METHODS': 2,
 'MF': 1,
 'MF20': 1,
 'MHC': 3,
 'MI': 1,
 'MLP': 8,
 'MLPCAT': 1,
 'MO': 3,
 'MRF4': 2,
 'MS': 4,
 'MSKCC': 4,
 'MYC': 23,
 'MYCN': 2,
 'MYF5': 3,
 'MYF6': 1,
 'MYOD': 1,
 'MYOD1

<a id='sec4'></a>
# Process Features (<a href='#sec0'>Back To Top</a>)

In [248]:
%%time
pos_neg_list = []
fusion_like_list = []
mutation_type_list = []
gene_like_list = []

for i in range(len(text_train)):
    text = text_train.loc[i, 'Text']
    tokens = word_tokenize(text)
    
    pos_neg_list.append(count_appearances(get_positive_and_negative_words(tokens)))
    fusion_like_list.append(count_appearances(get_fusion_like_words(tokens, commoners)))
    mutation_type_list.append(count_appearances(get_mutation_type_words(tokens)))
    gene_like_list.append(count_appearances(get_gene_like_words(text, commoners)))

CPU times: user 7min 12s, sys: 8.84 ms, total: 7min 12s
Wall time: 7min 12s


In [249]:
pos_neg_table = pd.DataFrame(pos_neg_list)
gene_like_table = pd.DataFrame(gene_like_list)
fusion_like_table = pd.DataFrame(fusion_like_list)
mutation_type_table = pd.DataFrame(mutation_type_list)

In [250]:
pos_neg_table = pos_neg_table.fillna(value=0)
gene_like_table = gene_like_table.fillna(value=0)
fusion_like_table = fusion_like_table.fillna(value=0)
mutation_type_table = mutation_type_table.fillna(value=0)

In [251]:
pos_neg_table.head(10)

Unnamed: 0,'dominant-negative,'mutation-negative,-stain-negative,.non-negative,/aml1-eto-positive,/dsred-positive,/kit-positive,10-67-positive,1100delc-positive,11q-aupd-positive,...,zsgreen-positive,α-actinin-positive,α-negative,α-positive,β-galactosidase-positive,β1-tubulin-positive,β2m-negative,β2m-positive,γ-h2ax-positive,γh2ax-positive
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [252]:
gene_like_table.head(10)

Unnamed: 0,AA,AA0869,AA1077,AA1370,AA2,AA26,AA2711,AA2800,AA4345,AA491,...,p815,p83,p84,p85,p86,p89,p9,p90,p95,p97
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [253]:
fusion_like_table.head(10)

Unnamed: 0,Unnamed: 1,'251i-labeledfo-,'whole-exome,**pax5-jak2,+ca-alk5,+il-3,-aif4-,-akt-mtor,-akt-s6,-bcor-rara,...,δbrip1-pcdna-dest47,δc·abt-737,δegf-myc,δg466vb-raf,δgh2oi-n,δgh2ou-i,δhm-akt1,δpr-lbd,δzf-ha,∆wd40-jak2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [254]:
mutation_type_table.head(10)

Unnamed: 0,amplification,deletion,duplication,epigenetic,frame,fusion,insertion,overexpression,promoter,splice,subtype,truncation,wildtype
0,0.0,12.0,0.0,0.0,0.0,5.0,1.0,2.0,0.0,5.0,0.0,1.0,0.0
1,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2.0,5.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2.0,28.0,1.0,0.0,7.0,3.0,2.0,4.0,0.0,13.0,1.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [255]:
print(pos_neg_table.shape)
print(gene_like_table.shape)
print(fusion_like_table.shape)
print(mutation_type_table.shape)

(3321, 765)
(3321, 18295)
(3321, 7789)
(3321, 13)


In [478]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import scale, normalize, robust_scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [257]:
# Get Gene feature from 'train_variants' data
X_gene = np.array(class_train.Gene)
X_gene_int = LabelEncoder().fit_transform(X_gene.ravel()).reshape(-1, 1)
X_gene_bin = OneHotEncoder().fit_transform(X_gene_int).toarray()

In [258]:
full_gene_table = pd.DataFrame(X_gene_bin)

In [260]:
full_gene_table.shape

(3321, 264)

<b>Combined All!</b>

In [262]:
features = pd.concat([pos_neg_table, 
                      gene_like_table,
                      fusion_like_table,
                      mutation_type_table,
                      full_gene_table],
                      axis=1)

In [263]:
features.shape

(3321, 27126)

In [264]:
class_train.Class.shape

(3321,)

<a id='sec5'></a>
# Test with Random Forest (<a href='#sec0'>Back To Top</a>)

In [577]:
X = np.array(features).astype(float)
y = np.array(class_train.Class).astype(int).ravel()

<b>un-scaled</b>

In [578]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=55)

In [579]:
%%time
rfc1 = RandomForestClassifier(n_estimators=50, max_depth=100, n_jobs=4)
rfc1.fit(X_train, y_train)

CPU times: user 15.2 s, sys: 93.3 ms, total: 15.3 s
Wall time: 4.08 s


In [580]:
y_pred = rfc1.predict(X_test)

In [581]:
print(accuracy_score(y_test, y_pred))

0.685370741483


<b>scaled</b>

In [582]:
X_scaled = scale(X)

In [583]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.15, random_state=55)

In [584]:
%%time
rfc2 = RandomForestClassifier(n_estimators=50, max_depth=100, n_jobs=4)
rfc2.fit(X_train, y_train)

CPU times: user 13.5 s, sys: 66.8 ms, total: 13.6 s
Wall time: 3.65 s


In [585]:
y_pred = rfc2.predict(X_test)

In [586]:
print(accuracy_score(y_test, y_pred))

0.661322645291


<b>normalized</b>

In [587]:
X_normed = normalize(X)

In [588]:
X_train, X_test, y_train, y_test = train_test_split(X_normed, y, test_size=0.15, random_state=55)

In [589]:
%%time
rfc3 = RandomForestClassifier(n_estimators=50, max_depth=100, n_jobs=4)
rfc3.fit(X_train, y_train)

CPU times: user 14.5 s, sys: 76.2 ms, total: 14.6 s
Wall time: 3.95 s


In [590]:
y_pred = rfc3.predict(X_test)

In [591]:
print(accuracy_score(y_test, y_pred))

0.673346693387


<b>Select Features from the Best RFC above</b>

In [592]:
importances = rfc1.feature_importances_
indices = np.argsort(importances)[::-1]
features1000 = indices[:1000]

In [593]:
X100 = X[:, features1000]

In [594]:
X_train, X_test, y_train, y_test = train_test_split(X100, y, test_size=0.15, random_state=55)

In [595]:
%%time
rfc4 = RandomForestClassifier(n_estimators=50, max_depth=100, n_jobs=4)
rfc4.fit(X_train, y_train)

CPU times: user 742 ms, sys: 5.03 ms, total: 747 ms
Wall time: 231 ms


In [596]:
y_pred = rfc4.predict(X_test)

In [597]:
print(accuracy_score(y_test, y_pred))

0.663326653307


<b>RobustScale</b>

In [598]:
X_robust = robust_scale(X)

In [599]:
X_train, X_test, y_train, y_test = train_test_split(X_robust, y, test_size=0.15, random_state=55)

In [600]:
%%time
rfc5 = RandomForestClassifier(n_estimators=50, max_depth=100, n_jobs=4)
rfc5.fit(X_train, y_train)

CPU times: user 13.2 s, sys: 58.8 ms, total: 13.3 s
Wall time: 3.55 s


In [601]:
y_pred = rfc5.predict(X_test)

In [602]:
print(accuracy_score(y_test, y_pred))

0.657314629259


GridSearchCV

In [603]:
from sklearn.model_selection import GridSearchCV

In [628]:
parameters = {'max_depth':[40, 60, 80, 100], 'criterion':['gini', 'entropy'], 
              'class_weight':[None, 'balanced', 'balanced_subsample']}
rfc = RandomForestClassifier(n_jobs=1, n_estimators=60)
clf = GridSearchCV(rfc, parameters, n_jobs=4, verbose=2)

In [629]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=55)

In [630]:
%%time
clf.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] class_weight=None, criterion=gini, max_depth=40 .................
[CV] class_weight=None, criterion=gini, max_depth=40 .................
[CV] class_weight=None, criterion=gini, max_depth=40 .................
[CV] class_weight=None, criterion=gini, max_depth=60 .................
[CV] .. class_weight=None, criterion=gini, max_depth=40, total=   8.5s
[CV] class_weight=None, criterion=gini, max_depth=60 .................
[CV] .. class_weight=None, criterion=gini, max_depth=40, total=   8.1s
[CV] class_weight=None, criterion=gini, max_depth=60 .................
[CV] .. class_weight=None, criterion=gini, max_depth=40, total=   8.7s
[CV] class_weight=None, criterion=gini, max_depth=80 .................
[CV] .. class_weight=None, criterion=gini, max_depth=60, total=  10.2s
[CV] class_weight=None, criterion=gini, max_depth=80 .................
[CV] .. class_weight=None, criterion=gini, max_depth=60, total=   9.8s
[CV] class_weigh

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.5min


[CV]  class_weight=balanced, criterion=gini, max_depth=100, total=  11.9s
[CV] class_weight=balanced, criterion=entropy, max_depth=40 ..........
[CV]  class_weight=balanced, criterion=gini, max_depth=100, total=  11.8s
[CV] class_weight=balanced, criterion=entropy, max_depth=40 ..........
[CV]  class_weight=balanced, criterion=gini, max_depth=100, total=  11.8s
[CV]  class_weight=balanced, criterion=entropy, max_depth=40, total=   9.3s
[CV] class_weight=balanced, criterion=entropy, max_depth=60 ..........
[CV] class_weight=balanced, criterion=entropy, max_depth=60 ..........
[CV]  class_weight=balanced, criterion=entropy, max_depth=40, total=   9.4s
[CV] class_weight=balanced, criterion=entropy, max_depth=60 ..........
[CV]  class_weight=balanced, criterion=entropy, max_depth=40, total=   9.9s
[CV] class_weight=balanced, criterion=entropy, max_depth=80 ..........
[CV]  class_weight=balanced, criterion=entropy, max_depth=60, total=  10.6s
[CV] class_weight=balanced, criterion=entropy, m

[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed:  3.4min finished


CPU times: user 1min 14s, sys: 780 ms, total: 1min 15s
Wall time: 3min 38s


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'max_depth': [40, 60, 80, 100], 'criterion': ['gini', 'entropy'], 'class_weight': [None, 'balanced', 'balanced_subsample']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [631]:
best_clf = clf.best_estimator_

In [632]:
best_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [633]:
y_pred = best_clf.predict(X_test)

In [634]:
print(accuracy_score(y_test, y_pred))

0.649298597194
