In [35]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

#  Load data

In [36]:
df_train = pd.read_csv('training_variants')

In [37]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [38]:
df_test = pd.read_csv('test_variants')

In [39]:
df_test.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [40]:
# Rows are split on a literal "||" into the ID and Text columns. The separator is a
# regular expression, so the pipes are escaped; a raw string keeps the backslashes
# from being treated as (invalid) Python string escapes.
df_train_text = pd.read_csv('training_text', sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [41]:
df_train_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [42]:
# Rows are split on a literal "||" into the ID and Text columns. The separator is a
# regular expression, so the pipes are escaped; a raw string keeps the backslashes
# from being treated as (invalid) Python string escapes.
df_test_text = pd.read_csv('test_text', sep=r"\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [43]:
df_test_text.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [44]:
df_submission = pd.read_csv('submissionFile')

In [45]:
df_submission.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,0,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0
3,3,0,0,0,0,0,0,0,1,0
4,4,0,0,0,1,0,0,0,0,0


#  merging test and train data for processing

In [46]:
df_test["Class"] = -1

In [47]:
# Stack the training rows on top of the test rows (test rows carry Class == -1).
# No index reset happens here, so each frame keeps its own row labels.
df = pd.concat(objs=[df_train, df_test], axis=0)

In [48]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
5663,5663,SLC46A1,R113S,-1
5664,5664,FOXC1,L130F,-1
5665,5665,GSS,R267W,-1
5666,5666,CTSK,G79E,-1
5667,5667,DFNB59,T54I,-1


In [49]:
df.shape

(8989, 4)

In [50]:
df_text = pd.concat([df_train_text, df_test_text])

In [51]:
df_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [52]:
df_text.shape

(8989, 2)

In [53]:
df_text.tail()

Unnamed: 0,ID,Text
5663,5663,The realization in the late 1970s that RAS har...
5664,5664,Hemizygous deletions are common molecular abno...
5665,5665,All most R267W of has with to SMARTpool invest...
5666,5666,Abstract Blood samples from 125 unrelated fami...
5667,5667,"Loss of DNA mismatch repair (MMR) in humans, m..."


In [54]:
del(df_train)


In [55]:
del(df_test)

In [56]:
del(df_train_text)

In [57]:
del(df_test_text)

#  text word and character counting

In [58]:
df['text_char_len'] = df_text['Text'].str.len()

In [59]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len
0,0,FAM58A,Truncating Mutations,1,39672
1,1,CBL,W802*,2,36691
2,2,CBL,Q249E,2,36691
3,3,CBL,N454D,3,36238
4,4,CBL,L399V,4,41308


In [60]:
# Word count per article: the number of pieces obtained by splitting on single spaces.
df['text_words'] = df_text['Text'].map(lambda text: len(text.split(" ")))

In [61]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words
0,0,FAM58A,Truncating Mutations,1,39672,6105
1,1,CBL,W802*,2,36691,5783
2,2,CBL,Q249E,2,36691,5783
3,3,CBL,N454D,3,36238,5625
4,4,CBL,L399V,4,41308,6248


In [62]:
df_text = df_text.drop('ID', axis =1)

In [63]:
df_all = pd.concat([df, df_text], axis  =1)

In [64]:
def count_shared_tokens(phrase, text):
    """Count the space-separated tokens of `phrase` that also occur as tokens of `text`.

    The text is split once and held in a set, so each membership test is a hash
    lookup instead of a fresh split plus a list scan. A token that appears several
    times in `phrase` is counted each time.
    """
    text_tokens = set(text.split(' '))
    return sum(1 for token in phrase.split(' ') if token in text_tokens)


# How many Gene / Variation tokens appear verbatim among the article's tokens.
df_all['Gene_Share'] = df_all.apply(lambda r: count_shared_tokens(r['Gene'], r['Text']), axis=1)
df_all['Variation_Share'] = df_all.apply(lambda r: count_shared_tokens(r['Variation'], r['Text']), axis=1)



In [65]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Text,Gene_Share,Variation_Share
0,0,FAM58A,Truncating Mutations,1,39672,6105,Cyclin-dependent kinases (CDKs) regulate a var...,1,1
1,1,CBL,W802*,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1
2,2,CBL,Q249E,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1
3,3,CBL,N454D,3,36238,5625,Recent evidence has demonstrated that acquired...,1,1
4,4,CBL,L399V,4,41308,6248,Oncogenic mutations in the monomeric Casitas B...,1,1


In [73]:
df_all.iloc[2].Text.split(' ').count(df_all.iloc[2].Variation)

8

In [78]:
result = ''
for item  in df_all.iloc[2].Text.split('. '):
    if df_all.iloc[2].Variation in item:
        print (item)
        result = result + " " + item
print (result)

Using select c-CBL somatic mutations such as S80N/H94Y, Q249E and W802* (obtained from Caucasian, Taiwanese and African-American samples, respectively) transfected in NSCLC cell lines, there was increased cell viability and cell motility
In this study, we report novel c-CBL somatic mutations S80N/H94Y, Q249E and W802* in Caucasian, Taiwanese and African-American lung cancer patients, respectively
Using this parental plasmid pAlterMax-c-CBL, the TKB domain double mutation (S80N/H94Y), the point mutation (Q249E), and the C-terminal point mutation W802* of c-CBL were created using the following primers: 5′-GCTGGCGCTAAAGAATAACCCACCTTATATCTTAGAC-3′ and 5′-CTACCAGATACCTACCAGTATCTCCGTACTATCTTGTC-3′ for the double mutation S80N/H94Y; 5′-CTTTACCCGACTCTTTGAGCCCTGGTCCTCTTTGC-3′ for Q249E, and 5′-CAGCTCCTCCTTTGGCTGATTGTCTCTGGATGGTGATC-3′ for W802* along with their complementary primers using the QuickChange Site-Directed Mutagenesis XL kit (Stratagene, La Jolla, CA) according to the manufacturer's

In [90]:
def key_containing_text(text, keyword):
    """Gather the pieces of `text` that mention `keyword`.

    `text` is cut on the two-character delimiter '. ' and every piece that contains
    `keyword` as a substring is kept, in its original order. The returned string
    starts with one space and each kept piece is preceded by one more space, so the
    result is a single space when nothing matches.
    """
    matching = [piece for piece in text.split('. ') if keyword in piece]
    return " " + "".join(" " + piece for piece in matching)
    

In [91]:
key_containing_text(df_all.iloc[2].Text, df_all.iloc[2].Variation )

"  Using select c-CBL somatic mutations such as S80N/H94Y, Q249E and W802* (obtained from Caucasian, Taiwanese and African-American samples, respectively) transfected in NSCLC cell lines, there was increased cell viability and cell motility In this study, we report novel c-CBL somatic mutations S80N/H94Y, Q249E and W802* in Caucasian, Taiwanese and African-American lung cancer patients, respectively Using this parental plasmid pAlterMax-c-CBL, the TKB domain double mutation (S80N/H94Y), the point mutation (Q249E), and the C-terminal point mutation W802* of c-CBL were created using the following primers: 5′-GCTGGCGCTAAAGAATAACCCACCTTATATCTTAGAC-3′ and 5′-CTACCAGATACCTACCAGTATCTCCGTACTATCTTGTC-3′ for the double mutation S80N/H94Y; 5′-CTTTACCCGACTCTTTGAGCCCTGGTCCTCTTTGC-3′ for Q249E, and 5′-CAGCTCCTCCTTTGGCTGATTGTCTCTGGATGGTGATC-3′ for W802* along with their complementary primers using the QuickChange Site-Directed Mutagenesis XL kit (Stratagene, La Jolla, CA) according to the manufacture

In [92]:
df_all['Key_text'] = " "

In [93]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Text,Gene_Share,Variation_Share,Key_text
0,0,FAM58A,Truncating Mutations,1,39672,6105,Cyclin-dependent kinases (CDKs) regulate a var...,1,1,
1,1,CBL,W802*,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1,
2,2,CBL,Q249E,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1,
3,3,CBL,N454D,3,36238,5625,Recent evidence has demonstrated that acquired...,1,1,
4,4,CBL,L399V,4,41308,6248,Oncogenic mutations in the monomeric Casitas B...,1,1,


In [96]:
# Fill one row as a quick check. The row is addressed by position on both sides:
# at this point the index still carries the labels of the concatenated train and
# test frames, so a label-based .loc[2, ...] would also write to any other row
# that happens to be labelled 2.
df_all.iloc[2, df_all.columns.get_loc('Key_text')] = key_containing_text(df_all.iloc[2].Text, df_all.iloc[2].Variation )

In [97]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Text,Gene_Share,Variation_Share,Key_text
0,0,FAM58A,Truncating Mutations,1,39672,6105,Cyclin-dependent kinases (CDKs) regulate a var...,1,1,
1,1,CBL,W802*,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1,
2,2,CBL,Q249E,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1,Using select c-CBL somatic mutations such as...
3,3,CBL,N454D,3,36238,5625,Recent evidence has demonstrated that acquired...,1,1,
4,4,CBL,L399V,4,41308,6248,Oncogenic mutations in the monomeric Casitas B...,1,1,


In [101]:
df_all = df_all.reset_index(drop=True)

In [102]:
# Build Key_text for every row in a single pass. Pairing the two columns with zip
# avoids constructing a full row Series through .iloc for each row, and the list is
# assigned positionally, so the result does not depend on the index labels.
df_all['Key_text'] = [
    key_containing_text(text, variation)
    for text, variation in zip(df_all['Text'], df_all['Variation'])
]

In [103]:
df_all.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Text,Gene_Share,Variation_Share,Key_text
0,0,FAM58A,Truncating Mutations,1,39672,6105,Cyclin-dependent kinases (CDKs) regulate a var...,1,1,
1,1,CBL,W802*,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1,Using select c-CBL somatic mutations such as...
2,2,CBL,Q249E,2,36691,5783,Abstract Background Non-small cell lung canc...,1,1,Using select c-CBL somatic mutations such as...
3,3,CBL,N454D,3,36238,5625,Recent evidence has demonstrated that acquired...,1,1,"Most of the changes were novel, although 4 c..."
4,4,CBL,L399V,4,41308,6248,Oncogenic mutations in the monomeric Casitas B...,1,1,"Finally, the third group constituted mutati..."


In [104]:
df_all.tail()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Text,Gene_Share,Variation_Share,Key_text
8984,5663,SLC46A1,R113S,-1,76385,11114,The realization in the late 1970s that RAS har...,0,1,These disease point both role the was signal...
8985,5664,FOXC1,L130F,-1,27014,4134,Hemizygous deletions are common molecular abno...,0,1,For group-by-day variants not for KIT L130F ...
8986,5665,GSS,R267W,-1,23101,3427,All most R267W of has with to SMARTpool invest...,0,1,All most R267W of has with to SMARTpool inve...
8987,5666,CTSK,G79E,-1,67269,10635,Abstract Blood samples from 125 unrelated fami...,0,1,Figure these are MYC also protein subsequent...
8988,5667,DFNB59,T54I,-1,27021,4079,"Loss of DNA mismatch repair (MMR) in humans, m...",0,1,In Author mutations response data T54I has d...


In [105]:
df_text = df_all[['ID', 'Text','Key_text']]

In [106]:
df_text.head()

Unnamed: 0,ID,Text,Key_text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,
1,1,Abstract Background Non-small cell lung canc...,Using select c-CBL somatic mutations such as...
2,2,Abstract Background Non-small cell lung canc...,Using select c-CBL somatic mutations such as...
3,3,Recent evidence has demonstrated that acquired...,"Most of the changes were novel, although 4 c..."
4,4,Oncogenic mutations in the monomeric Casitas B...,"Finally, the third group constituted mutati..."


In [107]:
df_all.columns.values

array(['ID', 'Gene', 'Variation', 'Class', 'text_char_len', 'text_words',
       'Text', 'Gene_Share', 'Variation_Share', 'Key_text'], dtype=object)

In [108]:
columns = ['ID', 'Gene', 'Variation', 'Class', 'text_char_len', 'text_words',
        'Gene_Share', 'Variation_Share']

In [109]:
# Take an explicit copy: the Gene and Variation columns of this frame are overwritten
# further down, and assigning into a selection of df_all raises pandas'
# SettingWithCopyWarning.
df = df_all[columns].copy()

In [110]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share
0,0,FAM58A,Truncating Mutations,1,39672,6105,1,1
1,1,CBL,W802*,2,36691,5783,1,1
2,2,CBL,Q249E,2,36691,5783,1,1
3,3,CBL,N454D,3,36238,5625,1,1
4,4,CBL,L399V,4,41308,6248,1,1


In [111]:
del(df_all)

# change categorical to numbers

In [112]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [113]:
le.fit(df.Gene)

LabelEncoder()

In [114]:
le.transform(df.Gene)

array([447, 216, 216, ..., 576, 314, 344])

In [115]:
df['Gene'] = le.transform(df.Gene)

In [116]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share
0,0,447,Truncating Mutations,1,39672,6105,1,1
1,1,216,W802*,2,36691,5783,1,1
2,2,216,Q249E,2,36691,5783,1,1
3,3,216,N454D,3,36238,5625,1,1
4,4,216,L399V,4,41308,6248,1,1


In [117]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share
8984,5663,1262,R113S,-1,76385,11114,0,1
8985,5664,486,L130F,-1,27014,4134,0,1
8986,5665,576,R267W,-1,23101,3427,0,1
8987,5666,314,G79E,-1,67269,10635,0,1
8988,5667,344,T54I,-1,27021,4079,0,1


In [118]:
le.fit(df.Variation)

LabelEncoder()

In [119]:
df['Variation'] = le.transform(df.Variation)

In [120]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share
0,0,447,7654,1,39672,6105,1,1
1,1,216,8255,2,36691,5783,1,1
2,2,216,5191,2,36691,5783,1,1
3,3,216,4572,3,36238,5625,1,1
4,4,216,3958,4,41308,6248,1,1


In [121]:
df = df.drop('ID', axis = 1)

In [122]:
df.head()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share
0,447,7654,1,39672,6105,1,1
1,216,8255,2,36691,5783,1,1
2,216,5191,2,36691,5783,1,1
3,216,4572,3,36238,5625,1,1
4,216,3958,4,41308,6248,1,1


# use CountVectorizer to simply convert text to vector

In [123]:
from sklearn.feature_extraction.text import CountVectorizer

In [124]:
vectorizer = CountVectorizer()

In [125]:
vectorizer.fit(df_text.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [126]:
df_text.head()

Unnamed: 0,ID,Text,Key_text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,
1,1,Abstract Background Non-small cell lung canc...,Using select c-CBL somatic mutations such as...
2,2,Abstract Background Non-small cell lung canc...,Using select c-CBL somatic mutations such as...
3,3,Recent evidence has demonstrated that acquired...,"Most of the changes were novel, although 4 c..."
4,4,Oncogenic mutations in the monomeric Casitas B...,"Finally, the third group constituted mutati..."


In [127]:
X = vectorizer.transform(df_text.Text)

In [128]:
# Turn the vectorizer output into a dense array with one column per vocabulary term
# (the frame built from it below shows columns 0..169424), so this is a large allocation.
X_array = X.toarray()

In [129]:
df_text_array = pd.DataFrame(X_array)

In [130]:
df_text_array.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
df_text_array.columns.values

array([     0,      1,      2, ..., 169422, 169423, 169424])

In [132]:
# One synthetic name per count column: vec0, vec1, ...
title = ['vec' + str(i) for i in range(df_text_array.shape[1])]

In [133]:
df_text_array.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [134]:
df_text_array.columns = title

In [135]:
df_text_array.head()

Unnamed: 0,vec0,vec1,vec2,vec3,vec4,vec5,vec6,vec7,vec8,vec9,...,vec169415,vec169416,vec169417,vec169418,vec169419,vec169420,vec169421,vec169422,vec169423,vec169424
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# do the same thing with the Key_text

In [136]:
vectorizer.fit(df_text.Key_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [138]:
X = vectorizer.transform(df_text.Key_text)

In [139]:
X_array = X.toarray()

In [140]:
df_keytext_array = pd.DataFrame(X_array)

In [141]:
title = ['keyvec{}'.format(i) for i in range(len(df_keytext_array.columns.values))]

In [142]:
df_keytext_array.columns = title

In [143]:
df_keytext_array.head()

Unnamed: 0,keyvec0,keyvec1,keyvec2,keyvec3,keyvec4,keyvec5,keyvec6,keyvec7,keyvec8,keyvec9,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# PCA analysis based on the CountVectorizer data

In [144]:
from sklearn.decomposition import PCA
# Project the token counts onto 10 principal components. random_state is pinned
# because svd_solver='auto' may select the randomized solver, whose output
# otherwise varies from run to run.
pca = PCA(n_components=10, random_state=0)
pca.fit(df_text_array)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [145]:
x = pca.transform(df_text_array)

In [146]:
df_pca = pd.DataFrame(x)

In [147]:
df_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-262.875685,-6.124869,-47.397045,8.901035,-30.396772,-20.769448,0.965126,-21.336928,-13.452046,-16.135331
1,-241.089972,50.551082,25.359335,4.148823,-25.371951,-46.133658,-7.239097,-4.004613,12.329303,6.782242
2,-241.089972,50.551082,25.359335,4.148823,-25.371951,-46.133658,-7.239097,-4.004613,12.329303,6.782242
3,-330.33514,56.771106,-0.475937,10.181769,-2.760588,9.500001,11.160319,-15.381759,-2.70597,5.970941
4,-195.602668,-14.313817,73.551084,-8.427435,-9.48614,-21.9402,19.009713,-0.774855,13.591588,53.262521


In [148]:
title_pca = ['pca{}'.format(i) for i in range(len(df_pca.columns.values))]

In [149]:
df_pca.columns = title_pca

In [150]:
df_pca.head()

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9
0,-262.875685,-6.124869,-47.397045,8.901035,-30.396772,-20.769448,0.965126,-21.336928,-13.452046,-16.135331
1,-241.089972,50.551082,25.359335,4.148823,-25.371951,-46.133658,-7.239097,-4.004613,12.329303,6.782242
2,-241.089972,50.551082,25.359335,4.148823,-25.371951,-46.133658,-7.239097,-4.004613,12.329303,6.782242
3,-330.33514,56.771106,-0.475937,10.181769,-2.760588,9.500001,11.160319,-15.381759,-2.70597,5.970941
4,-195.602668,-14.313817,73.551084,-8.427435,-9.48614,-21.9402,19.009713,-0.774855,13.591588,53.262521


# Truncated SVD

In [151]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
svd.fit(df_text_array)  

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=7,
       random_state=42, tol=0.0)

In [152]:
x = svd.transform(df_text_array)

In [153]:
df_svd = pd.DataFrame(x)

In [154]:
df_svd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,478.171759,-6.121243,-49.395202,-28.386872,-27.988532,-21.659171,0.390376,-21.553569,10.94302,-10.129255
1,500.298748,50.554514,23.324182,-26.154088,-24.886434,-48.061191,-8.28048,-4.376489,-18.548796,16.828192
2,500.298748,50.554514,23.324182,-26.154088,-24.886434,-48.061191,-8.28048,-4.376489,-18.548796,16.828192
3,411.523282,56.775203,-2.839586,-25.902034,-1.418314,7.100727,8.636949,-16.215326,-7.254008,19.529928
4,546.049232,-14.310735,71.644483,-10.266536,-13.081652,-24.891459,16.516488,-1.8331,-27.614176,63.874952


In [155]:
title_svd = ['svd{}'.format(i) for i in range(len(df_svd.columns.values))]

In [156]:
df_svd.columns = title_svd

In [157]:
df_svd.head()

Unnamed: 0,svd0,svd1,svd2,svd3,svd4,svd5,svd6,svd7,svd8,svd9
0,478.171759,-6.121243,-49.395202,-28.386872,-27.988532,-21.659171,0.390376,-21.553569,10.94302,-10.129255
1,500.298748,50.554514,23.324182,-26.154088,-24.886434,-48.061191,-8.28048,-4.376489,-18.548796,16.828192
2,500.298748,50.554514,23.324182,-26.154088,-24.886434,-48.061191,-8.28048,-4.376489,-18.548796,16.828192
3,411.523282,56.775203,-2.839586,-25.902034,-1.418314,7.100727,8.636949,-16.215326,-7.254008,19.529928
4,546.049232,-14.310735,71.644483,-10.266536,-13.081652,-24.891459,16.516488,-1.8331,-27.614176,63.874952


# TfidfVectorizer to extract meanings

In [158]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

In [159]:
tfidf = TfidfVectorizer(stop_words = 'english', max_features = 100)
tfidf.fit(df_text.Text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [160]:
x = tfidf.transform(df_text.Text)

In [161]:
x = x.toarray()

In [162]:
df_tfid = pd.DataFrame(x)

In [163]:
df_tfid.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.130351,0.028371,0.006997,0.014239,0.056427,0.056008,0.00745,0.077699,0.0,0.0,...,0.057801,0.008138,0.029388,0.0,0.027645,0.034869,0.089366,0.0,0.021605,0.102527
1,0.106891,0.026002,0.032064,0.019575,0.019393,0.057747,0.020482,0.090633,0.007178,0.0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
2,0.106891,0.026002,0.032064,0.019575,0.019393,0.057747,0.020482,0.090633,0.007178,0.0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
3,0.15821,0.031155,0.053786,0.031273,0.077455,0.084567,0.016361,0.116351,0.0,0.0,...,0.047605,0.0,0.0,0.0,0.121429,0.022974,0.098136,0.084334,0.126536,0.040941
4,0.062059,0.004935,0.019475,0.004954,0.039263,0.014614,0.082937,0.172026,0.0,0.0,...,0.135741,0.0,0.025561,0.010624,0.105797,0.03882,0.066967,0.013359,0.100224,0.0


In [164]:
title_tfid = ['tfid{}'.format(i) for i in range(len(df_tfid.columns.values))]

In [165]:
df_tfid.columns = title_tfid

In [166]:
df_tfid.head()

Unnamed: 0,tfid0,tfid1,tfid2,tfid3,tfid4,tfid5,tfid6,tfid7,tfid8,tfid9,...,tfid90,tfid91,tfid92,tfid93,tfid94,tfid95,tfid96,tfid97,tfid98,tfid99
0,0.130351,0.028371,0.006997,0.014239,0.056427,0.056008,0.00745,0.077699,0.0,0.0,...,0.057801,0.008138,0.029388,0.0,0.027645,0.034869,0.089366,0.0,0.021605,0.102527
1,0.106891,0.026002,0.032064,0.019575,0.019393,0.057747,0.020482,0.090633,0.007178,0.0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
2,0.106891,0.026002,0.032064,0.019575,0.019393,0.057747,0.020482,0.090633,0.007178,0.0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
3,0.15821,0.031155,0.053786,0.031273,0.077455,0.084567,0.016361,0.116351,0.0,0.0,...,0.047605,0.0,0.0,0.0,0.121429,0.022974,0.098136,0.084334,0.126536,0.040941
4,0.062059,0.004935,0.019475,0.004954,0.039263,0.014614,0.082937,0.172026,0.0,0.0,...,0.135741,0.0,0.025561,0.010624,0.105797,0.03882,0.066967,0.013359,0.100224,0.0


In [167]:
del(x)

In [168]:
del(X)

In [169]:
del(X_array)

In [170]:
del(df_text)

# Since df_text_array is too large for further processing, I will select useful columns based on RandomForest feature importances

In [172]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics



In [173]:
df = df.reset_index(drop=True)

In [174]:
df_rf = df_text_array[df.Class>0]

In [175]:
df_rf.tail()

Unnamed: 0,vec0,vec1,vec2,vec3,vec4,vec5,vec6,vec7,vec8,vec9,...,vec169415,vec169416,vec169417,vec169418,vec169419,vec169420,vec169421,vec169422,vec169423,vec169424
3316,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3319,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3320,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [176]:
rf_y = df[df.Class>0].Class

In [177]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df_rf, rf_y, random_state=0)

In [178]:
del(df_rf)

In [179]:
del(rf_y)

In [180]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the token-count columns; its feature_importances_ are read in the
# following cells to rank those columns.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
# scikit-learn metrics take (y_true, y_pred) in that order.
metrics.accuracy_score(ytest, ypred)

0.6257521058965102

In [181]:
x = clf.feature_importances_

In [182]:
df_feature = pd.DataFrame(x)

In [183]:
df_feature.head()

Unnamed: 0,0
0,5.5e-05
1,0.000283
2,0.0
3,0.0
4,0.0


In [184]:
len(df_text_array.columns.values)

169425

In [185]:
len(df_feature)

169425

In [186]:
len(Xtrain.columns.values)

169425

In [187]:
df_feature['column_name']= Xtrain.columns.values

In [188]:
df_feature.head()

Unnamed: 0,0,column_name
0,5.5e-05,vec0
1,0.000283,vec1
2,0.0,vec2
3,0.0,vec3
4,0.0,vec4


In [189]:
df_feature.head()

Unnamed: 0,0,column_name
0,5.5e-05,vec0
1,0.000283,vec1
2,0.0,vec2
3,0.0,vec3
4,0.0,vec4


In [190]:
df_feature.head()

Unnamed: 0,0,column_name
0,5.5e-05,vec0
1,0.000283,vec1
2,0.0,vec2
3,0.0,vec3
4,0.0,vec4


In [191]:
df_feature = df_feature.sort_values(0, ascending = False)

In [192]:
important_columns1000 = df_feature.head(1000).column_name.values

In [193]:
important_columns2000 = df_feature.head(2000).column_name.values

In [194]:
important_columns10000 = df_feature.head(10000).column_name.values

In [195]:
df_selected = df_text_array[important_columns1000]

In [196]:
df_selected.head()

Unnamed: 0,vec23674,vec158919,vec23682,vec104161,vec91616,vec23657,vec86309,vec148582,vec121770,vec68314,...,vec2642,vec135246,vec1810,vec44973,vec23395,vec125253,vec55836,vec92152,vec40372,vec29385
0,2,0,1,1,20,1,7,1,15,2,...,0,1,0,0,1,0,0,1,1,0
1,2,7,3,2,7,1,0,2,0,5,...,0,0,0,1,0,1,0,0,0,2
2,2,7,3,2,7,1,0,2,0,5,...,0,0,0,1,0,1,0,0,0,2
3,3,22,2,9,14,2,0,0,1,0,...,0,1,0,1,0,5,1,0,0,0
4,0,6,16,8,3,4,0,1,4,9,...,0,1,0,0,0,1,0,0,0,2


#  merge five dataframes into one

In [197]:
df = df.reset_index(drop=True)

In [198]:
df.tail()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share
8984,1262,5379,-1,76385,11114,0,1
8985,486,3684,-1,27014,4134,0,1
8986,576,5892,-1,23101,3427,0,1
8987,314,2785,-1,67269,10635,0,1
8988,344,7542,-1,27021,4079,0,1


In [199]:
# Column-wise concat of the engineered feature frames. df was re-indexed with
# reset_index(drop=True) just above and the other frames were built from arrays,
# so rows are matched on a plain 0..n-1 index.
# NOTE(review): df_svd computed earlier is not part of this list — confirm that is intended.
df_merge = pd.concat([df, df_selected,df_pca, df_tfid, df_keytext_array], axis=1)

In [200]:
df_merge.head()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share,vec23674,vec158919,vec23682,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
0,447,7654,1,39672,6105,1,1,2,0,1,...,0,0,0,0,0,0,0,0,0,0
1,216,8255,2,36691,5783,1,1,2,7,3,...,0,0,0,0,0,0,0,0,0,0
2,216,5191,2,36691,5783,1,1,2,7,3,...,0,0,0,0,0,0,0,0,0,0
3,216,4572,3,36238,5625,1,1,3,22,2,...,0,0,0,0,0,0,0,0,0,0
4,216,3958,4,41308,6248,1,1,0,6,16,...,0,0,0,0,0,0,0,0,0,0


In [201]:
del(df_keytext_array)

In [202]:
del(df_pca)

In [203]:
del(df)

In [204]:
del(df_text_array)

In [205]:
del(df_tfid)

# split train and test again

In [206]:
train = df_merge[df_merge.Class >0]

In [207]:
test = df_merge[df_merge.Class == -1]

In [208]:
train.head()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share,vec23674,vec158919,vec23682,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
0,447,7654,1,39672,6105,1,1,2,0,1,...,0,0,0,0,0,0,0,0,0,0
1,216,8255,2,36691,5783,1,1,2,7,3,...,0,0,0,0,0,0,0,0,0,0
2,216,5191,2,36691,5783,1,1,2,7,3,...,0,0,0,0,0,0,0,0,0,0
3,216,4572,3,36238,5625,1,1,3,22,2,...,0,0,0,0,0,0,0,0,0,0
4,216,3958,4,41308,6248,1,1,0,6,16,...,0,0,0,0,0,0,0,0,0,0


In [209]:
train.tail()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share,vec23674,vec158919,vec23682,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
3316,1155,960,4,73895,11112,0,1,1,0,11,...,0,0,0,0,0,0,0,0,0,0
3317,1155,56,1,40127,6118,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3318,1155,2076,1,36200,5119,1,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
3319,1155,6606,4,32520,4913,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3320,1155,3608,4,67136,9846,1,1,0,1,2,...,0,0,0,0,0,0,0,0,0,0


In [210]:
test.head()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share,vec23674,vec158919,vec23682,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
3321,28,6404,-1,49829,7495,0,1,1,11,1,...,0,0,0,0,0,0,0,0,0,0
3322,852,5005,-1,31326,4762,0,1,1,2,3,...,0,0,0,0,0,0,0,0,0,0
3323,950,3915,-1,75282,11191,0,1,5,5,28,...,0,0,0,0,0,0,0,0,0,0
3324,657,85,-1,53996,8439,0,1,1,18,18,...,0,0,0,0,0,0,0,0,0,0
3325,1376,2780,-1,76967,11226,0,1,3,3,5,...,0,0,0,0,0,0,0,0,0,0


In [211]:
test.tail()

Unnamed: 0,Gene,Variation,Class,text_char_len,text_words,Gene_Share,Variation_Share,vec23674,vec158919,vec23682,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
8984,1262,5379,-1,76385,11114,0,1,3,1,35,...,0,0,0,0,0,0,0,0,0,0
8985,486,3684,-1,27014,4134,0,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
8986,576,5892,-1,23101,3427,0,1,0,6,3,...,0,0,0,0,0,0,0,0,0,0
8987,314,2785,-1,67269,10635,0,1,0,0,13,...,0,0,0,0,0,0,0,0,0,0
8988,344,7542,-1,27021,4079,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [212]:
del(df_merge)

#  split the train dataset for internal evaluation

In [213]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [214]:
y_train = train.pop('Class')

In [215]:
x_train = train

In [216]:
y_test = test.pop('Class')

In [217]:
x_test = test

In [218]:
x_train.head()

Unnamed: 0,Gene,Variation,text_char_len,text_words,Gene_Share,Variation_Share,vec23674,vec158919,vec23682,vec104161,...,keyvec113403,keyvec113404,keyvec113405,keyvec113406,keyvec113407,keyvec113408,keyvec113409,keyvec113410,keyvec113411,keyvec113412
0,447,7654,39672,6105,1,1,2,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,216,8255,36691,5783,1,1,2,7,3,2,...,0,0,0,0,0,0,0,0,0,0
2,216,5191,36691,5783,1,1,2,7,3,2,...,0,0,0,0,0,0,0,0,0,0
3,216,4572,36238,5625,1,1,3,22,2,9,...,0,0,0,0,0,0,0,0,0,0
4,216,3958,41308,6248,1,1,0,6,16,8,...,0,0,0,0,0,0,0,0,0,0


In [219]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x_train, y_train, random_state=0)

#  Use Random Forest for first quick prediction

In [220]:
from sklearn.ensemble import RandomForestClassifier

# Hold out part of the labelled rows for an internal accuracy estimate; the split is
# seeded so the score is repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(x_train,y_train,random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
# scikit-learn metrics take (y_true, y_pred) in that order.
metrics.accuracy_score(ytest, ypred)

0.67749699157641396

# 0.67749699157641396 with the key_text

# 0.63658243080625754  with gene share and variation share

#  0.64861612515042122  with word count

0.63658243080625754 without ID

# score was 0.64981949458483756

##  the score was 0.6534296028880866

## 0.638 was the internal evaluation score without tf-idf.

In [221]:
proba = clf.predict_proba(x_test)

In [222]:
df = pd.DataFrame(proba)

In [223]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.08,0.18,0.03,0.05,0.02,0.05,0.56,0.01,0.02
1,0.18,0.08,0.02,0.3,0.02,0.03,0.33,0.01,0.03
2,0.11,0.12,0.0,0.09,0.03,0.03,0.62,0.0,0.0
3,0.09,0.13,0.03,0.18,0.03,0.05,0.47,0.0,0.02
4,0.13,0.11,0.03,0.13,0.01,0.03,0.53,0.02,0.01


In [224]:
class_name = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9']

In [225]:
df.columns = class_name

In [226]:
df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0.08,0.18,0.03,0.05,0.02,0.05,0.56,0.01,0.02
1,0.18,0.08,0.02,0.3,0.02,0.03,0.33,0.01,0.03
2,0.11,0.12,0.0,0.09,0.03,0.03,0.62,0.0,0.0
3,0.09,0.13,0.03,0.18,0.03,0.05,0.47,0.0,0.02
4,0.13,0.11,0.03,0.13,0.01,0.03,0.53,0.02,0.01


In [227]:
df.to_csv('test1.csv')

In [228]:
# Read back the file written in the previous cell: the index saved by to_csv returns
# as an ordinary leading column (displayed as "Unnamed: 0" below).
df = pd.read_csv('test1.csv')

In [229]:
df.head()

Unnamed: 0.1,Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0.08,0.18,0.03,0.05,0.02,0.05,0.56,0.01,0.02
1,1,0.18,0.08,0.02,0.3,0.02,0.03,0.33,0.01,0.03
2,2,0.11,0.12,0.0,0.09,0.03,0.03,0.62,0.0,0.0
3,3,0.09,0.13,0.03,0.18,0.03,0.05,0.47,0.0,0.02
4,4,0.13,0.11,0.03,0.13,0.01,0.03,0.53,0.02,0.01


In [230]:
submission_columns = ['ID', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9']

In [231]:
df.columns = submission_columns

In [232]:
df.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0.08,0.18,0.03,0.05,0.02,0.05,0.56,0.01,0.02
1,1,0.18,0.08,0.02,0.3,0.02,0.03,0.33,0.01,0.03
2,2,0.11,0.12,0.0,0.09,0.03,0.03,0.62,0.0,0.0
3,3,0.09,0.13,0.03,0.18,0.03,0.05,0.47,0.0,0.02
4,4,0.13,0.11,0.03,0.13,0.01,0.03,0.53,0.02,0.01


In [233]:
df.to_csv('with_key_RF_ver5.csv', index = False)

# 0.66194 at Kaggle