In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

#  Load data

In [2]:
df_train = pd.read_csv('training_variants')

In [3]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [4]:
df_test = pd.read_csv('test_variants')

In [5]:
df_test.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [6]:
df_train_text = pd.read_csv('training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [7]:
df_train_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [8]:
df_test_text = pd.read_csv('test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [9]:
df_test_text.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [10]:
df_submission = pd.read_csv('submissionFile')

In [11]:
df_submission.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,0,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0
3,3,0,0,0,0,0,0,0,1,0
4,4,0,0,0,1,0,0,0,0,0


#  Examine data shape

In [12]:
df_train.shape

(3321, 4)

In [13]:
df_test.shape

(5668, 3)

In [14]:
df_train_text.shape

(3321, 2)

In [15]:
df_test_text.shape

(5668, 2)

In [16]:
df_train_text.iloc[0].Text

"Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. Previous work has shown that CDK10 silencing increases ETS2 (v-ets erythroblastosis virus E26 oncogene homolog 2)-driven activation of the MAPK pathway, which confers tamoxifen resistance to breast cancer cells. The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10. Cyclin M silencing phenocopies CDK1

#  merging test and train data for processing

In [17]:
df_test["Class"] = -1

In [18]:
df = pd.concat([df_train, df_test])

In [19]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
5663,5663,SLC46A1,R113S,-1
5664,5664,FOXC1,L130F,-1
5665,5665,GSS,R267W,-1
5666,5666,CTSK,G79E,-1
5667,5667,DFNB59,T54I,-1


In [20]:
df.shape

(8989, 4)

In [21]:
df_text = pd.concat([df_train_text, df_test_text])

In [22]:
df_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [23]:
df_text.shape

(8989, 2)

In [24]:
df_text.tail()

Unnamed: 0,ID,Text
5663,5663,The realization in the late 1970s that RAS har...
5664,5664,Hemizygous deletions are common molecular abno...
5665,5665,All most R267W of has with to SMARTpool invest...
5666,5666,Abstract Blood samples from 125 unrelated fami...
5667,5667,"Loss of DNA mismatch repair (MMR) in humans, m..."


In [25]:
del(df_train)


In [26]:
del(df_test)

In [27]:
del(df_train_text)

In [28]:
del(df_test_text)

# chage categorical to numbers

In [29]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [30]:
le.fit(df.Gene)

LabelEncoder()

In [31]:
le.transform(df.Gene)

array([447, 216, 216, ..., 576, 314, 344])

In [32]:
df['Gene'] = le.transform(df.Gene)

In [33]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,447,Truncating Mutations,1
1,1,216,W802*,2
2,2,216,Q249E,2
3,3,216,N454D,3
4,4,216,L399V,4


In [34]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
5663,5663,1262,R113S,-1
5664,5664,486,L130F,-1
5665,5665,576,R267W,-1
5666,5666,314,G79E,-1
5667,5667,344,T54I,-1


In [35]:
le.fit(df.Variation)

LabelEncoder()

In [36]:
df['Variation'] = le.transform(df.Variation)

In [37]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,447,7654,1
1,1,216,8255,2
2,2,216,5191,2
3,3,216,4572,3
4,4,216,3958,4


# use CountVectorizer to simply convert text to vector

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
vectorizer = CountVectorizer()

In [40]:
vectorizer.fit(df_text.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [41]:
df_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [42]:
X = vectorizer.transform(df_text.Text)

In [43]:
X_array = X.toarray()

In [44]:
df_text_array = pd.DataFrame(X_array)

In [45]:
df_text_array.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TfidVectorizer to extract meanings

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

In [47]:
tfidf = TfidfVectorizer(stop_words = 'english', max_features = 100)
tfidf.fit(df_text.Text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [48]:
x = tfidf.transform(df_text.Text)

In [49]:
x = x.toarray()

In [50]:
df_tfid = pd.DataFrame(x)

In [51]:
df_tfid.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.130351,0.028371,0.006997,0.014239,0.056427,0.056008,0.00745,0.077699,0.0,0.0,...,0.057801,0.008138,0.029388,0.0,0.027645,0.034869,0.089366,0.0,0.021605,0.102527
1,0.106891,0.026002,0.032064,0.019575,0.019393,0.057747,0.020482,0.090633,0.007178,0.0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
2,0.106891,0.026002,0.032064,0.019575,0.019393,0.057747,0.020482,0.090633,0.007178,0.0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
3,0.15821,0.031155,0.053786,0.031273,0.077455,0.084567,0.016361,0.116351,0.0,0.0,...,0.047605,0.0,0.0,0.0,0.121429,0.022974,0.098136,0.084334,0.126536,0.040941
4,0.062059,0.004935,0.019475,0.004954,0.039263,0.014614,0.082937,0.172026,0.0,0.0,...,0.135741,0.0,0.025561,0.010624,0.105797,0.03882,0.066967,0.013359,0.100224,0.0


In [52]:
del(x)

In [53]:
del(X)

In [54]:
del(X_array)

In [55]:
del(df_text)

#  merge three dataframes into one

In [56]:
df = df.reset_index(drop=True)

In [57]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
8984,5663,1262,5379,-1
8985,5664,486,3684,-1
8986,5665,576,5892,-1
8987,5666,314,2785,-1
8988,5667,344,7542,-1


In [58]:
df_text_array.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
8984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8985,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8987,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
df_merge = pd.concat([df, df_text_array, df_tfid], axis=1)

In [60]:
df_merge.head()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
0,0,447,7654,1,0,14,0,0,0,0,...,0.057801,0.008138,0.029388,0.0,0.027645,0.034869,0.089366,0.0,0.021605,0.102527
1,1,216,8255,2,0,4,0,0,0,0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
2,2,216,5191,2,0,4,0,0,0,0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
3,3,216,4572,3,0,0,0,0,0,0,...,0.047605,0.0,0.0,0.0,0.121429,0.022974,0.098136,0.084334,0.126536,0.040941
4,4,216,3958,4,0,3,0,0,0,0,...,0.135741,0.0,0.025561,0.010624,0.105797,0.03882,0.066967,0.013359,0.100224,0.0


In [61]:
del(df)

In [62]:
del(df_text_array)

In [63]:
del(df_tfid)

# split train and test again

In [64]:
train = df_merge[df_merge.Class >0]

In [65]:
test = df_merge[df_merge.Class == -1]

In [66]:
train.head()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
0,0,447,7654,1,0,14,0,0,0,0,...,0.057801,0.008138,0.029388,0.0,0.027645,0.034869,0.089366,0.0,0.021605,0.102527
1,1,216,8255,2,0,4,0,0,0,0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
2,2,216,5191,2,0,4,0,0,0,0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
3,3,216,4572,3,0,0,0,0,0,0,...,0.047605,0.0,0.0,0.0,0.121429,0.022974,0.098136,0.084334,0.126536,0.040941
4,4,216,3958,4,0,3,0,0,0,0,...,0.135741,0.0,0.025561,0.010624,0.105797,0.03882,0.066967,0.013359,0.100224,0.0


In [67]:
train.tail()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
3316,3316,1155,960,4,0,0,0,0,0,0,...,0.069656,0.0,0.0,0.003345,0.090857,0.042784,0.129535,0.0,0.066275,0.110279
3317,3317,1155,56,1,0,0,0,0,0,0,...,0.055855,0.0,0.0,0.0,0.011449,0.034658,0.075919,0.0,0.003977,0.138963
3318,3318,1155,2076,1,0,0,0,0,0,0,...,0.110564,0.0,0.0,0.0,0.056405,0.0,0.049092,0.019587,0.044083,0.0
3319,3319,1155,6606,4,0,1,0,0,0,0,...,0.251028,0.008316,0.015016,0.007801,0.09181,0.014253,0.0,0.0,0.036797,0.009524
3320,3320,1155,3608,4,0,1,0,0,0,0,...,0.151879,0.005031,0.036339,0.00472,0.162371,0.047428,0.046752,0.0,0.120221,0.011525


In [68]:
test.head()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
3321,0,28,6404,-1,0,3,0,0,0,0,...,0.039927,0.026982,0.073081,0.092814,0.04583,0.042391,0.053184,0.005305,0.035818,0.005151
3322,1,852,5005,-1,0,1,0,0,0,0,...,0.0,0.02535,0.228865,0.0,0.021529,0.11948,0.203433,0.0,0.022434,0.0
3323,2,950,3915,-1,0,0,0,0,0,0,...,0.045918,0.051719,0.123099,0.141145,0.027951,0.040291,0.051632,0.072101,0.012483,0.0
3324,3,657,85,-1,0,1,0,0,0,0,...,0.04015,0.125616,0.1497,0.042424,0.102413,0.034447,0.072156,0.005927,0.02668,0.0
3325,4,1376,2780,-1,0,9,0,0,0,0,...,0.010303,0.088971,0.094295,0.076209,0.039422,0.056354,0.08496,0.063885,0.037657,0.004431


In [69]:
test.tail()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
8984,5663,1262,5379,-1,0,0,0,0,0,0,...,0.087895,0.018333,0.00331,0.0,0.034253,0.01571,0.086724,0.012976,0.012979,0.004199
8985,5664,486,3684,-1,1,0,0,0,0,0,...,0.053995,0.060816,0.076868,0.011411,0.072308,0.020846,0.010275,0.028696,0.075349,0.0
8986,5665,576,5892,-1,0,0,0,0,0,0,...,0.027157,0.203919,0.266949,0.239128,0.0,0.008737,0.0,0.0,0.0,0.0
8987,5666,314,2785,-1,0,2,0,0,0,0,...,0.037721,0.026554,0.091099,0.144483,0.049612,0.059163,0.134585,0.006265,0.0047,0.006083
8988,5667,344,7542,-1,0,0,0,0,0,0,...,0.156993,0.023577,0.0,0.011059,0.110125,0.090918,0.159329,0.431086,0.114757,0.040505


In [70]:
del(df_merge)

#  split the train dataset for internal evaluation

In [71]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics



In [72]:
y_train = train.pop('Class')

In [73]:
x_train = train

In [74]:
y_test = test.pop('Class')

In [75]:
x_test = test

In [76]:
x_train.head()

Unnamed: 0,ID,Gene,Variation,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,0,447,7654,0,14,0,0,0,0,0,...,0.057801,0.008138,0.029388,0.0,0.027645,0.034869,0.089366,0.0,0.021605,0.102527
1,1,216,8255,0,4,0,0,0,0,0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
2,2,216,5191,0,4,0,0,0,0,0,...,0.052975,0.007458,0.127938,0.020991,0.057007,0.031957,0.151208,0.0,0.046203,0.051254
3,3,216,4572,0,0,0,0,0,0,0,...,0.047605,0.0,0.0,0.0,0.121429,0.022974,0.098136,0.084334,0.126536,0.040941
4,4,216,3958,0,3,0,0,0,0,0,...,0.135741,0.0,0.025561,0.010624,0.105797,0.03882,0.066967,0.013359,0.100224,0.0


In [77]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x_train, y_train, random_state=0)

#  Use Random Forest for first quick prediction

In [78]:
from sklearn.ensemble import RandomForestClassifier

Xtrain, Xtest, ytrain, ytest = train_test_split(x_train,y_train,random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
metrics.accuracy_score(ypred, ytest)

0.6534296028880866

##  the score was 0.6534296028880866

## 0.638 was the internal evaluatin score without tdif.

#  Make a prediction to submit to Kaggle

In [79]:
prediction = clf.predict_proba(x_test)

In [80]:
df_prediction  = pd.DataFrame(prediction)

In [81]:
df_prediction.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.04,0.21,0.01,0.04,0.0,0.02,0.67,0.0,0.01
1,0.19,0.1,0.0,0.35,0.0,0.03,0.33,0.0,0.0
2,0.12,0.2,0.01,0.06,0.03,0.04,0.53,0.01,0.0
3,0.16,0.15,0.03,0.1,0.03,0.01,0.51,0.0,0.01
4,0.17,0.09,0.02,0.25,0.02,0.03,0.42,0.0,0.0


In [82]:
df_submission.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,0,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0
3,3,0,0,0,0,0,0,0,1,0
4,4,0,0,0,1,0,0,0,0,0


In [83]:
df_submission.columns.values

array(['ID', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9'], dtype=object)

In [84]:
class_name = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9']

In [85]:
df_prediction.columns = class_name

In [86]:
df_prediction.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0.04,0.21,0.01,0.04,0.0,0.02,0.67,0.0,0.01
1,0.19,0.1,0.0,0.35,0.0,0.03,0.33,0.0,0.0
2,0.12,0.2,0.01,0.06,0.03,0.04,0.53,0.01,0.0
3,0.16,0.15,0.03,0.1,0.03,0.01,0.51,0.0,0.01
4,0.17,0.09,0.02,0.25,0.02,0.03,0.42,0.0,0.0


In [87]:
df_prediction.to_csv('predict1.csv')

In [88]:
df_prediction2 = pd.read_csv('predict1.csv')

In [89]:
df_prediction2.head()

Unnamed: 0.1,Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0.04,0.21,0.01,0.04,0.0,0.02,0.67,0.0,0.01
1,1,0.19,0.1,0.0,0.35,0.0,0.03,0.33,0.0,0.0
2,2,0.12,0.2,0.01,0.06,0.03,0.04,0.53,0.01,0.0
3,3,0.16,0.15,0.03,0.1,0.03,0.01,0.51,0.0,0.01
4,4,0.17,0.09,0.02,0.25,0.02,0.03,0.42,0.0,0.0


In [90]:
df_prediction2.columns

Index(['Unnamed: 0', 'class1', 'class2', 'class3', 'class4', 'class5',
       'class6', 'class7', 'class8', 'class9'],
      dtype='object')

In [91]:
submission_columns = ['ID', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9']

In [92]:
df_prediction2.columns = submission_columns

In [93]:
df_prediction2.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0.04,0.21,0.01,0.04,0.0,0.02,0.67,0.0,0.01
1,1,0.19,0.1,0.0,0.35,0.0,0.03,0.33,0.0,0.0
2,2,0.12,0.2,0.01,0.06,0.03,0.04,0.53,0.01,0.0
3,3,0.16,0.15,0.03,0.1,0.03,0.01,0.51,0.0,0.01
4,4,0.17,0.09,0.02,0.25,0.02,0.03,0.42,0.0,0.0


In [94]:
df_prediction2.to_csv('first_submission_rf2.csv', index = False)

## Kaggle submission this file.