In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

#  Load data

In [2]:
df_train = pd.read_csv('training_variants')

In [3]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [4]:
df_test = pd.read_csv('test_variants')

In [5]:
df_test.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [6]:
df_train_text = pd.read_csv('training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [7]:
df_train_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [8]:
df_test_text = pd.read_csv('test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [9]:
df_test_text.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [10]:
df_submission = pd.read_csv('submissionFile')

In [11]:
df_submission.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,0,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0
3,3,0,0,0,0,0,0,0,1,0
4,4,0,0,0,1,0,0,0,0,0


#  Examine data shape

In [12]:
df_train.shape

(3321, 4)

In [13]:
df_test.shape

(5668, 3)

In [14]:
df_train.groupby('Class').describe()

Unnamed: 0_level_0,ID,ID,ID,ID,ID,ID,ID,ID
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,568.0,1431.56162,950.721629,0.0,542.75,1265.0,2280.25,3318.0
2,452.0,1644.789823,1055.856113,1.0,840.75,1442.5,2745.5,3290.0
3,89.0,1645.191011,772.938547,3.0,975.0,1650.0,2224.0,3255.0
4,686.0,1440.048105,891.1061,4.0,622.5,1575.5,2209.75,3320.0
5,242.0,2026.157025,817.487914,6.0,1412.25,2467.5,2637.5,3289.0
6,275.0,2163.410909,831.190933,20.0,1434.0,2484.0,2817.5,3314.0
7,953.0,1755.01469,948.957122,28.0,1059.0,1670.0,2708.0,3310.0
8,19.0,1554.684211,973.6362,121.0,529.0,1768.0,2276.0,3114.0
9,37.0,936.675676,790.795277,122.0,131.0,1083.0,1751.0,3061.0


In [15]:
df_train.groupby('Gene').describe()

Unnamed: 0_level_0,Class,Class,Class,Class,Class,Class,Class,Class,ID,ID,ID,ID,ID,ID,ID,ID
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ABL1,26.0,2.384615,1.358732,2.0,2.00,2.0,2.00,7.0,26.0,849.5,7.648529,837.0,843.25,849.5,855.75,862.0
ACVR1,3.0,5.333333,2.886751,2.0,4.50,7.0,7.00,7.0,3.0,1069.0,1.000000,1068.0,1068.50,1069.0,1069.50,1070.0
AGO2,5.0,1.200000,0.447214,1.0,1.00,1.0,1.00,2.0,5.0,2087.0,1.581139,2085.0,2086.00,2087.0,2088.00,2089.0
AKT1,28.0,5.892857,1.950105,2.0,5.00,7.0,7.00,8.0,28.0,1353.5,8.225975,1340.0,1346.75,1353.5,1360.25,1367.0
AKT2,11.0,6.909091,1.814086,2.0,7.00,7.0,7.00,9.0,11.0,1373.0,3.316625,1368.0,1370.50,1373.0,1375.50,1378.0
AKT3,4.0,5.750000,2.500000,2.0,5.75,7.0,7.00,7.0,4.0,3180.5,1.290994,3179.0,3179.75,3180.5,3181.25,3182.0
ALK,69.0,5.768116,1.918524,2.0,5.00,7.0,7.00,7.0,69.0,1540.0,20.062403,1506.0,1523.00,1540.0,1557.00,1574.0
APC,5.0,2.200000,1.643168,1.0,1.00,1.0,4.00,4.0,5.0,1725.0,1.581139,1723.0,1724.00,1725.0,1726.00,1727.0
AR,20.0,5.650000,2.007224,1.0,5.00,6.5,7.00,7.0,20.0,1791.5,5.916080,1782.0,1786.75,1791.5,1796.25,1801.0
ARAF,7.0,6.285714,1.889822,2.0,7.00,7.0,7.00,7.0,7.0,1805.0,2.160247,1802.0,1803.50,1805.0,1806.50,1808.0


In [16]:
df_train_text.shape

(3321, 2)

In [17]:
df_test_text.shape

(5668, 2)

In [18]:
df_train_text.iloc[0].Text

"Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. Previous work has shown that CDK10 silencing increases ETS2 (v-ets erythroblastosis virus E26 oncogene homolog 2)-driven activation of the MAPK pathway, which confers tamoxifen resistance to breast cancer cells. The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10. Cyclin M silencing phenocopies CDK1

#  merging test and train data for processing

In [19]:
df_test["Class"] = -1

In [20]:
df = pd.concat([df_train, df_test])

In [21]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
5663,5663,SLC46A1,R113S,-1
5664,5664,FOXC1,L130F,-1
5665,5665,GSS,R267W,-1
5666,5666,CTSK,G79E,-1
5667,5667,DFNB59,T54I,-1


In [22]:
df.shape

(8989, 4)

In [23]:
df_text = pd.concat([df_train_text, df_test_text])

In [24]:
df_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [25]:
df_text.shape

(8989, 2)

In [26]:
df_text.tail()

Unnamed: 0,ID,Text
5663,5663,The realization in the late 1970s that RAS har...
5664,5664,Hemizygous deletions are common molecular abno...
5665,5665,All most R267W of has with to SMARTpool invest...
5666,5666,Abstract Blood samples from 125 unrelated fami...
5667,5667,"Loss of DNA mismatch repair (MMR) in humans, m..."


In [27]:
del(df_train)


In [28]:
del(df_test)

In [29]:
del(df_train_text)

In [30]:
del(df_test_text)

# chage categorical to numbers

In [31]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [32]:
le.fit(df.Gene)

LabelEncoder()

In [33]:
le.transform(df.Gene)

array([447, 216, 216, ..., 576, 314, 344])

In [34]:
df['Gene'] = le.transform(df.Gene)

In [35]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,447,Truncating Mutations,1
1,1,216,W802*,2
2,2,216,Q249E,2
3,3,216,N454D,3
4,4,216,L399V,4


In [36]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
5663,5663,1262,R113S,-1
5664,5664,486,L130F,-1
5665,5665,576,R267W,-1
5666,5666,314,G79E,-1
5667,5667,344,T54I,-1


In [37]:
le.fit(df.Variation)

LabelEncoder()

In [38]:
df['Variation'] = le.transform(df.Variation)

In [39]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,447,7654,1
1,1,216,8255,2
2,2,216,5191,2
3,3,216,4572,3
4,4,216,3958,4


# use CountVectorizer to simply convert text to vector

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
vectorizer = CountVectorizer()

In [42]:
vectorizer.fit(df_text.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [43]:
df_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [44]:
X = vectorizer.transform(df_text.Text)

In [45]:
X_array = X.toarray()

In [46]:
df_text_array = pd.DataFrame(X_array)

In [47]:
df_text_array.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
del(X)

In [49]:
del(X_array)

In [50]:
del(df_text)

#  merge two dataframes into one

In [51]:
df = df.reset_index(drop=True)

In [52]:
df.tail()

Unnamed: 0,ID,Gene,Variation,Class
8984,5663,1262,5379,-1
8985,5664,486,3684,-1
8986,5665,576,5892,-1
8987,5666,314,2785,-1
8988,5667,344,7542,-1


In [53]:
df_text_array = df_text_array.reset_index(drop=True)

In [54]:
 result = pd.concat([df, df_text_array], axis=1)

In [55]:
result.head()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,447,7654,1,0,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,216,8255,2,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,216,5191,2,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,216,4572,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,216,3958,4,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
del(df)

In [57]:
del(df_text_array)

# split train and test again

In [58]:
train = result[result.Class >0]

In [59]:
test = result[result.Class == -1]

In [60]:
train.head()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,447,7654,1,0,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,216,8255,2,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,216,5191,2,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,216,4572,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,216,3958,4,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
train.tail()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
3316,3316,1155,960,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3317,3317,1155,56,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3318,3318,1155,2076,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3319,3319,1155,6606,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3320,3320,1155,3608,4,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
test.head()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
3321,0,28,6404,-1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3322,1,852,5005,-1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3323,2,950,3915,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3324,3,657,85,-1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3325,4,1376,2780,-1,0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
test.tail()

Unnamed: 0,ID,Gene,Variation,Class,0,1,2,3,4,5,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
8984,5663,1262,5379,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8985,5664,486,3684,-1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8986,5665,576,5892,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8987,5666,314,2785,-1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8988,5667,344,7542,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
del(result)

#  split the train dataset for internal evaluation

In [65]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics



In [66]:
y_train = train.pop('Class')

In [67]:
x_train = train

In [71]:
y_test = test.pop('Class')

In [72]:
x_test = test

In [68]:
x_train.head()

Unnamed: 0,ID,Gene,Variation,0,1,2,3,4,5,6,...,169415,169416,169417,169418,169419,169420,169421,169422,169423,169424
0,0,447,7654,0,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,216,8255,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,216,5191,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,216,4572,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,216,3958,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x_train, y_train, random_state=0)

#  Use Random Forest for first quick prediction

In [70]:
from sklearn.ensemble import RandomForestClassifier

Xtrain, Xtest, ytrain, ytest = train_test_split(x_train,y_train,random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
metrics.accuracy_score(ypred, ytest)

0.63898916967509023

## 0.638 is the internal evaluatin score.

#  Make a prediction to submit to Kaggle

In [73]:
prediction = clf.predict_proba(x_test)

In [94]:
df_prediction  = pd.DataFrame(prediction)

In [95]:
df_prediction.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.05,0.18,0.0,0.05,0.02,0.01,0.67,0.01,0.01
1,0.17,0.14,0.0,0.38,0.03,0.04,0.23,0.01,0.0
2,0.17,0.19,0.0,0.05,0.02,0.03,0.52,0.02,0.0
3,0.1,0.16,0.0,0.15,0.03,0.03,0.51,0.0,0.02
4,0.13,0.12,0.01,0.26,0.03,0.01,0.41,0.02,0.01


In [96]:
df_submission.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,0,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,1,0,0,0
3,3,0,0,0,0,0,0,0,1,0
4,4,0,0,0,1,0,0,0,0,0


In [97]:
df_submission.columns.values

array(['ID', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9'], dtype=object)

In [98]:
class_name = ['class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9']

In [99]:
df_prediction.columns = class_name

In [100]:
df_prediction.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0.05,0.18,0.0,0.05,0.02,0.01,0.67,0.01,0.01
1,0.17,0.14,0.0,0.38,0.03,0.04,0.23,0.01,0.0
2,0.17,0.19,0.0,0.05,0.02,0.03,0.52,0.02,0.0
3,0.1,0.16,0.0,0.15,0.03,0.03,0.51,0.0,0.02
4,0.13,0.12,0.01,0.26,0.03,0.01,0.41,0.02,0.01


In [101]:
df_prediction.to_csv('predict1.csv')

In [102]:
df_prediction2 = pd.read_csv('predict1.csv')

In [103]:
df_prediction2.head()

Unnamed: 0.1,Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0.05,0.18,0.0,0.05,0.02,0.01,0.67,0.01,0.01
1,1,0.17,0.14,0.0,0.38,0.03,0.04,0.23,0.01,0.0
2,2,0.17,0.19,0.0,0.05,0.02,0.03,0.52,0.02,0.0
3,3,0.1,0.16,0.0,0.15,0.03,0.03,0.51,0.0,0.02
4,4,0.13,0.12,0.01,0.26,0.03,0.01,0.41,0.02,0.01


In [104]:
df_prediction2.columns

Index(['Unnamed: 0', 'class1', 'class2', 'class3', 'class4', 'class5',
       'class6', 'class7', 'class8', 'class9'],
      dtype='object')

In [105]:
submission_columns = ['ID', 'class1', 'class2', 'class3', 'class4', 'class5', 'class6',
       'class7', 'class8', 'class9']

In [106]:
df_prediction2.columns = submission_columns

In [108]:
df_prediction2.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0.05,0.18,0.0,0.05,0.02,0.01,0.67,0.01,0.01
1,1,0.17,0.14,0.0,0.38,0.03,0.04,0.23,0.01,0.0
2,2,0.17,0.19,0.0,0.05,0.02,0.03,0.52,0.02,0.0
3,3,0.1,0.16,0.0,0.15,0.03,0.03,0.51,0.0,0.02
4,4,0.13,0.12,0.01,0.26,0.03,0.01,0.41,0.02,0.01


In [109]:
df_prediction2.to_csv('first_submission_rf1.csv', index = False)

## Kaggle submission this file.

In [90]:
test.ID

3321       0
3322       1
3323       2
3324       3
3325       4
3326       5
3327       6
3328       7
3329       8
3330       9
3331      10
3332      11
3333      12
3334      13
3335      14
3336      15
3337      16
3338      17
3339      18
3340      19
3341      20
3342      21
3343      22
3344      23
3345      24
3346      25
3347      26
3348      27
3349      28
3350      29
        ... 
8959    5638
8960    5639
8961    5640
8962    5641
8963    5642
8964    5643
8965    5644
8966    5645
8967    5646
8968    5647
8969    5648
8970    5649
8971    5650
8972    5651
8973    5652
8974    5653
8975    5654
8976    5655
8977    5656
8978    5657
8979    5658
8980    5659
8981    5660
8982    5661
8983    5662
8984    5663
8985    5664
8986    5665
8987    5666
8988    5667
Name: ID, Length: 5668, dtype: int64

In [87]:
df_prediction['ID'] = test.ID

In [88]:
df_prediction.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.05,0.18,0.0,0.05,0.02,0.01,0.67,0.01,0.01,
1,0.17,0.14,0.0,0.38,0.03,0.04,0.23,0.01,0.0,
2,0.17,0.19,0.0,0.05,0.02,0.03,0.52,0.02,0.0,
3,0.1,0.16,0.0,0.15,0.03,0.03,0.51,0.0,0.02,
4,0.13,0.12,0.01,0.26,0.03,0.01,0.41,0.02,0.01,


In [89]:
df_prediction[df_submission.columns.values]

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,,0.05,0.18,0.00,0.05,0.02,0.01,0.67,0.01,0.01
1,,0.17,0.14,0.00,0.38,0.03,0.04,0.23,0.01,0.00
2,,0.17,0.19,0.00,0.05,0.02,0.03,0.52,0.02,0.00
3,,0.10,0.16,0.00,0.15,0.03,0.03,0.51,0.00,0.02
4,,0.13,0.12,0.01,0.26,0.03,0.01,0.41,0.02,0.01
5,,0.13,0.10,0.00,0.57,0.04,0.04,0.10,0.01,0.01
6,,0.11,0.14,0.04,0.22,0.05,0.04,0.38,0.00,0.02
7,,0.14,0.09,0.00,0.34,0.17,0.05,0.20,0.01,0.00
8,,0.14,0.25,0.00,0.21,0.04,0.08,0.27,0.00,0.01
9,,0.12,0.23,0.00,0.11,0.03,0.04,0.45,0.01,0.01


In [74]:
x = clf.feature_importances_

In [75]:
df = pd.DataFrame(x)

In [77]:
df.head()

Unnamed: 0,0
0,0.058251
1,0.005005
2,0.057148
3,0.000103
4,0.000214


In [78]:
df.columns = ['a']

In [80]:
df.head()

Unnamed: 0,a
0,0.058251
1,0.005005
2,0.057148
3,0.000103
4,0.000214


In [81]:
df.describe()

Unnamed: 0,a
count,169428.0
mean,6e-06
std,0.000202
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.058251


In [99]:
df[df.a >0.0004].index.values

array([     0,      1,      2,   1590,   6087,   7149,   8011,  12216,
        20557,  20560,  23660,  23677,  23685,  23757,  23779,  24392,
        24875,  25691,  26362,  26651,  27353,  27531,  28386,  29384,
        30276,  30910,  31189,  32675,  33604,  33738,  33943,  34203,
        34405,  34581,  34588,  35858,  36395,  37167,  39619,  40364,
        40373,  40622,  42423,  42486,  43526,  43608,  44909,  44916,
        45061,  46341,  46451,  46692,  47039,  47120,  47122,  47195,
        47324,  47593,  47928,  52174,  52387,  53084,  54111,  54343,
        54396,  54397,  56295,  56601,  56657,  57252,  57266,  59778,
        59931,  61284,  62156,  62157,  63690,  63701,  63838,  64319,
        64372,  64993,  65010,  65270,  66051,  66170,  66649,  67296,
        67589,  68038,  68274,  68317,  68342,  68367,  72205,  72366,
        72450,  75833,  78462,  79542,  81384,  83746,  85533,  85764,
        85845,  85849,  85947,  86280,  86285,  86312,  86332,  86536,
      

In [100]:
columns = df[df.a >0.0004].index.values

In [101]:
df_train = train[columns]

In [102]:
df_train.head()

Unnamed: 0,0,1,2,1590,6087,7149,8011,12216,20557,20560,...,162499,163450,164687,164761,164907,165092,165453,165928,167694,167711
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0


In [104]:
df_test = test[columns]

In [105]:
y_test = test.Class

# XGBoost

In [110]:
import time
import xgboost as xgb

In [106]:
random_state = 0

In [134]:
Xtrain, Xtest, ytrain, ytest = train_test_split(df_train, y_train, test_size=0.2, random_state=1)

In [135]:
dtrain = xgb.DMatrix(Xtrain.values, ytrain.values)
dvalid = xgb.DMatrix(Xtest.values, ytest.values)



In [136]:
dtest = xgb.DMatrix(df_test.values, y_test)
d_all_train = xgb.DMatrix(df_train.values, y_train)

In [137]:
num_boost_round = 1250

In [150]:
params = {}
# use softmax multi-class classification
param['objective'] = 'multi:softprob'
# scale weight of positive examples
param['eta'] = 0.01
param['max_depth'] = 9
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 9

In [151]:
num_round = 200

In [171]:
early_stopping_rounds = 60

In [172]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)



[0]	train-rmse:3.49643	eval-rmse:3.51574
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 60 rounds.
[1]	train-rmse:2.85528	eval-rmse:2.9166
[2]	train-rmse:2.47653	eval-rmse:2.56506
[3]	train-rmse:2.25646	eval-rmse:2.37587
[4]	train-rmse:2.11935	eval-rmse:2.28899
[5]	train-rmse:2.04201	eval-rmse:2.23212
[6]	train-rmse:1.98101	eval-rmse:2.20395
[7]	train-rmse:1.9433	eval-rmse:2.1885
[8]	train-rmse:1.93036	eval-rmse:2.18431
[9]	train-rmse:1.91832	eval-rmse:2.18081
[10]	train-rmse:1.90674	eval-rmse:2.17863
[11]	train-rmse:1.90152	eval-rmse:2.17766
[12]	train-rmse:1.89908	eval-rmse:2.17673
[13]	train-rmse:1.89678	eval-rmse:2.17534
[14]	train-rmse:1.89384	eval-rmse:2.17314
[15]	train-rmse:1.88499	eval-rmse:2.17538
[16]	train-rmse:1.88288	eval-rmse:2.17396
[17]	train-rmse:1.87836	eval-rmse:2.17179
[18]	train-rmse:1.87541	eval-rmse:2.17242
[19]	train-rmse:1.874	eval-rmse:2.17204
[20]	train-rmse:1.87095	eval-rms

In [173]:
predict_matrix = bst.predict( xg_test )

NameError: name 'bst' is not defined

In [None]:
gbm.

In [165]:
predict_matrix = gbm.predict_proba(dtest)

AttributeError: 'Booster' object has no attribute 'predict_proba'

In [162]:
predict_matrix.shape

(5668,)

In [163]:
df_test.shape

(5668, 238)

In [160]:
df = pd.DataFrame(predict_matrix)

In [161]:
df

Unnamed: 0,0
0,4.406569
1,3.651589
2,5.185692
3,4.219996
4,3.500709
5,3.211769
6,5.253383
7,4.613370
8,5.981335
9,3.459876


In [174]:
 clf = XGBoostClassifier(
        eval_metric = 'auc',
        num_class = 2,
        nthread = 4,
        eta = 0.1,
        num_boost_round = 80,
        max_depth = 12,
        subsample = 0.5,
        colsample_bytree = 1.0,
        silent = 1,
        )

NameError: name 'XGBoostClassifier' is not defined

In [None]:
181/2

In [None]:
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softprob'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 9

In [176]:
num_round = 200

In [None]:
xg_train = xgb.DMatrix( x_train, label=y_train)

In [None]:
bst = xgb.train(param, xg_train, num_round)

In [None]:
test_Y=np.repeat(0, len(x_test))
xg_test = xgb.DMatrix(x_test, label=test_Y)

In [None]:
predict_matrix = bst.predict( xg_test )

In [31]:
from sklearn.feature_extraction import DictVectorizer

In [32]:
vec = DictVectorizer()

In [34]:
vec.fit(df_train_text.Text)

AttributeError: 'str' object has no attribute 'items'

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
vectorizer = CountVectorizer()

In [37]:
vectorizer.fit(df_train_text.iloc[0].Text)

ValueError: Iterable over raw text documents expected, string object received.

In [38]:
s  = df_train_text.iloc[0].Text

In [40]:
type(s)

str

In [41]:
corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']

In [43]:
type(corpus[0])

str

In [47]:
vectorizer.fit(df_train_text.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [50]:
X = vectorizer.transform(df_train_text.Text)

In [54]:
X_array = X.toarray()

In [64]:
X_array.shape

(3321, 155732)

In [55]:
from sklearn.decomposition import PCA

In [56]:
pca = PCA(n_components=12)

In [57]:
pca.fit(X_array)

PCA(copy=True, iterated_power='auto', n_components=12, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [58]:
pca.transform(X_array)

array([[-327.38533677,  -10.40703505,  -59.73368035, ...,  -18.04991177,
          15.55202366,    6.92667225],
       [-304.49837688,   66.92031229,  -10.94140942, ...,  -15.11556823,
         -22.75145669,  -11.39619574],
       [-304.49837688,   66.92031229,  -10.94140942, ...,  -15.11556823,
         -22.75145669,  -11.39619574],
       ..., 
       [-355.92108808,  -42.0466479 ,   59.33470337, ...,  -71.57361113,
          13.46047484,    4.50701897],
       [-402.18267248,    5.51842793,    1.89231461, ...,    7.56319876,
         -10.86781253,    8.67933973],
       [  77.4538821 ,  -20.59990173,   44.265623  , ...,   19.04884885,
          -9.88357427,    0.86774867]])

In [59]:
y = pca.transform(X_array)

In [60]:
df = pd.DataFrame(y)

In [61]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-327.385337,-10.407035,-59.733680,-0.177220,-12.810966,-3.638846,-24.576224,11.295630,37.693224,-18.049912,15.552024,6.926672
1,-304.498377,66.920312,-10.941409,9.437255,-27.589438,14.138782,-37.779722,34.032083,15.281713,-15.115568,-22.751457,-11.396196
2,-304.498377,66.920312,-10.941409,9.437255,-27.589438,14.138782,-37.779722,34.032083,15.281713,-15.115568,-22.751457,-11.396196
3,-393.291760,62.711417,-35.439660,-8.393497,3.798922,-11.305377,5.765690,25.751629,9.485337,-11.877161,-1.351196,-2.270336
4,-260.505365,18.722803,50.174878,23.323408,-4.530400,21.075299,-19.956941,25.210526,-9.865957,36.995784,-20.720734,-4.158687
5,-260.505365,18.722803,50.174878,23.323408,-4.530400,21.075299,-19.956941,25.210526,-9.865957,36.995784,-20.720734,-4.158687
6,-260.505365,18.722803,50.174878,23.323408,-4.530400,21.075299,-19.956941,25.210526,-9.865957,36.995784,-20.720734,-4.158687
7,398.196223,155.234761,-93.657749,-64.576068,15.671675,-29.822627,-30.165369,78.588851,31.062789,-39.398288,-2.240496,-100.570615
8,265.329508,13.210008,-5.197030,18.762737,1.908598,21.197584,-43.053837,69.024657,50.751990,43.643038,-19.530215,-53.270841
9,-277.678178,17.467975,-73.654227,13.175373,10.100599,-7.397906,-13.942197,31.660563,67.349446,-15.091235,6.619014,-42.732768


In [66]:
df2 = pd.DataFrame(X_array)

In [67]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,155722,155723,155724,155725,155726,155727,155728,155729,155730,155731
0,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics



In [69]:
x_train = df2

In [70]:
y_train = df_train.Class

In [71]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x_train, y_train, random_state=0)

In [72]:
from sklearn.ensemble import RandomForestClassifier

Xtrain, Xtest, ytrain, ytest = train_test_split(x_train,y_train,random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
metrics.accuracy_score(ypred, ytest)

0.6305655836341757

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
Xtrain, Xtest, ytrain, ytest = train_test_split(x_train,y_train, random_state=0)
gbt = GradientBoostingClassifier(max_depth=30, n_estimators=50)
gbt.fit(Xtrain, ytrain)
ypred = gbt.predict(Xtest)
metrics.accuracy_score(ypred, ytest)

In [81]:
x = clf.feature_importances_

In [82]:
x

array([  8.53939958e-05,   2.60545946e-04,   0.00000000e+00, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00])

In [83]:
x_train.columns

RangeIndex(start=0, stop=155732, step=1)

In [84]:
max(x)

0.005177154977148689

In [86]:
np.mean(x)

6.4212878534918963e-06

In [88]:
df3 = pd.DataFrame(x)

In [90]:
df3.columns = ['a']

In [95]:
df3[df3.a > 0.001].index.values

array([ 21727,  21741,  21747,  33804,  41746,  57599,  59064,  63343,
        66647,  69960,  77202,  79675,  84623,  96176, 105869, 112467,
       119970, 124307, 134627, 136709, 141392, 144036, 146218, 151971,
       154113])

In [96]:
columns = df3[df3.a > 0.001].index.values

In [102]:
df4 = x_train[columns]

In [103]:
df4

Unnamed: 0,21727,21741,21747,33804,41746,57599,59064,63343,66647,69960,...,112467,119970,124307,134627,136709,141392,144036,146218,151971,154113
0,1,2,1,0,9,1,43,2,13,4,...,15,0,10,4,1,0,1,0,3,4
1,1,2,3,0,0,0,8,5,3,3,...,0,0,12,0,2,0,1,7,7,0
2,1,2,3,0,0,0,8,5,3,3,...,0,0,12,0,2,0,1,7,7,0
3,2,3,2,0,6,0,2,0,8,10,...,1,0,7,0,0,1,0,22,16,0
4,4,0,16,0,0,0,3,9,1,2,...,4,0,2,45,1,0,0,6,20,0
5,4,0,16,0,0,0,3,9,1,2,...,4,0,2,45,1,0,0,6,20,0
6,4,0,16,0,0,0,3,9,1,2,...,4,0,2,45,1,0,0,6,20,0
7,17,4,11,0,25,0,26,11,21,13,...,13,0,19,1,2,3,6,29,16,0
8,9,2,31,0,0,0,48,16,2,4,...,74,0,27,46,1,0,19,29,54,0
9,5,2,15,0,0,0,45,7,1,2,...,70,0,25,1,0,0,19,23,34,0


In [74]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [75]:
enc.fit(df_train.Gene)

ValueError: could not convert string to float: 'RUNX1'

In [76]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [77]:
le.fit(df_train.Gene)

LabelEncoder()

In [78]:
le.transform(df_train.Gene)

array([ 85,  39,  39, ..., 221, 221, 221])

In [79]:
df_train.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4
