In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import nltk
import string
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import scipy.sparse as sp
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import lightgbm as lgb
from string import punctuation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [7]:
for dirname, _, filenames in os.walk('/cancer-diagnosis'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [8]:
training_text = pd.read_csv("cancer-diagnosis/training_text",sep="\|\|", header=None, skiprows=1, names=["ID","Text"])

  """Entry point for launching an IPython kernel.


In [9]:
training_text.head(5)

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [10]:
training_text.shape

(3321, 2)

In [12]:
training_variants = pd.read_csv("cancer-diagnosis/training_variants")

In [13]:
training_variants.head(5)

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [14]:
#Merging variants and text on ID

In [15]:
training_merge = training_variants.merge(training_text,left_on="ID",right_on="ID")

In [16]:
training_merge.head(5)
#Now we have the data

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [17]:
training_merge.shape

(3321, 5)

In [58]:
test_text = pd.read_csv("cancertreatment/stage2_test_text.csv",sep="\|\|", header=None, skiprows=1, names=["ID","Text"])

  """Entry point for launching an IPython kernel.


In [59]:
test_text.shape

(986, 2)

In [60]:
test_variants = pd.read_csv("cancertreatment/stage2_test_variants.csv")

In [61]:
test_variants.shape

(986, 3)

In [62]:
test_merge = test_variants.merge(test_text,left_on="ID",right_on="ID")

In [63]:
# Let's understand the type of values present in each column of our dataframe 'train_merge' dataframe.
test_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 0 to 985
Data columns (total 4 columns):
ID           986 non-null int64
Gene         986 non-null object
Variation    986 non-null object
Text         986 non-null object
dtypes: int64(1), object(3)
memory usage: 38.5+ KB


In [21]:
training_merge.describe(include='all')

Unnamed: 0,ID,Gene,Variation,Class,Text
count,3321.0,3321,3321,3321.0,3321
unique,,264,2996,,1921
top,,BRCA1,Truncating Mutations,,The PTEN (phosphatase and tensin homolog) phos...
freq,,264,93,,53
mean,1660.0,,,4.365854,
std,958.834449,,,2.309781,
min,0.0,,,1.0,
25%,830.0,,,2.0,
50%,1660.0,,,4.0,
75%,2490.0,,,7.0,


In [22]:
training_merge.isnull().sum()

ID           0
Gene         0
Variation    0
Class        0
Text         0
dtype: int64

In [23]:
training_merge.columns

Index(['ID', 'Gene', 'Variation', 'Class', 'Text'], dtype='object')

In [24]:
training_merge["Text_num_words"] = training_merge["Text"].apply(lambda x: len(str(x).split()) )
training_merge["Text_num_chars"] = training_merge["Text"].apply(lambda x: len(str(x)) )

In [25]:
training_merge['Variation'].describe()

count                     3321
unique                    2996
top       Truncating Mutations
freq                        93
Name: Variation, dtype: object

In [26]:
training_merge['Gene'].describe()

count      3321
unique      264
top       BRCA1
freq        264
Name: Gene, dtype: object

In [27]:
#Tokenzing-splitting up a larger body of text into smaller lines, words or even creating words 

In [28]:
#imputing gene row value to null data of text rows as for all other columns, Gene values are present in Text data
training_merge['Text'] = training_merge.apply(lambda row: row['Gene'] if pd.isnull(row['Text']) else row['Text'],axis=1)

In [29]:
training_merge.isnull().sum()

ID                0
Gene              0
Variation         0
Class             0
Text              0
Text_num_words    0
Text_num_chars    0
dtype: int64

In [30]:
training_merge.drop(["Text_num_words","Text_num_chars"],axis=1,inplace=True)

In [31]:
stop_words = set(stopwords.words('english'))

In [32]:
def data_text_preprocess(total_text, ind, col):
    # Remove int values from text data as that might not be imp
    if type(total_text) is not int:
        string = ""
        # replacing all special char with space
        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replacing multiple spaces with single space
        total_text = re.sub('\s+',' ', str(total_text))
        # bring whole text to same lower-case scale.
        total_text = total_text.lower()
        
        for word in total_text.split():
        # if the word is a not a stop word then retain that word from text
            if not word in stop_words:
                string += word + " "
        
        training_merge[col][ind] = string

In [39]:
for index, row in training_merge.iterrows():
    if type(row['Text']) is str:
        data_text_preprocess(row['Text'], index, 'Text')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [41]:
training_merge.shape

(3321, 5)

In [43]:
training_merge.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [45]:
training_merge['Text'][0]

'cyclin dependent kinases cdks regulate variety fundamental cellular processes cdk10 stands one last orphan cdks activating cyclin identified kinase activity revealed previous work shown cdk10 silencing increases ets2 v ets erythroblastosis virus e26 oncogene homolog 2 driven activation mapk pathway confers tamoxifen resistance breast cancer cells precise mechanisms cdk10 modulates ets2 activity generally functions cdk10 remain elusive demonstrate cdk10 cyclin dependent kinase identifying cyclin activating cyclin cyclin orphan cyclin product fam58a whose mutations cause star syndrome human developmental anomaly whose features include toe syndactyly telecanthus anogenital renal malformations show star syndrome associated cyclin mutants unable interact cdk10 cyclin silencing phenocopies cdk10 silencing increasing c raf conferring tamoxifen resistance breast cancer cells cdk10 cyclin phosphorylates ets2 vitro cells positively controls ets2 degradation proteasome ets2 protein levels increa

In [48]:
lda = LatentDirichletAllocation(max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [49]:
tfidf = TfidfVectorizer(
    min_df=5, max_features=16000, strip_accents='unicode',lowercase =True, 
    analyzer='word', token_pattern=r'\w+', ngram_range=(1,4), use_idf=True, 
    smooth_idf=True, sublinear_tf=True, stop_words = 'english'
).fit(training_merge['Text'])

In [64]:
X_train_tfidfmatrix = tfidf.transform(training_merge['Text'].values)
X_test_tfidfmatrix = tfidf.transform(test_merge['Text'].values)

y_train = training_merge['Class'].values

In [65]:
lda.fit(X_train_tfidfmatrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [66]:
lda.fit(X_test_tfidfmatrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [67]:
print(X_train_tfidfmatrix.shape)

(3321, 16000)


In [68]:
print(X_test_tfidfmatrix.shape)

(986, 16000)


In [69]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train_tfidfmatrix,y_train,random_state=1)

In [70]:
clfA = MultinomialNB().fit(X_train, Y_train)
predicted= clfA.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(Y_test, predicted))
print(classification_report(Y_test,predicted))

MultinomialNB Accuracy: 0.546329723225
             precision    recall  f1-score   support

          1       0.48      0.54      0.51       142
          2       0.71      0.13      0.22       117
          3       0.00      0.00      0.00        27
          4       0.65      0.48      0.55       170
          5       0.36      0.32      0.34        50
          6       0.78      0.41      0.54        61
          7       0.54      0.96      0.69       250
          8       0.00      0.00      0.00         5
          9       0.00      0.00      0.00         9

avg / total       0.55      0.55      0.50       831



  'precision', 'predicted', average, warn_for)


In [71]:
#Creating a stacked model

In [72]:
seed = 1075
np.random.seed(seed)
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
clf_array = [rf, et, knn, svc, rg]
for clf in clf_array:
    vanilla_scores = cross_val_score(clf,X_train_tfidfmatrix,y_train, cv=10, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, 
       max_samples=0.4, max_features=10, random_state=seed)
    bagging_scores = cross_val_score(bagging_clf,X_train_tfidfmatrix,y_train, cv=10, 
       n_jobs=-1)
    
    print (clf.__class__.__name__,vanilla_scores.mean(), vanilla_scores.std())
    print (clf.__class__.__name__,bagging_scores.mean(), bagging_scores.std())

RandomForestClassifier 0.453221758339 0.0694804800442
RandomForestClassifier 0.401049442675 0.0674994732279
ExtraTreesClassifier 0.434194126335 0.0739748739112
ExtraTreesClassifier 0.401864454613 0.0626731841339
KNeighborsClassifier 0.376455914597 0.0790622862998
KNeighborsClassifier 0.378857800092 0.0872023182411
SVC 0.28697303984 0.00162263819212
SVC 0.28697303984 0.00162263819212
RidgeClassifier 0.441322476082 0.0644431783677
RidgeClassifier 0.28697303984 0.00162263819212


In [80]:
rf = ['rf',RandomForestClassifier()]
et = ['et',ExtraTreesClassifier()]
knn =['knn',KNeighborsClassifier()]
svc = ['svc',SVC(probability=True)]
rg = ['rg',RidgeClassifier()]
ada_boost = ['adb',AdaBoostClassifier()]
grad_boost = ['gdb',GradientBoostingClassifier()]
xgb_boost = ['xgb',XGBClassifier()]
clfs=[rf, et, knn, svc, ada_boost, grad_boost, xgb_boost]

In [81]:
eclf = VotingClassifier(estimators=clfs, voting='hard')

In [82]:
eclf.fit(X_train_tfidfmatrix,y_train)

VotingClassifier(estimators=[['rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)]],
         n_jobs=1, voting='hard', weights=None)

In [92]:
eclf

VotingClassifier(estimators=[['rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)]],
         n_jobs=1, voting='hard', weights=None)

In [86]:
y_pred=eclf.predict(X_test.toarray())

In [94]:
print(accuracy_score(Y_test,y_pred))

0.839951865223


In [87]:
print(classification_report(Y_test,y_pred))

             precision    recall  f1-score   support

          1       0.80      0.83      0.82       142
          2       0.98      0.73      0.83       117
          3       0.72      0.48      0.58        27
          4       0.86      0.91      0.89       170
          5       0.69      0.48      0.56        50
          6       0.86      0.84      0.85        61
          7       0.82      0.96      0.88       250
          8       1.00      0.80      0.89         5
          9       1.00      0.89      0.94         9

avg / total       0.84      0.84      0.83       831



In [None]:
y_test_predicted=eclf.predict_proba(X_test_tfidfmatrix)

In [None]:
submission_df = pd.DataFrame(y_test_predicted, columns=['class' + str(c + 1) for c in range(9)])
submission_df['ID'] = test_merge['ID']

In [None]:
submission_df.head()

In [None]:
submission_df.to_csv('sub.csv', index=False)