In [78]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import nltk
import string
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [37]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cancertreatment/stage2_test_text.csv
/kaggle/input/cancertreatment/test_text
/kaggle/input/cancertreatment/stage_2_private_solution.csv
/kaggle/input/cancertreatment/stage2_sample_submission.csv
/kaggle/input/cancertreatment/stage1_solution_filtered.csv
/kaggle/input/cancertreatment/test_variants
/kaggle/input/cancertreatment/stage2_test_variants.csv
/kaggle/input/cancer-diagnosis/training_text
/kaggle/input/cancer-diagnosis/training_variants


In [38]:
training_text = pd.read_csv("../input/cancer-diagnosis/training_text",sep="\|\|", header=None, skiprows=1, names=["ID","Text"])

  """Entry point for launching an IPython kernel.


In [39]:
training_text.head(5)

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [40]:
training_text.shape

(3321, 2)

In [41]:
training_variants = pd.read_csv("../input/cancer-diagnosis/training_variants")

In [42]:
training_variants.head(5)

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [43]:
#Merging variants and text on ID

In [44]:
training_merge = training_variants.merge(training_text,left_on="ID",right_on="ID")

In [45]:
training_merge.head(5)
#Now we have the data

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [46]:
training_merge.shape

(3321, 5)

In [47]:
test_text = pd.read_csv("../input/cancertreatment/stage2_test_text.csv",sep="\|\|", header=None, skiprows=1, names=["ID","Text"])

  """Entry point for launching an IPython kernel.


In [48]:
test_text.shape

(986, 2)

In [49]:
test_variants = pd.read_csv("../input/cancertreatment/stage2_test_variants.csv")

In [50]:
test_variants.shape

(986, 3)

In [51]:
test_merge = test_variants.merge(test_text,left_on="ID",right_on="ID")

In [52]:
# Let's understand the type of values present in each column of our dataframe 'train_merge' dataframe.
test_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 0 to 985
Data columns (total 4 columns):
ID           986 non-null int64
Gene         986 non-null object
Variation    986 non-null object
Text         986 non-null object
dtypes: int64(1), object(3)
memory usage: 38.5+ KB


In [53]:
training_merge.describe(include='all')

Unnamed: 0,ID,Gene,Variation,Class,Text
count,3321.0,3321,3321,3321.0,3316
unique,,264,2996,,1920
top,,BRCA1,Truncating Mutations,,The PTEN (phosphatase and tensin homolog) phos...
freq,,264,93,,53
mean,1660.0,,,4.365854,
std,958.834449,,,2.309781,
min,0.0,,,1.0,
25%,830.0,,,2.0,
50%,1660.0,,,4.0,
75%,2490.0,,,7.0,


In [54]:
training_merge.isnull().sum()

ID           0
Gene         0
Variation    0
Class        0
Text         5
dtype: int64

In [55]:
training_merge.columns

Index(['ID', 'Gene', 'Variation', 'Class', 'Text'], dtype='object')

In [56]:
training_merge["Text_num_words"] = training_merge["Text"].apply(lambda x: len(str(x).split()) )
training_merge["Text_num_chars"] = training_merge["Text"].apply(lambda x: len(str(x)) )

In [57]:
training_merge['Variation'].describe()

count                     3321
unique                    2996
top       Truncating Mutations
freq                        93
Name: Variation, dtype: object

In [58]:
training_merge['Gene'].describe()

count      3321
unique      264
top       BRCA1
freq        264
Name: Gene, dtype: object

In [59]:
#Tokenzing-splitting up a larger body of text into smaller lines, words or even creating words 

In [60]:
#imputing gene row value to null data of text rows as for all other columns, Gene values are present in Text data
training_merge['Text'] = training_merge.apply(lambda row: row['Gene'] if pd.isnull(row['Text']) else row['Text'],axis=1)

In [61]:
training_merge.isnull().sum()

ID                0
Gene              0
Variation         0
Class             0
Text              0
Text_num_words    0
Text_num_chars    0
dtype: int64

In [62]:
X_m=training_merge[["Text","Variation","Gene"]]

In [63]:
X_t=training_merge["Text"]

In [64]:
tfidf = TfidfVectorizer(
    min_df=5, max_features=16000, strip_accents='unicode',lowercase =True, 
    analyzer='word', token_pattern=r'\w+', ngram_range=(1,4), use_idf=True, 
    smooth_idf=True, sublinear_tf=True, stop_words = 'english'
).fit(X_t)

In [65]:
X_train_tfidfmatrix = tfidf.transform(training_merge['Text'].values)
X_test_tfidfmatrix = tfidf.transform(test_merge['Text'].values)

y_train = training_merge['Class'].values

In [66]:
X_train_tfidfmatrix.shape

(3321, 16000)

In [67]:
X_test_tfidfmatrix.shape

(986, 16000)

In [68]:
#Evaluate function to help predict

In [69]:
def evaluate(X, y, clf=None):
    probas = cross_val_predict(clf, X, y, cv=StratifiedKFold(n_splits=5, random_state=8), 
                              n_jobs=-1, method='predict_proba', verbose=2)
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]
    print('Log loss: {}'.format(log_loss(y, probas)))
    print('Accuracy: {}'.format(accuracy_score(y, preds)))

In [70]:
evaluate(X_train_tfidfmatrix, y_train, clf=XGBClassifier())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Log loss: 1.4778248799122922
Accuracy: 0.5061728395061729


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 22.4min finished


In [71]:
clf = XGBClassifier()
clf.fit(X_train_tfidfmatrix, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [72]:
y_test_predicted = clf.predict_proba(X_test_tfidfmatrix)

In [76]:
y_pred=clf.predict(X_test_tfidfmatrix)

In [None]:
submission_df = pd.DataFrame(y_test_predicted, columns=['class' + str(c + 1) for c in range(9)])
submission_df['ID'] = test_merge['ID']

In [None]:
submission_df.head()

In [None]:
submission_df.to_csv('sub.csv', index=False)