In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.preprocessing import normalize
import time, re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
codestarttime = time.clock()
# Read training variant csv.
trainingV = pd.read_csv(r'/kaggle/input/msk-redefining-cancer-treatment/training_variants',encoding = 'utf-8')

In [None]:
trainingV.head()

In [None]:
# Read training text csv.
trainingT = pd.read_csv(r'/kaggle/input/msk-redefining-cancer-treatment/training_text',sep='\|\|', header = None, skiprows = 1, names = ['ID','Text'],encoding = 'utf-8')


In [None]:
trainingT.head()

In [None]:
#Merging both Data Frames
trainData = trainingV.merge(trainingT,how= 'inner')

In [None]:
trainData.head()

In [None]:
# Re-ordering columns
trainData = trainData.reindex(columns=['ID','Gene','Variation','Text','Class'])

In [None]:
trainData.head()

Check for null values

In [None]:
trainData.isnull().sum()

In [None]:
trainData.shape

Removing Null values

In [None]:
trainData = trainData[~trainData.Text.isnull()]

In [None]:
trainData.shape

In [None]:
trainData.info()

Check the distribution of data for each class

In [None]:
# Per-class description of the Gene column, with Class turned back into a regular column
# (the `count` column of this frame is what the following bar chart plots).
df = trainData.groupby('Class').Gene.describe().reset_index()
df

In [None]:
# Bar chart of the per-class `count` values from the summary frame.
plt.figure(figsize=(15, 5))
sns.barplot(data=df, x='Class', y='count')
plt.title('Count of Gene in Each Class')

We can see that Class 7 has the highest count, while Classes 8 and 9 have very few samples.

 Install WordCloud for plotting the most common words in each class

In [None]:
# Install WordCloud for plotting the most common words in each class
# NOTE(review): the first cell of this notebook already runs `from wordcloud import ...`,
# so on a fresh kernel without the package that import fails before this install is
# reached — consider moving this cell above the imports.
!pip install wordcloud


In [None]:
#Function to remove any special character, any extra spaces in the Text column
def text_preprocessing(total_text):
    """Normalise one raw text value before vectorisation.

    Every character other than an ASCII letter, digit or newline is replaced by a
    space, the text is lower-cased and runs of whitespace are collapsed.

    Parameters
    ----------
    total_text : object
        Raw cell value. Anything that is not exactly an ``int`` is converted with
        ``str()`` first.

    Returns
    -------
    str or None
        The cleaned words, each followed by a single space (so a non-empty result
        ends with one trailing space). ``None`` is returned for ``int`` input,
        which is the behaviour callers have always seen.
    """
    if type(total_text) is int:
        # Unchanged behaviour, now explicit instead of falling off the end of the function.
        return None

    # replace every special char with space (raw string: no invalid "\s"-style escapes)
    cleaned = re.sub(r'[^a-zA-Z0-9\n]', ' ', str(total_text))
    # str.split() with no argument splits on any run of whitespace, which also collapses
    # repeated spaces/newlines; join once instead of growing a string inside a loop.
    words = cleaned.lower().split()
    return ''.join(word + ' ' for word in words)

Apply Text_processing function to Text Column

In [None]:
#text processing stage.
start_time = time.clock()
trainData.Text = trainData.Text.apply(text_preprocessing)
print('Time took for preprocessing the text :',time.clock() - start_time, "seconds")


In [None]:
trainData.Text.head()

In [None]:
# Get top n words in the Text
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent words of ``corpus`` as one space-separated string.

    English stop words are excluded. Words are ordered from most to least frequent;
    with ``n=None`` every word in the vocabulary is returned.
    """
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    # Column sums of the document-term matrix give the corpus-wide count of each term.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ' '.join(term for term, _ in ranked[:n])

In [None]:
# Plot Word Cloud for the top N words in the class
def plot_wordCloud(df,Class):
    """Show a word cloud built from the 2500 most frequent words of one class.

    ``df`` must have ``Class`` and ``Text`` columns; ``Class`` is the label to select.
    """
    class_text = df.loc[df.Class == Class, 'Text']
    frequent_words = get_top_n_words(class_text, 2500)
    cloud = WordCloud(background_color="white").generate(frequent_words)

    plt.figure(figsize=(15, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

__Plot Words cloud for top 2500 words in each class__

In [None]:
# One word cloud per class (1-9), in a single loop instead of nine copy-pasted cells.
# The class is printed first because the cloud itself carries no title.
for class_label in range(1, 10):
    print('Class =', class_label)
    plot_wordCloud(df = trainData,Class = class_label)

#### Let's see the top 100 words across all classes

In [None]:
# The 100 most frequent (non-stop) words over the whole training text, as one string.
topWords = get_top_n_words(corpus=trainData.Text, n=100)
topWords

###### Mutations, cells, patients, protein, tumor, variants and kinase are some of the words that occur in almost all the classes

#### Lets do a barplot for the count of top 10 words in each class

In [None]:
def get_top_words(corpus, n=None):
    """Return the ``n`` most frequent words of ``corpus`` together with their counts.

    English stop words are excluded.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count words in.
    n : int or None
        How many words to return; ``None`` returns the whole vocabulary.

    Returns
    -------
    (list, list)
        ``words`` and ``count``, aligned by position and ordered from the most to
        the least frequent word.
    """
    vec = CountVectorizer(stop_words='english').fit(corpus)
    # Column sums of the document-term matrix give the corpus-wide count of each term.
    sum_words = vec.transform(corpus).sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    # vocabulary_ is not ordered by frequency, so rank explicitly before slicing;
    # without this the "top" n words are just the first n vocabulary entries.
    words_freq.sort(key=lambda item: item[1], reverse=True)
    top = words_freq[:n]
    words = [word for word, _ in top]
    count = [freq for _, freq in top]
    return words,count

In [None]:
def plot_topwords(df,Class):
    """Draw a horizontal bar chart of ten words and their counts for one class.

    ``df`` must have ``Class`` and ``Text`` columns; ``Class`` is the label to select.
    The words and counts come from ``get_top_words(text, 10)``.
    """
    df = df[df.Class == Class]
    text = df.Text
    Words,Count = get_top_words(text,10)
    plt.figure(figsize= (12,5))
    # Pass x/y by keyword: recent seaborn releases no longer accept them positionally.
    sns.barplot(x=Count, y=Words)
    plt.title('Class =' + str(Class))
    plt.xlabel("Count of Words")
    plt.show()

In [None]:
# One bar chart per class (1-9), in a single loop instead of nine copy-pasted cells.
for class_label in range(1, 10):
    plot_topwords(df = trainData,Class = class_label)

### Find the total number of words for each Text row

In [None]:
# Whitespace-separated word count of every Text entry.
trainData['Number of Words'] = trainData.Text.map(lambda text: len(text.split()))
trainData.head()

### Distribution of number of words 

In [None]:
# Distribution of the per-row word counts.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version before relying on it.
plt.figure(figsize=(12, 8))
sns.distplot(trainData['Number of Words'])
plt.title("Frequency of number of words", fontsize=15)
plt.xlabel('Number of words in text', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()

##### We can see that the mean number of words in all the rows is 10000 

#### Lets do a box plot for the number of words less than 30000

In [None]:
# Box plot of word counts per class, restricted to rows with fewer than 30000 words.
df = trainData.loc[trainData['Number of Words'].lt(30000)]
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, x='Class', y='Number of Words')
plt.xlabel('Class', fontsize=12)
plt.ylabel('Text - Number of words', fontsize=12)
plt.show()

#### Split the data in to train and test

In [None]:
X = trainData.drop(columns=['ID','Class'])
Y = trainData.Class
x_train,x_test, y_train, y_test = train_test_split(X,Y,train_size = 0.7, random_state = 100)

In [None]:
x_train.shape

In [None]:
y_train.shape

#### Convert Gene column in to a Count Vectorizer matrix

In [None]:
# Learn the Gene vocabulary on the training rows only, then encode both splits with it.
geneCV = CountVectorizer().fit(x_train.Gene)
xtrain_gene_feature = geneCV.transform(x_train.Gene)
xtest_gene_feature = geneCV.transform(x_test.Gene)

#### Convert Variation column in to a Count Vectorizer matrix

In [None]:
# Learn the Variation vocabulary on the training rows only, then encode both splits with it.
variationCV = CountVectorizer().fit(x_train.Variation)
xtrain_variation_feature = variationCV.transform(x_train.Variation)
xtest_variation_feature = variationCV.transform(x_test.Variation)

#### Convert Text column in to a Count Vectorizer matrix

In [None]:
# Text vocabulary: English stop words removed, terms must occur in at least 5 training documents.
textCV = CountVectorizer(min_df=5, stop_words='english').fit(x_train.Text)
xtrain_text_feature = textCV.transform(x_train.Text)
xtest_text_feature = textCV.transform(x_test.Text)


#### Normalise the Text count matrix so that each feature column has unit L2 norm


In [None]:
# normalize() uses the L2 norm by default; axis=0 rescales every feature column independently.
# NOTE(review): the train and test matrices are each scaled by their own column norms, so
# the same raw count maps to different values in the two sets. Per-row scaling (axis=1)
# would not depend on the other rows — confirm which behaviour is intended.
xtrain_text_feature =  normalize(xtrain_text_feature, axis=0)
xtest_text_feature =  normalize(xtest_text_feature, axis=0)

#### Combine all the three count vector matrix

In [None]:
train_gene_var_text = hstack((xtrain_gene_feature,xtrain_variation_feature,xtrain_text_feature)).tocsr()
test_gene_var_text = hstack((xtest_gene_feature,xtest_variation_feature,xtest_text_feature)).tocsr()

In [None]:
train_gene_var_text.shape

In [None]:
def predict_and_plot_confusion_matrix(train_x, train_y,test_x, test_y, clf):
    """Fit ``clf``, report its test log loss and error rate, and plot the confusion matrix.

    Parameters
    ----------
    train_x, train_y : training features and labels; ``clf`` is (re)fitted on them.
    test_x, test_y : held-out features and labels used for every reported number.
    clf : classifier exposing ``fit``, ``predict``, ``predict_proba`` and ``classes_``.

    Nothing is returned; the metrics are printed and the heat map is drawn by
    ``plot_confusion_matrix``.
    """
    clf.fit(train_x, train_y)
    pred_y = clf.predict(test_x)

    # Pass the fitted class list so the probability columns are matched to the right
    # classes even when test_y does not contain every class.
    print("Log loss :",log_loss(test_y, clf.predict_proba(test_x), labels=clf.classes_))
    # Compare labels by position (not by pandas index) and by equality rather than by
    # subtraction, so this also works for non-numeric labels.
    # The printed value is the misclassified fraction, not a raw count.
    misclassified = np.count_nonzero(pred_y != np.asarray(test_y))
    print("Number of mis-classified points :", misclassified/test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)

In [None]:
def plot_confusion_matrix(y_test,pred_y):
    """Draw the confusion matrix of the nine classes as an annotated heat map.

    Rows are the original classes (``y_test``), columns the predicted ones (``pred_y``).
    """
    plt.figure(figsize=(20,7))
    labels = [1,2,3,4,5,6,7,8,9]
    # Build the matrix over the same fixed label list used for the ticks; otherwise a
    # class missing from both inputs shrinks the matrix and the tick labels no longer
    # line up with its rows and columns.
    confuMatrix = confusion_matrix(y_test,pred_y, labels=labels)
    sns.heatmap(confuMatrix,annot= True,cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')

## Let's Build Machine Learning Models

### Logistic regression

In [None]:
# Baseline: logistic regression with scikit-learn's default settings.
LR = LogisticRegression()
LR.fit(X=train_gene_var_text, y=y_train)

Accuracy and loss (cross-entropy) measure two different things. Cross-entropy loss awards lower loss to predictions which are closer to the class label. The accuracy, on the other hand, is a binary true/false for a particular sample. That is, Loss here is a continuous variable i.e. it’s best when predictions are close to 1 (for true labels) and close to 0 (for false ones). While accuracy is kind of discrete.

Lets' calculate Log Loss for all the Machine Learning Models here

In [None]:
# The `eps` argument of log_loss is deprecated and has been removed in newer scikit-learn
# releases; 1e-15 was its historical default, so it is simply left out.

#predict log-loss for train data
predict_y = LR.predict_proba(train_gene_var_text)
print("The train log loss is:",log_loss(y_train, predict_y, labels=LR.classes_))

#predict log-loss for test data
predict_y = LR.predict_proba(test_gene_var_text)
print("The test log loss is:",log_loss(y_test, predict_y, labels=LR.classes_))


#### Lets do a Grid Search CV to find suitable parameters

In [None]:
# Set up the logistic-regression search: candidate values for C and the penalty,
# to be tuned by a grid search with 10-fold CV on the training data.

# Candidate values of C: powers of ten from 1e-6 up to 1e2, all with the L2 penalty.
alpha = [10 ** exponent for exponent in range(-6, 3)]
params = dict(C=alpha, penalty=['l2'])

# Class-weighted multinomial logistic regression solved with newton-cg.
clfLR = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    class_weight='balanced',
    n_jobs=-1,
)


In [None]:
#Grid Search with 10 fold CV
random = GridSearchCV(clfLR,param_grid=params,n_jobs= -1,cv=10)
random.fit(train_gene_var_text,y_train)

In [None]:
best_alpha = random.best_params_['C']
best_penalty = random.best_params_['penalty']
print('The best value for Cost, C is ', best_alpha)

In [None]:
# Refit logistic regression with the hyper-parameters chosen by the grid search.
# The multinomial formulation is used instead of one-vs-rest; it is not supported by the
# liblinear solver, hence newton-cg.
clfLR = LogisticRegression(
    C=best_alpha,
    penalty=best_penalty,
    solver='newton-cg',
    multi_class='multinomial',
    class_weight='balanced',
)
clfLR.fit(X=train_gene_var_text, y=y_train)

In [None]:
# Score the tuned model. The labels come from clfLR itself (the model being scored) rather
# than from the unrelated baseline LR. The deprecated `eps` argument of log_loss is omitted
# (1e-15 was its historical default).

#predict log-loss for train data
predict_y = clfLR.predict_proba(train_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The train log loss is:",log_loss(y_train, predict_y, labels=clfLR.classes_))

#predict log-loss for test data
predict_y = clfLR.predict_proba(test_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The test log loss is:",log_loss(y_test, predict_y, labels=clfLR.classes_))

To avoid rounding error while multiplying probabilities we use log-probability estimates.
We then calibrate the predicted probabilities with sigmoid regression.

In [None]:
# Wrap the tuned logistic regression in a sigmoid probability calibrator and fit it.
sig_clfLR = CalibratedClassifierCV(clfLR, method='sigmoid')
sig_clfLR.fit(X=train_gene_var_text, y=y_train)

In [None]:
#predict log-loss for train data
predict_y = sig_clfLR.predict_proba(train_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The train log loss is:",log_loss(y_train, predict_y, labels=clfLR.classes_, eps=1e-15))

#predict log-loss for test data
predict_y = sig_clfLR.predict_proba(test_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The test log loss is:",log_loss(y_test, predict_y, labels=clfLR.classes_, eps=1e-15))

In [None]:
predict_and_plot_confusion_matrix(train_gene_var_text,y_train,test_gene_var_text,y_test,clf = sig_clfLR)

### Random Forest Classifier

In [None]:
# Running the random forest with default parameters.
rfc = RandomForestClassifier()
rfc.fit(train_gene_var_text,y_train)

In [None]:
#predict log-loss for train data
predict_y = rfc.predict_proba(train_gene_var_text)
print("The train log loss is:",log_loss(y_train, predict_y, labels=rfc.classes_, eps=1e-15))

#predict log-loss for test data
predict_y = rfc.predict_proba(test_gene_var_text)
print("The test log loss is:",log_loss(y_test, predict_y, labels=rfc.classes_, eps=1e-15))

In [None]:
# Hyper-parameter grid explored for the random forest.
param_grid = dict(
    max_depth=[4, 8, 10],
    min_samples_leaf=range(100, 400, 200),
    min_samples_split=range(200, 500, 200),
    n_estimators=[100, 200, 300],
    max_features=[5, 10],
)
# Default forest used as the template estimator for the search.
rf = RandomForestClassifier()
# 3-fold grid search over every combination, on all cores, with progress output.
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1)

grid_search.fit(train_gene_var_text,y_train)

In [None]:
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [None]:
# Building the Random Forest Classifier with best parameters
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=4,
                             min_samples_leaf=100, 
                             min_samples_split=200,
                             max_features=5,
                             n_estimators=100)
rfc.fit(train_gene_var_text,y_train)

In [None]:
# The messages no longer quote best_alpha / best_penalty: those belong to the logistic
# regression search and say nothing about this random forest.
# The deprecated `eps` argument of log_loss is omitted (1e-15 was its historical default).

#predict log-loss for train data
predict_y = rfc.predict_proba(train_gene_var_text)
print("The train log loss is:",log_loss(y_train, predict_y, labels=rfc.classes_))

#predict log-loss for test data
predict_y = rfc.predict_proba(test_gene_var_text)
print("The test log loss is:",log_loss(y_test, predict_y, labels=rfc.classes_))

In [None]:
# to avoid rounding error while multiplying probabilites we use log-probability estimates.
#Probability calibration with sigmoid regression.
sig_clfRFC = CalibratedClassifierCV(rfc, method="sigmoid")
sig_clfRFC.fit(train_gene_var_text, y_train)

In [None]:
#predict log-loss for train data
predict_y = sig_clfRFC.predict_proba(train_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The train log loss is:",log_loss(y_train, predict_y, labels=rfc.classes_, eps=1e-15))

#predict log-loss for test data
predict_y = sig_clfRFC.predict_proba(test_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The test log loss is:",log_loss(y_test, predict_y, labels=rfc.classes_, eps=1e-15))

In [None]:
predict_and_plot_confusion_matrix(train_gene_var_text,y_train,test_gene_var_text,y_test,clf = sig_clfRFC)

### Naive Bayes

In [None]:
# Build Naive Bayes with default parameters
mnb = MultinomialNB()

mnb.fit(train_gene_var_text,y_train)


In [None]:
#predict log-loss for train data
predict_y = mnb.predict_proba(train_gene_var_text)
print( "The train log loss is:",log_loss(y_train, predict_y, labels=mnb.classes_, eps=1e-15))

#predict log-loss for test data
predict_y = mnb.predict_proba(test_gene_var_text)
print("The test log loss is:",log_loss(y_test, predict_y, labels=mnb.classes_, eps=1e-15))

In [None]:

# Tune the smoothing parameter of Multinomial NB with a grid search and 10-fold CV
# on the training data.

# Candidate smoothing values.
# NOTE(review): the otherwise log-spaced list has no 0.01 entry — confirm that is intended.
alpha = [1e-05, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
params = dict(alpha=alpha)

# Untuned model used as the template estimator.
clf = MultinomialNB()
# 10-fold grid search on two workers, keeping the training scores as well.
random = GridSearchCV(estimator=clf, param_grid=params, cv=10, n_jobs=2, return_train_score=True)
random.fit(train_gene_var_text, y_train)

In [None]:
# Smoothing value selected by the grid search.
best_alpha = random.best_params_.get('alpha')
print("The best value for aplha is ", best_alpha)

The best value for alpha is the default value that is 1. 

In [None]:
# to avoid rounding error while multiplying probabilites we use log-probability estimates.
#Probability calibration with sigmoid regression.
sig_clfMNB = CalibratedClassifierCV(mnb, method="sigmoid")
sig_clfMNB.fit(train_gene_var_text, y_train)

In [None]:
#predict log-loss for train data
predict_y = sig_clfMNB.predict_proba(train_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The train log loss is:",log_loss(y_train, predict_y, labels=mnb.classes_, eps=1e-15))

#predict log-loss for test data
predict_y = sig_clfMNB.predict_proba(test_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The test log loss is:",log_loss(y_test, predict_y, labels=mnb.classes_, eps=1e-15))

In [None]:
predict_and_plot_confusion_matrix(train_gene_var_text,y_train,test_gene_var_text,y_test,clf = sig_clfMNB)

### Linear SVM Model

#### Let's train a linear SVM model

In [None]:
# Baseline: linear support-vector classifier with default settings.
from sklearn.svm import LinearSVC
linearsvc = LinearSVC()
linearsvc.fit(X=train_gene_var_text, y=y_train)


In [None]:
# specify range of parameters (C) as a list
params = {"C": [0.1, 1, 10, 100]}

model = LinearSVC()

# set up grid search scheme
# note that we are still using the 5 fold CV scheme we set up earlier
model_cv = GridSearchCV(estimator = model, param_grid = params,                          
                        cv = 10, 
                        verbose = 1,
                        n_jobs = -1,
                       return_train_score=True)   
# fit the model - it will fit 5 folds across all values of C
model_cv.fit(train_gene_var_text,y_train)  

In [None]:
best_C = model_cv.best_params_['C']
print("The best value for C is ", best_C)

#### To avoid rounding error while multiplying probabilities we use log-probability estimates. We then calibrate the predicted probabilities with sigmoid regression.

In [None]:
sig_clfLinearSVC = CalibratedClassifierCV(LinearSVC(C = 0.1), method="sigmoid")
sig_clfLinearSVC.fit(train_gene_var_text, y_train)

In [None]:
#predict log-loss for train data
predict_y = sig_clfLinearSVC.predict_proba(train_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The train log loss is:",log_loss(y_train, predict_y, labels=linearsvc.classes_, eps=1e-15))

#predict log-loss for test data
predict_y = sig_clfLinearSVC.predict_proba(test_gene_var_text)
print('For values of best alpha = ', best_alpha,'penalty',best_penalty, "The test log loss is:",log_loss(y_test, predict_y, labels=linearsvc.classes_, eps=1e-15))

In [None]:
predict_and_plot_confusion_matrix(train_gene_var_text,y_train,test_gene_var_text,y_test,clf = sig_clfLinearSVC)

## Conclusion

Of all the models above, calibrated Logistic Regression gives the lowest test log loss (__0.99__), followed by calibrated SVM (__1.01__), calibrated Naive Bayes (__1.26__) and calibrated Random Forest (__1.52__).

Our model logistic regression gives log-loss __0.99__ and error __34.0%__

#### Lets test our test data with Calibrated Logistic Regression Model

Load Test Data

In [None]:
# Read  Test Variant files
testV = pd.read_csv(r'/kaggle/input/msk-redefining-cancer-treatment/test_variants',encoding = 'utf-8')
testV.head()

In [None]:
# Read  Test Text files
testT = pd.read_csv(r'/kaggle/input/msk-redefining-cancer-treatment/test_text',sep='\|\|', header = None, skiprows = 1, names = ['ID','Text'],encoding = 'utf-8')
testT.head()

In [None]:
testData = pd.merge(testV,testT)
testData.head()

In [None]:
#text processing stage.
# time.clock() was removed in Python 3.8; time.perf_counter() is the replacement for
# measuring elapsed time.
start_time = time.perf_counter()
testData.Text = testData.Text.apply(text_preprocessing)
print('Time took for preprocessing the text :',time.perf_counter() - start_time, "seconds")


#### Convert Gene column in to a Count Vectorizer matrix

In [None]:
# Encode the test genes with the vocabulary already fitted on the training split.
testData_gene_feature = geneCV.transform(raw_documents=testData.Gene)

#### Convert Variation column in to a Count Vectorizer matrix

In [None]:
# Encode the test variations with the vocabulary already fitted on the training split.
testData_variation_feature = variationCV.transform(raw_documents=testData.Variation)

#### Convert Text column in to a Count Vectorizer matrix

In [None]:
# Encode the test text with the fitted text vocabulary; values are cast to str first.
testData_text_feature = textCV.transform(raw_documents=testData.Text.astype(str))

#### Normalise the Text count matrix so that each feature column has unit L2 norm


In [None]:
# normalize() uses the L2 norm by default; axis=0 rescales every feature column independently.
# NOTE(review): the columns are scaled by this matrix's own norms, not by those of the
# training matrix the models were fitted on — confirm that is intended.
testData_text_feature =  normalize(testData_text_feature, axis=0)

#### Combine all the three count vector matrix

In [None]:
testData_gene_var_text = hstack((testData_gene_feature,testData_variation_feature,testData_text_feature)).tocsr()

In [None]:
testData_gene_var_text.shape

In [None]:
final_pred = sig_clfLR.predict(testData_gene_var_text)

In [None]:
final_pred

In [None]:
testData['predicted_class'] = final_pred

In [None]:
testData.head()

Preparing submission data

In [None]:
submission_df = pd.get_dummies(testData['predicted_class'],prefix= 'class',prefix_sep= ' ')

In [None]:
submission_df.reset_index(inplace= True)
submission_df.rename(columns={'index':'ID'},inplace= True)

In [None]:
submission_df.to_csv('submission.csv', index=False)

In [None]:
# time.clock() was removed in Python 3.8, so it is only used where it still exists
# (keeping the reading comparable with codestarttime); otherwise time.perf_counter()
# is used.
codeendtime = getattr(time, 'clock', time.perf_counter)()
print('Code execution took: ', str((codeendtime - codestarttime)/60), 'mins')