In [172]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.ensemble import VotingClassifier

In [173]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cancer-diagnosis/training_text
/kaggle/input/cancer-diagnosis/training_variants
/kaggle/input/cancertreatment/stage2_test_text.csv
/kaggle/input/cancertreatment/test_text
/kaggle/input/cancertreatment/stage_2_private_solution.csv
/kaggle/input/cancertreatment/stage2_sample_submission.csv
/kaggle/input/cancertreatment/stage1_solution_filtered.csv
/kaggle/input/cancertreatment/test_variants
/kaggle/input/cancertreatment/stage2_test_variants.csv


In [174]:
training_text = pd.read_csv("../input/cancer-diagnosis/training_text",sep="\|\|", header=None, skiprows=1, names=["ID","Text"])

  """Entry point for launching an IPython kernel.


In [175]:
training_text.head(5)

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [176]:
training_text.shape

(3321, 2)

In [177]:
training_variants = pd.read_csv("../input/cancer-diagnosis/training_variants")

In [178]:
training_variants.head(5)

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [179]:
training_variants.shape

(3321, 4)

In [180]:
#Merging variants and text on ID

In [181]:
training_merge = training_variants.merge(training_text,left_on="ID",right_on="ID")

In [182]:
training_merge.head(5)
#Now we have the data

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [183]:
training_merge.shape

(3321, 5)

In [184]:
test_text = pd.read_csv("../input/cancertreatment/stage2_test_text.csv",sep="\|\|", header=None, skiprows=1, names=["ID","Text"])

  """Entry point for launching an IPython kernel.


In [185]:
test_text.shape

(986, 2)

In [186]:
test_variants = pd.read_csv("../input/cancertreatment/stage2_test_variants.csv")

In [187]:
test_variants.shape

(986, 3)

In [188]:
test_merge = test_variants.merge(test_text,left_on="ID",right_on="ID")

In [189]:
# Let's understand the type of values present in each column of our dataframe 'train_merge' dataframe.
test_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 0 to 985
Data columns (total 4 columns):
ID           986 non-null int64
Gene         986 non-null object
Variation    986 non-null object
Text         986 non-null object
dtypes: int64(1), object(3)
memory usage: 38.5+ KB


In [190]:
training_merge.describe(include='all')

Unnamed: 0,ID,Gene,Variation,Class,Text
count,3321.0,3321,3321,3321.0,3316
unique,,264,2996,,1920
top,,BRCA1,Truncating Mutations,,The PTEN (phosphatase and tensin homolog) phos...
freq,,264,93,,53
mean,1660.0,,,4.365854,
std,958.834449,,,2.309781,
min,0.0,,,1.0,
25%,830.0,,,2.0,
50%,1660.0,,,4.0,
75%,2490.0,,,7.0,


In [191]:
training_merge.isnull().sum()

ID           0
Gene         0
Variation    0
Class        0
Text         5
dtype: int64

In [192]:
training_merge.columns

Index(['ID', 'Gene', 'Variation', 'Class', 'Text'], dtype='object')

In [193]:
training_merge["Text_num_words"] = training_merge["Text"].apply(lambda x: len(str(x).split()) )
training_merge["Text_num_chars"] = training_merge["Text"].apply(lambda x: len(str(x)) )

In [194]:
training_merge['Variation'].describe()

count                     3321
unique                    2996
top       Truncating Mutations
freq                        93
Name: Variation, dtype: object

In [195]:
training_merge['Gene'].describe()

count      3321
unique      264
top       BRCA1
freq        264
Name: Gene, dtype: object

In [196]:
from collections import Counter
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
# Import word_tokenize
from nltk.tokenize import word_tokenize
# Import stopwords
from nltk.corpus import stopwords
# Import string
import string
#Importing 

In [197]:
#Tokenzing-splitting up a larger body of text into smaller lines, words or even creating words 

In [198]:
#imputing gene row value to null data of text rows as for all other columns, Gene values are present in Text data
training_merge['Text'] = training_merge.apply(lambda row: row['Gene'] if pd.isnull(row['Text']) else row['Text'],axis=1)

In [199]:
training_merge.isnull().sum()

ID                0
Gene              0
Variation         0
Class             0
Text              0
Text_num_words    0
Text_num_chars    0
dtype: int64

In [200]:
#imputing gene row value to null data of text rows as for all other columns, Gene values are present in Text data
test_merge['Text'] = test_merge.apply(lambda row: row['Gene'] if pd.isnull(row['Text']) else row['Text'],axis=1)

In [201]:
mincl=[3,5,6,8,9]
maxcl=[1,2,4,7]

In [202]:
dfA=training_merge[training_merge['Class'].isin(mincl)]
dfB=training_merge[training_merge['Class'].isin(maxcl)]

In [203]:
dfA.head(5)

Unnamed: 0,ID,Gene,Variation,Class,Text,Text_num_words,Text_num_chars
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...,5572,36238
6,6,CBL,V430M,5,Oncogenic mutations in the monomeric Casitas B...,6202,41308
14,14,CBL,P428L,5,Oncogenic mutations in the monomeric Casitas B...,6202,41308
18,18,CBL,M374V,5,Oncogenic mutations in the monomeric Casitas B...,6202,41308
20,20,CBL,H94Y,6,Abstract Background Non-small cell lung canc...,11958,78000


In [204]:
dfA.describe()

Unnamed: 0,ID,Class,Text_num_words,Text_num_chars
count,662.0,662.0,662.0,662.0
mean,1957.531722,5.456193,7666.942598,50961.94864
std,874.612086,1.378586,4694.537482,31233.168057
min,3.0,3.0,1.0,3.0
25%,1311.5,5.0,4951.75,33028.0
50%,2289.0,5.5,6463.0,43019.0
75%,2664.5,6.0,8812.0,58319.0
max,3314.0,9.0,45177.0,297907.0


In [205]:
dfB.describe()

Unnamed: 0,ID,Class,Text_num_words,Text_num_chars
count,2659.0,2659.0,2659.0,2659.0
mean,1585.924784,4.094396,10020.205716,66766.388116
std,964.662185,2.41276,8388.648466,55764.62638
min,0.0,1.0,1.0,3.0
25%,748.5,2.0,4715.0,31199.5
50%,1545.0,4.0,7082.0,46804.0
75%,2331.5,7.0,12935.0,85401.5
max,3320.0,7.0,76782.0,523393.0


In [206]:
#taking class column as dependent variable ie which needs to be find out from all other columns in our data
ym=training_merge.Class
yA=dfA.Class
yB=dfB.Class

In [207]:
X_A=dfA[["Text","Variation","Gene"]]
X_B=dfB[["Text","Variation","Gene"]]
X_m=training_merge[["Text","Variation","Gene"]]

In [208]:
X_A.head()

Unnamed: 0,Text,Variation,Gene
3,Recent evidence has demonstrated that acquired...,N454D,CBL
6,Oncogenic mutations in the monomeric Casitas B...,V430M,CBL
14,Oncogenic mutations in the monomeric Casitas B...,P428L,CBL
18,Oncogenic mutations in the monomeric Casitas B...,M374V,CBL
20,Abstract Background Non-small cell lung canc...,H94Y,CBL


In [209]:
X_B.head()

Unnamed: 0,Text,Variation,Gene
0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,FAM58A
1,Abstract Background Non-small cell lung canc...,W802*,CBL
2,Abstract Background Non-small cell lung canc...,Q249E,CBL
4,Oncogenic mutations in the monomeric Casitas B...,L399V,CBL
5,Oncogenic mutations in the monomeric Casitas B...,V391I,CBL


In [210]:
# Definig vectorizing object for Text column
vect_text= CountVectorizer(stop_words ='english')

#Defining vectorizing object for Variation column
vect_variation= CountVectorizer(stop_words ='english')

##Defining vectorizing object for Gene column
gene_variation= CountVectorizer(stop_words ='english')

In [211]:
#vectorizing  for Text column which gives the count of repeated words for each row for both the dataframes
vect_text.fit(X_A["Text"])
vect_text.fit(X_B["Text"])
vect_text.fit(X_m["Text"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [212]:
#vectorizing for Variation column  which gives the count of repeated words for each row
vect_variation.fit(X_A["Variation"])
vect_variation.fit(X_B["Variation"])
vect_variation.fit(X_m["Variation"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [213]:
gene_variation.fit(X_A["Gene"])
gene_variation.fit(X_B["Gene"])
gene_variation.fit(X_m["Gene"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [214]:
len(vect_text.vocabulary_)

155436

In [215]:
len(vect_variation.vocabulary_)

3018

In [216]:
len(gene_variation.vocabulary_)

263

In [217]:
vect_text.vocabulary_

{'cyclin': 46416,
 'dependent': 49831,
 'kinases': 84518,
 'cdks': 39059,
 'regulate': 122979,
 'variety': 149216,
 'fundamental': 63308,
 'cellular': 39396,
 'processes': 116246,
 'cdk10': 38984,
 'stands': 134564,
 'orphan': 106331,
 'activating': 21738,
 'identified': 77289,
 'kinase': 84487,
 'activity': 21828,
 'revealed': 124400,
 'previous': 115708,
 'work': 152130,
 'shown': 131283,
 'silencing': 131834,
 'increases': 79006,
 'ets2': 58046,
 'ets': 58044,
 'erythroblastosis': 57693,
 'virus': 149982,
 'e26': 54074,
 'oncogene': 105646,
 'homolog': 74768,
 'driven': 52980,
 'activation': 21744,
 'mapk': 92253,
 'pathway': 109370,
 'confers': 43377,
 'tamoxifen': 138590,
 'resistance': 123868,
 'breast': 33867,
 'cancer': 36529,
 'cells': 39240,
 'precise': 115237,
 'mechanisms': 93442,
 'modulates': 96818,
 'generally': 66518,
 'functions': 63287,
 'remain': 123237,
 'elusive': 56114,
 'demonstrate': 49617,
 'identifying': 77318,
 'product': 116305,
 'fam58a': 60058,
 'mutations

In [218]:
vect_variation.vocabulary_

{'truncating': 2650,
 'mutations': 1623,
 'w802': 2878,
 'q249e': 1913,
 'n454d': 1667,
 'l399v': 1440,
 'v391i': 2743,
 'v430m': 2747,
 'deletion': 483,
 'y371h': 2946,
 'c384r': 224,
 'p395a': 1813,
 'k382e': 1243,
 'r420q': 2179,
 'c381a': 222,
 'p428l': 1816,
 'd390y': 372,
 'q367p': 1922,
 'm374v': 1593,
 'y371s': 2947,
 'h94y': 1097,
 'c396r': 225,
 'g375p': 918,
 's376f': 2429,
 'p417a': 1815,
 'h398y': 1060,
 's2g': 2409,
 'y846c': 3005,
 'c228t': 201,
 'h412y': 1062,
 'h876q': 1092,
 'promoter': 1880,
 'p704s': 1851,
 'amplification': 155,
 'c250t': 210,
 'g1809r': 866,
 'g1809k': 865,
 'd1709e': 317,
 'd1709a': 316,
 'e1705a': 528,
 'd1810a': 329,
 'e1705k': 529,
 't1365m': 2529,
 'v648g': 2789,
 't844m': 2628,
 'a707t': 122,
 'hypermethylation': 1100,
 'r1343l': 2004,
 'a209t': 72,
 'y280h': 2933,
 'd927g': 473,
 'n510k': 1674,
 'f248s': 742,
 'l708p': 1499,
 'v995m': 2828,
 'y412f': 2954,
 'f74s': 777,
 'r1040l': 1964,
 'r453c': 2183,
 'r1209w': 1986,
 'a1022e': 21,
 'q984k

In [219]:
gene_variation.vocabulary_

{'fam58a': 85,
 'cbl': 39,
 'shoc2': 227,
 'tert': 245,
 'dicer1': 62,
 'ptprt': 198,
 'rheb': 213,
 'shq1': 228,
 'ccnd2': 41,
 'rad50': 202,
 'ccnd3': 42,
 'rit1': 216,
 'ccne1': 43,
 'rybp': 222,
 'tgfbr1': 248,
 'tgfbr2': 249,
 'msh6': 154,
 'kmt2d': 134,
 'lats1': 137,
 'pbrm1': 177,
 'sf3b1': 226,
 'lats2': 138,
 'egfr': 66,
 'nkx2': 165,
 'eif1ax': 67,
 'arid2': 12,
 'brd4': 33,
 'hist1h1c': 110,
 'errfi1': 79,
 'chek2': 55,
 'pak1': 175,
 'tmprss2': 250,
 'h3f3a': 109,
 'elf3': 68,
 'ros1': 218,
 'asxl2': 15,
 'cdh1': 44,
 'epcam': 71,
 'ep300': 69,
 'epas1': 70,
 'tp53': 251,
 'tp53bp1': 252,
 'smad2': 229,
 'smad3': 230,
 'smad4': 231,
 'cdk4': 46,
 'aurkb': 20,
 'cdk6': 47,
 'fbxw7': 89,
 'cdk8': 48,
 'cdkn1a': 49,
 'cdkn1b': 50,
 'cdkn2a': 51,
 'cdkn2b': 52,
 'cdkn2c': 53,
 'asxl1': 14,
 'erbb2': 72,
 'erbb3': 73,
 'erbb4': 74,
 'ercc2': 75,
 'brip1': 34,
 'ercc3': 76,
 'ercc4': 77,
 'abl1': 0,
 'cebpa': 54,
 'erg': 78,
 'hla': 111,
 'pdgfra': 178,
 'pdgfrb': 179,
 'rbm10':

In [220]:
#transforming count of Variation words in to matrix
variation_tranform_train_A=vect_variation.transform(X_A["Variation"])
variation_tranform_train_B=vect_variation.transform(X_B["Variation"])
variation_tranform_train_m=vect_variation.transform(X_m["Variation"])

In [221]:
#transforming count of Text words in to matrix
text_transformed_train_A= vect_text.transform(X_A["Text"])
text_transformed_train_B= vect_text.transform(X_B["Text"])
text_transformed_train_m= vect_text.transform(X_m["Text"])

In [222]:
#transforming count of gene words in to matrix
gene_transformed_train_A= gene_variation.transform(X_A["Gene"])
gene_transformed_train_B= gene_variation.transform(X_B["Gene"])
gene_transformed_train_m= gene_variation.transform(X_m["Gene"])

In [223]:
#merging train data of two Matrix horixzontally to train the model
import scipy.sparse as sp
XA_final = sp.hstack((variation_tranform_train_A,text_transformed_train_A,gene_transformed_train_A))
XB_final = sp.hstack((variation_tranform_train_B,text_transformed_train_B,gene_transformed_train_B))
Xm_final = sp.hstack((variation_tranform_train_m,text_transformed_train_m,gene_transformed_train_m))

In [224]:
XA_final.shape

(662, 158717)

In [225]:
XB_final.shape

(2659, 158717)

In [226]:
yA.shape

(662,)

In [227]:
yB.shape

(2659,)

In [228]:
Xm_final.shape

(3321, 158717)

In [229]:
ym.shape

(3321,)

In [230]:
# splitting into test and train
from sklearn.model_selection  import train_test_split
from imblearn.over_sampling import SMOTE
XA_train, XA_test, yA_train, yA_test = train_test_split(XA_final, yA, random_state=1)

In [231]:
XB_train, XB_test, yB_train, yB_test = train_test_split(XB_final, yB, random_state=1)

In [232]:
print(XA_train.shape)

(496, 158717)


In [233]:
print(yA_train.shape)

(496,)


In [234]:
print(XA_test.shape)

(166, 158717)


In [235]:
print(yA_test.shape)

(166,)


In [236]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 
# Model Generation Using Multinomial Naive Bayes
clfA = MultinomialNB().fit(XA_train, yA_train)
predicted= clfA.predict(XA_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(yA_test, predicted))
print(classification_report(yA_test,predicted))

MultinomialNB Accuracy: 0.7590361445783133
              precision    recall  f1-score   support

           3       0.86      0.60      0.71        20
           5       0.68      0.78      0.72        54
           6       0.83      0.82      0.83        78
           8       0.50      0.20      0.29         5
           9       0.64      0.78      0.70         9

    accuracy                           0.76       166
   macro avg       0.70      0.64      0.65       166
weighted avg       0.76      0.76      0.76       166



In [237]:
# Model Generation Using Multinomial Naive Bayes
clfB = MultinomialNB().fit(XB_train, yB_train)
predicted= clfB.predict(XB_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(yB_test, predicted))
print(classification_report(yB_test,predicted))

MultinomialNB Accuracy: 0.6827067669172933
              precision    recall  f1-score   support

           1       0.61      0.71      0.66       140
           2       0.50      0.52      0.51       117
           4       0.79      0.65      0.71       175
           7       0.76      0.77      0.76       233

    accuracy                           0.68       665
   macro avg       0.67      0.66      0.66       665
weighted avg       0.69      0.68      0.68       665



In [238]:
from sklearn.ensemble import VotingClassifier

In [239]:
#Using Average weighting on the models that we have generated
#Using a voting classifier on Multinomial NB after hyperparameter tuning would be a waste as it NB is a pretty simple Model

In [240]:
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm_final, ym, random_state=1)

In [241]:
# Model Generation Using Multinomial Naive Bayes
clfm = MultinomialNB().fit(Xm_train, ym_train)
predict= clfm.predict(Xm_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(ym_test, predict))
print(classification_report(ym_test,predict))

MultinomialNB Accuracy: 0.5812274368231047
              precision    recall  f1-score   support

           1       0.50      0.61      0.55       142
           2       0.41      0.46      0.44       117
           3       0.50      0.26      0.34        27
           4       0.77      0.54      0.63       170
           5       0.33      0.48      0.39        50
           6       0.73      0.57      0.64        61
           7       0.67      0.70      0.69       250
           8       1.00      0.20      0.33         5
           9       0.73      0.89      0.80         9

    accuracy                           0.58       831
   macro avg       0.63      0.52      0.54       831
weighted avg       0.61      0.58      0.58       831



In [242]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier

In [243]:
#We see that the accuracy is skewed for the entire model as the data is imbalanced as we saw in the class distribution

In [244]:
#We would now try to tune the SVM hyperparameters and stack the classifiers built from them to be able to aid out 
#our final machine learning model

In [245]:
#Tuning the hyperparameters for the minority data

In [246]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = {'C': [0.1, 1], #10, 100, 1000],  
              'gamma': [1, 0.1, 0.01], #0.001, 0.0001], 
              'kernel': ['linear']}  
  
gridA = GridSearchCV(SVC(probability=True), param_grid, refit = True, verbose = 3,) 
  
# fitting the model for grid search 
gridA.fit(XA_train, yA_train) 

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.665, total=  13.3s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.3s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.651, total=  11.4s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.6s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.718, total=  12.8s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.665, total=  12.7s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.651, total=  11.2s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.718, total=  13.1s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .... C=0.1, gamma=0.01, kernel=linear, score=0.665, total=  12.5s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .... C=0.1, gamma=0.01, kernel=linear, score=0.651, total=  11.1s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .... C=0.1, gamma=0.01, kernel=linear, score=0.718, total=  12.6s
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  3.9min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1], 'gamma': [1, 0.1, 0.01],
                         'kernel': ['linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [247]:
# print best parameter after tuning 
print(gridA.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
print(gridA.best_estimator_) 

{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


In [248]:
gridA_predictions = gridA.predict(XA_test) 
  
# print classification report 
print(classification_report(yA_test, gridA_predictions)) 

              precision    recall  f1-score   support

           3       0.65      0.75      0.70        20
           5       0.78      0.72      0.75        54
           6       0.84      0.87      0.86        78
           8       0.60      0.60      0.60         5
           9       1.00      0.78      0.88         9

    accuracy                           0.80       166
   macro avg       0.77      0.74      0.76       166
weighted avg       0.80      0.80      0.80       166



In [249]:
# defining parameter range 
param_grid = {'C': [0.1, 1], #10, 100, 1000],  
              'gamma': [1],
              'kernel': ['linear']}  
gridB = GridSearchCV(SVC(probability=True), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
gridB.fit(XB_train, yB_train) 

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.647, total= 2.1min
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.674, total= 2.2min
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.3min remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.673, total= 2.4min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ......... C=1, gamma=1, kernel=linear, score=0.647, total= 2.4min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ......... C=1, gamma=1, kernel=linear, score=0.684, total= 2.6min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ......... C=1, gamma=1, kernel=linear, score=0.694, total= 2.6min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 14.4min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1], 'gamma': [1], 'kernel': ['linear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [250]:
gridB_predictions = gridB.predict(XB_test) 
  
# print classification report 
print(classification_report(yB_test, gridB_predictions)) 

              precision    recall  f1-score   support

           1       0.63      0.65      0.64       140
           2       0.54      0.56      0.55       117
           4       0.73      0.71      0.72       175
           7       0.77      0.75      0.76       233

    accuracy                           0.68       665
   macro avg       0.67      0.67      0.67       665
weighted avg       0.69      0.68      0.69       665



In [251]:
eclf2 = VotingClassifier(estimators=[('svmA',gridA), ('svmB', gridB)], voting='soft')

In [252]:
# fitting the model for grid search 
eclf2.fit(Xm_train, ym_train) 

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.571, total= 4.6min
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.561, total= 4.3min
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.9min remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.570, total= 4.7min
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.571, total= 4.7min
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.561, total= 4.3min
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] ..... C=0.1, gamma=0.1, kernel=linear, score=0.570, total= 4.6min
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .... C=0.1, gamma=0.01, kernel=linear, score=0.571, total= 4.5min
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .... C=0.1, gamma=0.01, kernel=linear, score=0.561, total= 4.3min
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .... C=0.1, gamma=0.01, kernel=linear, score=0.570, total= 4.6min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 83.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.571, total= 4.5min
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.5min remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.561, total= 4.3min
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.9min remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.570, total= 4.7min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ......... C=1, gamma=1, kernel=linear, score=0.580, total= 5.1min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ......... C=1, gamma=1, kernel=linear, score=0.564, total= 4.4min
[CV] C=1, gamma=1, kernel=linear .....................................
[CV] ......... C=1, gamma=1, kernel=linear, score=0.577, total= 4.8min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 27.8min finished


VotingClassifier(estimators=[('svmA',
                              GridSearchCV(cv='warn',
                                           error_score='raise-deprecating',
                                           estimator=SVC(C=1.0, cache_size=200,
                                                         class_weight=None,
                                                         coef0=0.0,
                                                         decision_function_shape='ovr',
                                                         degree=3,
                                                         gamma='auto_deprecated',
                                                         kernel='rbf',
                                                         max_iter=-1,
                                                         probability=True,
                                                         random_state=None,
                                                         shrinking=True,
      

In [253]:
#eclf2.probability = True

In [254]:
ym_pred=eclf2.predict(Xm_test)

In [255]:
print(classification_report(ym_test, ym_pred))

              precision    recall  f1-score   support

           1       0.56      0.54      0.55       142
           2       0.54      0.16      0.25       117
           3       0.29      0.07      0.12        27
           4       0.61      0.75      0.67       170
           5       0.52      0.22      0.31        50
           6       0.82      0.52      0.64        61
           7       0.60      0.90      0.72       250
           8       0.50      0.40      0.44         5
           9       1.00      0.56      0.71         9

    accuracy                           0.60       831
   macro avg       0.60      0.46      0.49       831
weighted avg       0.59      0.60      0.56       831



In [257]:
for i in range(1,9):
    print("Before SMOTE, counts of label {}: {}".format(i,sum(ym_train == i))) 
 
# apply near miss 
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2) 
  
X_train_miss, y_train_miss = sm.fit_sample(Xm_train, ym_train.ravel()) 
  
print('After SMOTE, the shape of train_X:{}'.format(X_train_miss.shape)) 
print('After SMOTE, the shape of train_y: {} \n'.format(y_train_miss.shape)) 

for i in range(1,9):
    print("After SMOTE, counts of label {}: {}".format(i,sum(y_train_miss == i))) 

Before SMOTE, counts of label 1: 426
Before SMOTE, counts of label 2: 335
Before SMOTE, counts of label 3: 62
Before SMOTE, counts of label 4: 516
Before SMOTE, counts of label 5: 192
Before SMOTE, counts of label 6: 214
Before SMOTE, counts of label 7: 703
Before SMOTE, counts of label 8: 14
After SMOTE, the shape of train_X:(6327, 158717)
After SMOTE, the shape of train_y: (6327,) 

After SMOTE, counts of label 1: 703
After SMOTE, counts of label 2: 703
After SMOTE, counts of label 3: 703
After SMOTE, counts of label 4: 703
After SMOTE, counts of label 5: 703
After SMOTE, counts of label 6: 703
After SMOTE, counts of label 7: 703
After SMOTE, counts of label 8: 703


In [None]:
#Hyperparameter tuning for Random Forest

alpha = [100,200]
         #500,1000,2000]
max_depth = [5, 10]
val_log_loss_array = []
for i in alpha:
    for j in max_depth:
        print("for n_estimators =", i,"and max depth = ", j)
        clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42)
        clf.fit(X_train_miss, y_train_miss)
        calib_clf = CalibratedClassifierCV(clf, method="sigmoid")
        calib_clf.fit(X_train_miss, y_train_miss)
        calib_clf_probs = calib_clf.predict_proba(X_test)
        val_log_loss_array.append(log_loss(y_test,calib_clf_probs, labels=clf.classes_, eps=1e-15))
        print("Log Loss :",log_loss(y_test,calib_clf_probs)) 

best_alpha = np.argmin(val_log_loss_array)
clf = RandomForestClassifier(n_estimators=alpha[int(best_alpha/2)], criterion='gini', max_depth=max_depth[int(best_alpha%2)], random_state=42)
clf.fit(X_train_miss,y_train_miss)
calib_clf = CalibratedClassifierCV(clf, method="sigmoid")
calib_clf.fit(X_train_miss, y_train_miss)

In [258]:
ran_clf=RandomForestClassifier(bootstrap=True,
                                                             class_weight=None,
                                                             criterion='gini',
                                                             max_depth=10,
                                                             max_features='auto',
                                                             max_leaf_nodes=None,
                                                             min_impurity_decrease=0.0,
                                                             min_impurity_split=None,
                                                             min_samples_leaf=1,
                                                             min_samples_split=2,
                                                             min_weight_fraction_leaf=0.0,
                                                             n_estimators=200,
                                                             n_jobs=None,
                                                             oob_score=False,
                                                             random_state=42,
                                                             verbose=0,
                                                             warm_start=False)

In [259]:
ran_clf.fit(X_train_miss,y_train_miss)
y_pred=ran_clf.predict(Xm_test)
print(classification_report(ym_test,ym_pred))

              precision    recall  f1-score   support

           1       0.56      0.54      0.55       142
           2       0.54      0.16      0.25       117
           3       0.29      0.07      0.12        27
           4       0.61      0.75      0.67       170
           5       0.52      0.22      0.31        50
           6       0.82      0.52      0.64        61
           7       0.60      0.90      0.72       250
           8       0.50      0.40      0.44         5
           9       1.00      0.56      0.71         9

    accuracy                           0.60       831
   macro avg       0.60      0.46      0.49       831
weighted avg       0.59      0.60      0.56       831



In [262]:
from sklearn import svm

In [263]:
voting_clf = VotingClassifier([('svc', svm.SVC(probability=True)),
                            ('nsb', MultinomialNB()),
                            ('rfor', RandomForestClassifier())],voting='soft')

In [264]:
voting_clf.fit(X_train_miss, y_train_miss)



VotingClassifier(estimators=[('svc',
                              SVC(C=1.0, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='auto_deprecated',
                                  kernel='rbf', max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=False)),
                             ('nsb',
                              MultinomialNB(alpha=1.0, class_prior=None,
                                            fit_prior=True)),
                             ('rfor',
                              RandomForestClassifier(b...
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                    

In [267]:
y_pred_class=voting_clf.predict(Xm_test)
classification_report(ym_test, y_pred_class)

'              precision    recall  f1-score   support\n\n           1       0.53      0.63      0.58       142\n           2       0.50      0.52      0.51       117\n           3       0.48      0.56      0.52        27\n           4       0.76      0.59      0.66       170\n           5       0.35      0.46      0.40        50\n           6       0.79      0.69      0.74        61\n           7       0.72      0.71      0.72       250\n           8       1.00      0.40      0.57         5\n           9       0.69      1.00      0.82         9\n\n    accuracy                           0.62       831\n   macro avg       0.65      0.62      0.61       831\nweighted avg       0.64      0.62      0.63       831\n'

In [None]:
print(accuracy_score(ym_test,y_pred_class))

In [268]:
y_pred_class

array([1, 7, 1, 7, 4, 1, 1, 2, 4, 1, 4, 1, 6, 1, 4, 1, 4, 7, 7, 1, 7, 3,
       5, 1, 4, 7, 7, 7, 5, 7, 1, 4, 7, 9, 7, 7, 4, 3, 2, 2, 1, 7, 7, 4,
       1, 7, 2, 7, 1, 3, 5, 4, 1, 7, 7, 1, 4, 4, 4, 5, 2, 6, 3, 7, 2, 7,
       4, 5, 4, 7, 1, 7, 7, 7, 1, 1, 7, 7, 7, 1, 6, 2, 4, 1, 2, 1, 4, 4,
       1, 7, 7, 7, 7, 5, 1, 2, 7, 2, 4, 4, 2, 6, 5, 1, 4, 7, 1, 5, 1, 7,
       6, 4, 2, 7, 7, 5, 7, 4, 1, 6, 7, 7, 2, 5, 1, 2, 5, 7, 3, 7, 4, 1,
       1, 7, 6, 7, 4, 2, 4, 4, 2, 7, 1, 7, 2, 2, 9, 2, 2, 7, 4, 7, 1, 7,
       6, 2, 7, 5, 1, 9, 4, 7, 2, 7, 6, 5, 7, 7, 1, 6, 3, 6, 7, 2, 1, 7,
       7, 2, 5, 7, 2, 4, 5, 7, 2, 4, 2, 7, 1, 1, 1, 2, 7, 5, 5, 4, 5, 3,
       5, 4, 1, 2, 7, 1, 6, 7, 7, 8, 9, 7, 1, 1, 2, 7, 3, 7, 4, 7, 7, 6,
       1, 7, 7, 7, 6, 5, 7, 7, 7, 1, 1, 4, 7, 4, 5, 3, 2, 4, 4, 2, 7, 2,
       5, 7, 7, 5, 7, 7, 9, 4, 1, 7, 4, 7, 7, 4, 7, 5, 1, 7, 7, 1, 1, 2,
       1, 7, 7, 7, 9, 7, 7, 7, 4, 6, 4, 2, 2, 7, 7, 1, 5, 4, 5, 5, 1, 1,
       7, 6, 7, 1, 1, 5, 1, 9, 1, 1, 2, 2, 4, 6, 7,

 **Using TF-IDF**

tf=TfidfVectorizer()
text_tf= tf.fit_transform(training_merge['Text'])

text_tf

X_train, X_test, y_train, y_test = train_test_split(
    text_tf, training_merge['Class'], test_size=0.3, random_state=123)

X_train.shape

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))
print(classification_report(y_test,predicted))

#Trying to handle imbalanced data

#SMOTE

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train_miss, y_train_miss)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))
print(classification_report(y_test,predicted))

#USING SMOTE

for i in range(1,9):
    print("Before SMOTE, counts of label {}: {}".format(i,sum(y_train == i))) 
 
# apply near miss 
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2) 
  
X_train_miss, y_train_miss = sm.fit_sample(X_train, y_train.ravel()) 
  
print('After SMOTE, the shape of train_X:{}'.format(X_train_miss.shape)) 
print('After SMOTE, the shape of train_y: {} \n'.format(y_train_miss.shape)) 

for i in range(1,9):
    print("After SMOTE, counts of label {}: {}".format(i,sum(y_train_miss == i))) 

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train_miss, y_train_miss)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))
print(classification_report(y_test,predicted))

#ACCURACY DOES BETTER WITH SMOTE SO WE WILL USE SMOTE

# Let's run Linear SVM model using the selected variables
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import classification_report
svc_model=svm.LinearSVC()
svc_model.fit(X_train_miss,y_train_miss)

#predicting the Test data using our trained Linear SVM model
y_pred_class = svc_model.predict(X_test)

print(classification_report(y_test, y_pred_class))