In [56]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# Import CountVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
pip install xgboost

In [57]:
#train_colorectal = pd.read_csv('../data/train_colorectal.csv')
train_nlp = pd.read_csv('../data/train_colorectal.csv')

In [58]:
train_nlp

Unnamed: 0,ID,Text,Gene,Variation,Class
0,28,sequencing studies have identified many recurr...,TERT,C228T,7
1,31,sequencing studies have identified many recurr...,TERT,Promoter Mutations,7
2,33,the current world health organization classifi...,TERT,Amplification,2
3,34,sequencing studies have identified many recurr...,TERT,C250T,7
4,35,abstract dicer plays a critical role in micr...,DICER1,G1809R,4
...,...,...,...,...,...
916,3256,neuroblastoma the most common paediatric solid...,CASP8,Promoter Hypermethylation,4
917,3262,ret is a singlepass transmembrane receptor tyr...,RET,S891A,7
918,3269,oncogenic fusion of the ret rearranged during ...,RET,Fusions,2
919,3278,ret is a singlepass transmembrane receptor tyr...,RET,A883F,7


In [59]:
# set up data for modelling

X = train_nlp['Text']
y = train_nlp['Class']

In [60]:
# Check distribution since this is a classification problem

y.value_counts(normalize = True)

7    0.378936
1    0.187839
4    0.156352
2    0.143322
5    0.049946
6    0.048860
3    0.019544
9    0.008686
8    0.006515
Name: Class, dtype: float64

In [61]:
# split the data into the training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    stratify=y,
                                                    random_state=42
                                                    )

In [62]:

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]



In [63]:
# instantiate our CountVectorizer with default parameter and exclude stop words

cvec = CountVectorizer(analyzer='word', tokenizer=LemmaTokenizer(), ngram_range=(1, 1))


In [64]:
# fit the vectorizer on our corpus.
cvec.fit(X_train)



CountVectorizer(tokenizer=<__main__.LemmaTokenizer object at 0x000000F3F5E36E50>)

In [65]:
# transform the corpus.
X_train = cvec.transform(X_train)

In [66]:
X_train

<828x86393 sparse matrix of type '<class 'numpy.int64'>'
	with 1557478 stored elements in Compressed Sparse Row format>

In [67]:
# observe x shape

X_train.shape

(828, 86393)

In [68]:

cvec.get_feature_names()[10:25]

['+a',
 '+ap',
 '+association',
 '+at',
 '+bach',
 '+bp',
 '+chx',
 '+d',
 '+dd',
 '+delptpqp',
 '+distal',
 '+dmso',
 '+dox',
 '+edel',
 '+egf']

In [69]:
# transform test
X_test = cvec.transform(X_test)

In [70]:
X_test.shape

(93, 86393)

In [71]:
# Naiive Bayes

In [72]:
# choose multinomial naiive bayes

# instantiate our model

nb = MultinomialNB()

In [73]:
# fit our model

model = nb.fit(X_train, y_train)

In [74]:
# generate our predictions

predictions = model.predict(X_test)

In [75]:
# accuracy score of our model on the training set.

model.score(X_train, y_train)

0.7729468599033816

In [76]:
# accuracy score of our model on the testing set.

model.score(X_test, y_test)

0.6021505376344086

In [77]:
# 0.58

In [78]:
predictions 

array([7, 9, 7, 1, 6, 1, 7, 2, 2, 1, 4, 7, 1, 7, 7, 2, 1, 1, 1, 7, 1, 7,
       2, 7, 2, 2, 7, 1, 7, 7, 3, 5, 4, 1, 1, 6, 2, 7, 4, 1, 1, 7, 7, 2,
       1, 7, 7, 7, 6, 2, 4, 2, 7, 7, 2, 7, 5, 7, 1, 4, 7, 2, 7, 2, 7, 7,
       1, 5, 7, 4, 7, 1, 1, 4, 1, 2, 7, 4, 7, 7, 2, 7, 7, 7, 7, 1, 7, 7,
       1, 7, 1, 7, 5], dtype=int64)

In [79]:
# KNN

In [80]:
k_range = list(range(1,31))
weight_options = ["uniform", "distance"]

param_grid = dict(n_neighbors = k_range, weights = weight_options)

In [81]:
#KNN using GridSearch to find optimum KNN value

knn = KNeighborsClassifier() 
opt_knn = GridSearchCV(knn, param_grid, cv=5)
opt_knn.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})

In [82]:
# check knn best parameter

opt_knn.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [83]:
# generate predictions
predictions1 = opt_knn.predict(X_test)

In [84]:
opt_knn.score(X_train, y_train)

0.9214975845410628

In [85]:
opt_knn.score(X_test, y_test)   #0.6

0.4731182795698925

In [86]:
# SVM

In [87]:
# Instantiate support vector machine.
svc = SVC()

In [88]:
gs1 = GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf', 'poly'), 'degree':[2]})
gs1.fit(X_train,y_train);

In [89]:
predictions2 = gs1.predict(X_test)

In [90]:
gs1.score(X_train, y_train)

0.7608695652173914

In [91]:
gs1.score(X_test, y_test)

0.5591397849462365

In [92]:
# 0.62

In [151]:
# xgboost


In [None]:
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'learning_rate': 1e-1,
    'max_depth': 5,
    'num_class': 9,
    'nthread': 4,
    'seed': 42}

In [None]:
xg_reg = xgboost.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [30]:
xg_reg.fit(X_train,y_train)

NameError: name 'xg_reg' is not defined

In [155]:
xg_reg.score(X_train, y_train)

0.11904450763169128

In [156]:
xg_reg.score(X_test,y_test)

-0.09203302132047564

In [None]:
# rf




In [93]:
rf = RandomForestClassifier(n_estimators=100)

In [None]:
pre_score = cross_val_score(estimator = rf,
                            X = X_train, 
                            y = y_train,
                            scoring = 'accuracy',
                            cv = 10,
                            verbose = 0)

print('Random Forest mean score: %5.4f' %np.mean(pre_score))




In [None]:
# gridsearch for random forests

rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Random Forests using GridSearchCV

gs.score(X_train, y_train)

In [None]:
# Random Forests using GridSearchCV

gs.score(X_test, y_test)

In [None]:
# logistic regression



In [None]:
parameters = {'C': [0.001, 0.01, 0.1, 1, 10],
              'class_weight': [None, 'balanced'],
              'penalty': ['l1', 'l2']}

In [None]:
lr = LogisticRegression(solver = 'liblinear', 
                        max_iter = 1000,
                        random_state = 42)

gs_results = GridSearchCV(estimator = lr,                                    # Specify the model we want to GridSearch.
                          param_grid = parameters,                           # Specify the grid of parameters we want to search.
                          scoring = 'accuracy',                                # Specify recall as the metric to optimize 
                          cv = 5).fit(X_train, y_train) 

In [None]:
gs_results.best_estimator_.get_params()

In [None]:
gs_results.best_score_

In [None]:
logit = LogisticRegression(
 C= 0.001,
 class_weight= None,
 dual= False,
 fit_intercept= True,
 intercept_scaling= 1,
 l1_ratio= None,
 max_iter= 1000,
 multi_class= 'auto',
 n_jobs= None,
 penalty= 'l2',
 random_state= 42,
 solver= 'liblinear',
 tol= 0.0001,
 verbose= 0,
 warm_start= False)

In [None]:

logit.fit(X = X_train,
          y = y_train)

In [None]:
logit.score(X_train, y_train)

In [None]:
logit.score(X_test, y_test)