In [163]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# Import CountVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

In [82]:
pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2



In [115]:
#train_colorectal = pd.read_csv('../data/train_colorectal.csv')
train_nlp = pd.read_csv('../data/train_colorectal.csv')

In [116]:
train_nlp.head()

Unnamed: 0,ID,Text,Gene,Variation,Class
0,28,sequencing studies have identified many recurr...,TERT,C228T,7
1,31,sequencing studies have identified many recurr...,TERT,Promoter Mutations,7
2,33,the current world health organization classifi...,TERT,Amplification,2
3,34,sequencing studies have identified many recurr...,TERT,C250T,7
4,35,abstract dicer plays a critical role in micr...,DICER1,G1809R,4


In [117]:
# set up data for modelling

X = train_nlp['Text']
y = train_nlp['Class']

In [118]:
# Check distribution since this is a classification problem

y.value_counts(normalize = True)

7    0.378936
1    0.187839
4    0.156352
2    0.143322
5    0.049946
6    0.048860
3    0.019544
9    0.008686
8    0.006515
Name: Class, dtype: float64

In [119]:
# split the data into the training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42
                                                    )

In [120]:

class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]



In [121]:
# instantiate our CountVectorizer with default parameter and exclude stop words

cvec = CountVectorizer(analyzer='word', tokenizer=LemmaTokenizer(), ngram_range=(1, 1))


In [122]:
# fit the vectorizer on our corpus.
cvec.fit(X_train)



CountVectorizer(tokenizer=<__main__.LemmaTokenizer object at 0x000000DF96391D30>)

In [123]:
# transform the corpus.
X_train = cvec.transform(X_train)

In [124]:
X_train

<736x82232 sparse matrix of type '<class 'numpy.int64'>'
	with 1387628 stored elements in Compressed Sparse Row format>

In [125]:
# observe x shape

X_train.shape

(736, 82232)

In [126]:

cvec.get_feature_names()[10:25]

['+a',
 '+ap',
 '+association',
 '+at',
 '+bach',
 '+bp',
 '+chx',
 '+d',
 '+dd',
 '+delptpqp',
 '+distal',
 '+dmso',
 '+dox',
 '+edel',
 '+egf']

In [127]:
# transform test
X_test = cvec.transform(X_test)

In [128]:
X_test.shape

(185, 82232)

In [129]:
# Naiive Bayes

In [130]:
# choose multinomial naiive bayes

# instantiate our model

nb = MultinomialNB()

In [131]:
# fit our model

model = nb.fit(X_train, y_train)

In [132]:
# generate our predictions

predictions = model.predict(X_test)

In [133]:
# accuracy score of our model on the training set.

model.score(X_train, y_train)

0.7771739130434783

In [134]:
# accuracy score of our model on the testing set.

model.score(X_test, y_test)

0.5945945945945946

In [135]:
# 0.58

In [136]:
predictions 

array([7, 7, 5, 4, 4, 4, 2, 4, 1, 6, 1, 7, 6, 7, 1, 7, 2, 7, 7, 7, 1, 7,
       7, 7, 7, 7, 2, 7, 4, 7, 4, 5, 7, 4, 7, 4, 5, 7, 7, 4, 7, 1, 7, 2,
       7, 1, 4, 7, 9, 7, 4, 2, 1, 4, 5, 7, 7, 1, 7, 5, 7, 4, 2, 1, 5, 7,
       2, 2, 7, 7, 2, 2, 7, 7, 7, 7, 4, 1, 7, 7, 1, 7, 9, 7, 7, 1, 7, 1,
       7, 4, 7, 4, 3, 1, 2, 4, 1, 7, 7, 7, 5, 1, 7, 7, 7, 1, 2, 7, 7, 7,
       7, 7, 7, 4, 4, 1, 7, 7, 7, 2, 1, 2, 1, 1, 1, 2, 2, 1, 7, 2, 1, 1,
       6, 5, 3, 7, 7, 7, 1, 7, 6, 7, 7, 7, 4, 1, 7, 4, 1, 2, 1, 7, 4, 1,
       1, 7, 7, 5, 7, 7, 1, 1, 7, 1, 2, 2, 4, 7, 7, 1, 3, 7, 1, 2, 7, 1,
       7, 1, 7, 7, 1, 3, 7, 7, 2], dtype=int64)

In [137]:
# KNN

In [138]:
k_range = list(range(1,31))
weight_options = ["uniform", "distance"]

param_grid = dict(n_neighbors = k_range, weights = weight_options)

In [139]:
#KNN using GridSearch to find optimum KNN value

knn = KNeighborsClassifier() 
opt_knn = GridSearchCV(knn, param_grid, cv=5)
opt_knn.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})

In [140]:
# check knn best parameter

opt_knn.best_params_

{'n_neighbors': 6, 'weights': 'distance'}

In [141]:
# generate predictions
predictions1 = opt_knn.predict(X_test)

In [142]:
opt_knn.score(X_train, y_train)

0.9279891304347826

In [143]:
opt_knn.score(X_test, y_test)   #0.6

0.5243243243243243

In [144]:
# SVM

In [145]:
# Instantiate support vector machine.
svc = SVC()

In [146]:
gs1 = GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf', 'poly'), 'degree':[2]})
gs1.fit(X_train,y_train);

In [147]:
predictions2 = gs1.predict(X_test)

In [148]:
gs1.score(X_train, y_train)

0.7635869565217391

In [149]:
gs1.score(X_test, y_test)

0.6162162162162163

In [150]:
# 0.62

In [151]:
# xgboost


In [152]:
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'learning_rate': 1e-1,
    'max_depth': 5,
    'num_class': 9,
    'nthread': 4,
    'seed': 42}

In [153]:
xg_reg = xgboost.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [154]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [155]:
xg_reg.score(X_train, y_train)

0.11904450763169128

In [156]:
xg_reg.score(X_test,y_test)

-0.09203302132047564

In [None]:
# rf




In [158]:
rf = RandomForestClassifier(n_estimators=100)

In [159]:
pre_score = cross_val_score(estimator = rf,
                            X = X_train, 
                            y = y_train,
                            scoring = 'accuracy',
                            cv = 10,
                            verbose = 0)

print('Random Forest mean score: %5.4f' %np.mean(pre_score))




Random Forest mean score: 0.6074


In [160]:
# gridsearch for random forests

rf_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.6168505239933811


{'max_depth': None, 'n_estimators': 200}

In [161]:
# Random Forests using GridSearchCV

gs.score(X_train, y_train)

0.9293478260869565

In [162]:
# Random Forests using GridSearchCV

gs.score(X_test, y_test)

0.6108108108108108

In [None]:
# logistic regression



In [164]:
parameters = {'C': [0.001, 0.01, 0.1, 1, 10],
              'class_weight': [None, 'balanced'],
              'penalty': ['l1', 'l2']}

In [None]:
lr = LogisticRegression(solver = 'liblinear', 
                        max_iter = 1000,
                        random_state = 42)

gs_results = GridSearchCV(estimator = lr,                                    # Specify the model we want to GridSearch.
                          param_grid = parameters,                           # Specify the grid of parameters we want to search.
                          scoring = 'accuracy',                                # Specify recall as the metric to optimize 
                          cv = 5).fit(X_train, y_train) 

In [None]:
gs_results.best_estimator_.get_params()

In [None]:
gs_results.best_score_

In [None]:
pre_score = cross_val_score(estimator = lr,
                            X = X_train, 
                            y = y_train,
                            scoring = 'accuracy',
                            cv = 10,
                            verbose = 0)

print('Logistic Regression mean score: %5.4f' %np.mean(pre_score))

In [None]:
logit = LogisticRegression(solver = 'liblinear', 
                           max_iter = 1000,
                           C = 0.1,
                           class_weight = None,
                           penalty = 'l2'
                           )

In [None]:

logit.fit(X = X_train,
          y = y_train)

In [None]:
logit.score(X_train, y_train)

In [None]:
logit.score(X_test, y_test)