In [185]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Import CountVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

In [158]:
# Load the NLP training table; the 'Text' and 'Class' columns are used below.
# (A read of '../data/train_colorectal.csv' was previously disabled here.)
train_nlp = pd.read_csv('../data/train_nlp.csv')

In [159]:
# Features are the raw 'Text' column; the target is the 'Class' label column.
X, y = train_nlp['Text'], train_nlp['Class']

In [160]:
# Relative frequency of each class label, to see how imbalanced the
# classification target is before splitting.
y.value_counts(normalize=True)

7    0.287093
4    0.206876
1    0.170688
2    0.136309
6    0.082328
5    0.072979
3    0.026840
9    0.011158
8    0.005730
Name: Class, dtype: float64

In [161]:
# Hold out 33% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [162]:
# Bag-of-words vectorizer with English stop words removed.
# analyzer='word' is spelled out for clarity; it is the library default.
cvec = CountVectorizer(analyzer='word', stop_words='english')


In [163]:
# Learn the vocabulary from the training text only, so that no information
# from the held-out test split leaks into the features.
cvec.fit(X_train)

CountVectorizer(stop_words='english')

In [164]:
# Convert the training text into a sparse document-term count matrix.
# NOTE(review): this rebinds X_train from raw text to the matrix, so the cell
# is not idempotent -- re-running it (or the fit cell above) would feed the
# matrix back into the vectorizer. Re-run the split cell first if needed.
X_train = cvec.transform(X_train)

In [165]:
# Show the sparse-matrix summary (dimensions, dtype, number of stored counts).
X_train

<2221x131020 sparse matrix of type '<class 'numpy.int64'>'
	with 3172645 stored elements in Compressed Sparse Row format>

In [166]:
# (rows, columns) of the vectorized training matrix: documents x vocabulary size.

X_train.shape

(2221, 131020)

In [167]:

# Peek at a slice of the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back otherwise, so
# the cell runs on both old and new releases. Either way a plain list of
# Python strings is displayed.
if hasattr(cvec, "get_feature_names_out"):
    feature_names = cvec.get_feature_names_out().tolist()
else:
    feature_names = cvec.get_feature_names()
feature_names[10:25]

['__pdel',
 '__radiolabeled',
 '__um',
 '__wdel',
 '__yins',
 '__âacd_____________ric',
 '_a_at',
 '_a_trx',
 '_aaai',
 '_aagacccg',
 '_aagggtt',
 '_akt_',
 '_alignedsam',
 '_alk_',
 '_alu_i']

In [168]:
# Vectorize the test text with the vocabulary fitted on the training split
# (transform only -- the vectorizer is not refitted on test data).
# NOTE(review): like the training transform, this rebinds X_test in place,
# so the cell cannot be re-run without re-running the split cell first.
X_test = cvec.transform(X_test)

In [169]:
# The column count must match the training matrix (same fitted vocabulary).
X_test.shape

(1095, 131020)

In [170]:
# Naive Bayes

In [171]:
# Use multinomial naive Bayes: the features are word counts produced by
# CountVectorizer.

# Instantiate the model with its default hyperparameters.

nb = MultinomialNB()

In [172]:
# Fit naive Bayes on the vectorized training data. fit() returns the fitted
# estimator, so `model` and `nb` refer to the same object.

model = nb.fit(X_train, y_train)

In [173]:
# Predict a class label for every document in the held-out test matrix.

predictions = model.predict(X_test)

In [174]:
# Accuracy of the naive Bayes model on the training set (compare with the
# test-set accuracy in the next cell to gauge overfitting).

model.score(X_train, y_train)

0.7559657811796489

In [175]:
# Accuracy of the naive Bayes model on the held-out test set.

model.score(X_test, y_test)

0.5881278538812785

In [176]:
# Display the predicted class labels (NumPy abbreviates the long array).
predictions

array([2, 7, 7, ..., 2, 5, 7], dtype=int64)

In [177]:
# KNN

In [178]:
# Hyperparameter search space for KNN: every k from 1 to 30, crossed with
# both neighbour-weighting schemes.
k_range = list(range(1, 31))
weight_options = ["uniform", "distance"]

param_grid = {
    "n_neighbors": k_range,
    "weights": weight_options,
}

In [179]:
# Tune KNN with an exhaustive grid search over `param_grid`, scoring every
# combination with 5-fold cross-validation on the training data.
knn = KNeighborsClassifier()
opt_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
)
opt_knn.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})

In [180]:
# Hyperparameter combination with the best mean cross-validation score.

opt_knn.best_params_

{'n_neighbors': 16, 'weights': 'distance'}

In [181]:
# Predict test-set labels with the grid search object (it delegates to the
# best KNN estimator found).
predictions1 = opt_knn.predict(X_test)

In [182]:
# Score of the tuned KNN on the training set (no custom `scoring` was given
# to GridSearchCV, so this is the classifier's default accuracy).
opt_knn.score(X_train, y_train)

0.9054479963980189

In [183]:
# Score of the tuned KNN on the held-out test set.
opt_knn.score(X_test, y_test)

0.6

In [None]:
# SVM

In [186]:
# Instantiate a support vector classifier with default hyperparameters.
# NOTE(review): the grid-search cell below constructs its own SVC(), so `svc`
# appears unused in this part of the notebook -- confirm before removing.
svc = SVC()

In [None]:
# Grid-search an SVM over the regularisation strength C and three kernels.
# `degree` is pinned to 2 and only matters for the 'poly' kernel. No `cv` is
# passed, so GridSearchCV's default cross-validation is used.
gs1 = GridSearchCV(
    estimator=SVC(),
    param_grid={
        'C': [1, 10],
        'kernel': ('linear', 'rbf', 'poly'),
        'degree': [2],
    },
)
# The trailing semicolon suppresses the estimator repr in the cell output.
gs1.fit(X_train, y_train);

In [None]:
# Predict test-set labels with the SVM grid search object.
predictions2 = gs1.predict(X_test)

In [None]:
# Score of the tuned SVM on the training set.
gs1.score(X_train, y_train)

In [None]:
# Score of the tuned SVM on the held-out test set.
gs1.score(X_test, y_test)