In [1]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# Import CountVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Load the NLP training set; later cells read its 'Text' and 'Class' columns.
# The colorectal training-set load below is left disabled (commented out).
#train_colorectal = pd.read_csv('../data/train_colorectal.csv')
train_nlp = pd.read_csv('../data/train_nlp.csv')

In [6]:
# Modelling inputs: the 'Text' column holds the documents (features) and the
# 'Class' column holds the label to predict.
X, y = train_nlp['Text'], train_nlp['Class']

In [7]:
# Check the class distribution, since this is a classification problem.
# normalize=True reports each class as a fraction of all rows rather than a raw count.
# The recorded output shows a clearly imbalanced target (largest class ~29%, smallest <1%),
# which is why the split below is stratified.

y.value_counts(normalize = True)

7    0.287093
4    0.206876
1    0.170688
2    0.136309
6    0.082328
5    0.072979
3    0.026840
9    0.011158
8    0.005730
Name: Class, dtype: float64

In [8]:
# Hold out 33% of the rows as the test set.
# stratify=y keeps the class proportions the same in both parts, and the fixed
# random_state makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [9]:

class LemmaTokenizer:
    """Callable tokenizer: split a document with NLTK, then lemmatize each token.

    An instance is passed as the ``tokenizer`` argument of ``CountVectorizer``,
    which calls it once per document.
    """

    def __init__(self):
        # One lemmatizer is created up front and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` as a list, in token order."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))



In [12]:
# Instantiate a bag-of-words CountVectorizer that tokenizes and lemmatizes each
# document with LemmaTokenizer.
# NOTE: no `stop_words` argument is passed, so stop words are NOT removed here;
# every token the tokenizer emits (including punctuation-like tokens) becomes a feature.
# token_pattern=None: the default token regex is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about the
# unused parameter when the vectorizer is fitted.

cvec = CountVectorizer(analyzer='word', tokenizer=LemmaTokenizer(), token_pattern=None)


In [13]:
# Fit the vectorizer on the training documents only, so the vocabulary is
# learned without looking at the test set.
# NOTE: this must run while X_train still holds the raw text; a later cell
# rebinds X_train to the vectorized matrix.
cvec.fit(X_train)

CountVectorizer(tokenizer=<__main__.LemmaTokenizer object at 0x000000DFA58B8970>)

In [14]:
# Transform the training documents into a document-term count matrix.
# NOTE(review): this rebinds X_train from raw text to the sparse matrix, so the
# cell is not idempotent -- re-running it (or the fit cell) without re-running
# the train/test split would feed the matrix back into the vectorizer.
X_train = cvec.transform(X_train)

In [15]:
# Display the vectorized training data (a sparse matrix summary, not the full contents).
X_train

<2221x129145 sparse matrix of type '<class 'numpy.int64'>'
	with 3310110 stored elements in Compressed Sparse Row format>

In [16]:
# Inspect the shape of the training matrix: (number of documents, vocabulary size).

X_train.shape

(2221, 129145)

In [17]:

# Peek at a small slice (positions 10-24) of the learned vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`; use the new method when it exists and fall
# back to the old one so the cell runs on both older and newer releases.
# Each term is converted to a plain `str` so the output is an ordinary list of
# strings regardless of which method produced it.
[
    str(term)
    for term in (
        cvec.get_feature_names_out()
        if hasattr(cvec, "get_feature_names_out")
        else cvec.get_feature_names()
    )[10:25]
]

['++++____psvzio',
 '+++âˆ',
 '++f',
 '++presence',
 '++âˆâˆ',
 '+_+delg',
 '+a',
 '+ap',
 '+at',
 '+b',
 '+ba',
 '+bach',
 '+bp',
 '+bx',
 '+byl']

In [18]:
# Transform the test documents with the vocabulary learned from the training set
# (transform only -- the vectorizer is not refit on test data).
# NOTE(review): like the training transform, this rebinds X_test from raw text
# to a sparse matrix, so the cell should not be re-run on its own.
X_test = cvec.transform(X_test)

In [19]:
# Sanity check: the test matrix must have the same number of columns as the training matrix.
X_test.shape

(1095, 129145)

In [20]:
# Naive Bayes

In [21]:
# Choose multinomial Naive Bayes: the features are non-negative token counts.

# Instantiate the model with its default settings.

nb = MultinomialNB()

In [22]:
# Fit the Naive Bayes model on the vectorized training data.
# `fit` returns the estimator itself, so `model` and `nb` refer to the same fitted object.

model = nb.fit(X_train, y_train)

In [23]:
# Generate Naive Bayes class predictions for the held-out test documents.

predictions = model.predict(X_test)

In [24]:
# Accuracy of the Naive Bayes model on the training set
# (compare with the test score in the next cell to gauge overfitting).

model.score(X_train, y_train)

0.7537145429986493

In [25]:
# Accuracy of the Naive Bayes model on the held-out test set.
# NOTE(review): the classes are imbalanced, so accuracy alone may be misleading;
# a per-class metric would give a fuller picture.

model.score(X_test, y_test)

0.5872146118721461

In [26]:
# Naive Bayes test accuracy: ~0.59 (0.587 in the run recorded above)

In [27]:
# Display the Naive Bayes test-set predictions (NumPy truncates the array in the output).
predictions

array([2, 7, 7, ..., 2, 5, 7], dtype=int64)

In [28]:
# KNN

In [29]:
# Hyper-parameter grid for the KNN search: every k from 1 to 30, crossed with
# both neighbour-weighting schemes.
k_range = [k for k in range(1, 31)]
weight_options = ["uniform", "distance"]

param_grid = {"n_neighbors": k_range, "weights": weight_options}

In [30]:
# Tune KNN with an exhaustive grid search: every (n_neighbors, weights) pair in
# param_grid is scored with 5-fold cross-validation on the training data.

knn = KNeighborsClassifier()
opt_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
opt_knn.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30],
                         'weights': ['uniform', 'distance']})

In [31]:
# Show the KNN parameter combination that achieved the best mean
# cross-validated score in the grid search.

opt_knn.best_params_

{'n_neighbors': 7, 'weights': 'distance'}

In [32]:
# Generate test-set predictions with the tuned KNN search object
# (predict is called on the fitted GridSearchCV).
predictions1 = opt_knn.predict(X_test)

In [33]:
# Score of the tuned KNN on the training set (for comparison with the test score below).
opt_knn.score(X_train, y_train)

0.9054479963980189

In [34]:
# Score of the tuned KNN on the held-out test set (~0.58 in the recorded run).
opt_knn.score(X_test, y_test)

0.5799086757990868

In [None]:
# SVM

In [35]:
# Instantiate a support vector classifier with default settings.
# NOTE(review): the grid search in the following cell constructs its own SVC()
# and does not reference this variable, so `svc` appears to be unused there.
svc = SVC()

In [36]:
# Grid-search an SVM over two values of C and three kernels.
# 'degree' is fixed at 2 and only matters for the 'poly' kernel.
# No `cv` argument is given, so GridSearchCV uses its default cross-validation.
gs1 = GridSearchCV(
    estimator=SVC(),
    param_grid={
        'C': [1, 10],
        'kernel': ('linear', 'rbf', 'poly'),
        'degree': [2],
    },
)
# The trailing semicolon suppresses the estimator repr in the cell output.
gs1.fit(X_train, y_train);

In [37]:
# Generate test-set predictions with the tuned SVM search object.
predictions2 = gs1.predict(X_test)

In [38]:
# Score of the tuned SVM on the training set.
gs1.score(X_train, y_train)

0.7919855920756416

In [39]:
# Score of the tuned SVM on the held-out test set.
gs1.score(X_test, y_test)

0.5990867579908675

In [None]:
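# SVM test accuracy: ~0.60 (0.599 in the run recorded above; this note previously read 0.62, presumably from an earlier run)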