In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Load the labelled tweets. Column names are supplied explicitly, so pandas
# treats the first line of the file as data rather than as a header row.
ds = pd.read_csv(
    "testdata_manual_2009_06_14.csv",
    names=[
        'Depression_Level',
        'SNo',
        'PostDate',
        'Source',
        'Destination',
        'Tweet',
    ],
)

In [28]:
ds.head()

Unnamed: 0,Depression_Level,SNo,PostDate,Source,Destination,Tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


# Importing NLP Text Preprocessing Libraries

In [37]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


## An alternative list of English stop-words from the "sklearn" library rather than 'nltk'

In [41]:
# ENGLISH_STOP_WORDS is publicly exposed through `sklearn.feature_extraction.text`.
# The old `sklearn.feature_extraction.stop_words` module path is private and no
# longer importable in recent scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [46]:
print(set(ENGLISH_STOP_WORDS))

{'may', 'was', 'thereafter', 'or', 'for', 'amongst', 'around', 'etc', 'my', 'are', 'always', 'as', 'then', 'her', 'six', 'myself', 'third', 'again', 'during', 'everywhere', 'made', 'once', 'therein', 'thereupon', 'than', 'whether', 'until', 'both', 'former', 'these', 'nor', 'against', 'bottom', 'alone', 'herein', 'his', 'side', 'it', 'own', 'every', 'you', 'beforehand', 'thick', 'hers', 'less', 'seemed', 'down', 'perhaps', 'hasnt', 'seem', 'be', 'sometimes', 'any', 'do', 'might', 'many', 'he', 'among', 'de', 'elsewhere', 'something', 'me', 'should', 'himself', 'ltd', 'thence', 'anyone', 'have', 'thin', 'per', 'several', 'very', 'toward', 'all', 'enough', 'top', 'fire', 'front', 'must', 'done', 'afterwards', 'of', 'not', 'three', 'mine', 'un', 'whereas', 'being', 'else', 'an', 'indeed', 'sixty', 'couldnt', 'others', 'those', 'there', 'towards', 'within', 'con', 'under', 'even', 'they', 'themselves', 'two', 'anywhere', 'together', 'besides', 'throughout', 'most', 'ourselves', 'becomes', 

In [29]:
sample = ds.Tweet[0]

In [31]:
print(sample)

@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.


# Preprocessing

## Sample text Cleaning...

In [45]:
# Walk the first tweet through the cleaning pipeline, printing each stage.
print("Before Preprocessing: ...\n", ds.Tweet[0])

# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols) with a space so that only alphabetic tokens remain.
tweet = re.sub('[^a-zA-Z]',' ', ds.Tweet[0])

# Lowercase the text and split it on whitespace into tokens.
tweet = tweet.lower().split()
print('\nRemoved special characters and numbers: ... \n', tweet)

# Porter stemming strips suffixes to reach a common stem
# (e.g. "fantastic" -> "fantast", "kindle" -> "kindl"). It does not map
# irregular forms such as "ate" to "eat"; that would require lemmatisation.
ps = PorterStemmer()

# Build the stop-word lookup once instead of once per token.
stop_words = set(ENGLISH_STOP_WORDS)

# Drop common words (prepositions, articles, conjunctions, ...) and stem the rest.
tweet = [ps.stem(word) for word in tweet if word not in stop_words]
tweet = ' '.join(tweet)
print('\nStemmed and removed common words: ... \n', tweet)


Before Preprocessing: ...
 @stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.

Removed special characters and numbers: ... 
 ['stellargirl', 'i', 'loooooooovvvvvveee', 'my', 'kindle', 'not', 'that', 'the', 'dx', 'is', 'cool', 'but', 'the', 'is', 'fantastic', 'in', 'its', 'own', 'right']

Stemmed and removed common words: ... 
 stellargirl loooooooovvvvvvee kindl dx cool fantast right


## Actual text cleaning and creation of the text corpus

In [68]:
corpus = []

In [69]:
# Clean every tweet and collect the results into `corpus`.
#
# `corpus` is (re)created here so that re-running this cell always yields
# exactly one entry per row; appending to a list created in another cell would
# grow it on every re-run and break the column assignment below.
ps = PorterStemmer()                  # one stemmer instance serves all tweets
stop_words = set(ENGLISH_STOP_WORDS)  # built once, not once per token

corpus = []
for raw_tweet in ds.Tweet:
    # Keep ASCII letters only (digits, punctuation and symbols become spaces),
    # then lowercase and split on whitespace into tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_tweet).lower().split()

    # Drop common words (prepositions, articles, conjunctions, ...) and reduce
    # the remaining tokens to their Porter stems.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]

    corpus.append(' '.join(stems))

# Store the cleaned text next to the original, row for row.
ds['Cleaned Tweet'] = corpus

In [70]:
ds.head()

Unnamed: 0,Depression_Level,SNo,PostDate,Source,Destination,Tweet,Cleaned Tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...,stellargirl loooooooovvvvvvee kindl dx cool fa...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...,read kindl love lee child good read
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck...",ok asses kindl fuck rock
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...,kenburbari ll love kindl ve month look new big...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...,mikefish fair kindl think s perfect


In [76]:
# Show the first tweet before and after cleaning, each under an underlined heading.
rule = '---------------'

print("Original Tweet", rule, ds.Tweet[0], sep='\n')
print('\n')

print("Cleaned Tweet", rule, ds['Cleaned Tweet'][0], sep='\n')

Original Tweet
---------------
@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right.


Cleaned Tweet
---------------
stellargirl loooooooovvvvvvee kindl dx cool fantast right


## Creating Bag of words model (Sparse Matrix)

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

In [86]:
# Count term occurrences per tweet, keeping at most 1000 vocabulary terms,
# then expand the sparse result into a dense array.
cv = CountVectorizer(max_features=1000)
term_counts = cv.fit_transform(corpus)
bag_of_words = term_counts.toarray()

In [87]:
bag_of_words = pd.DataFrame(bag_of_words, index=ds.index)

In [88]:
bag_of_words.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
# Replace the raw and cleaned text columns with the numeric bag-of-words
# columns, placing the two frames side by side.
metadata = ds.drop(['Tweet', 'Cleaned Tweet'], axis=1)
ds2 = pd.concat([metadata, bag_of_words], axis=1)

In [90]:
ds2.head(3)

Unnamed: 0,Depression_Level,SNo,PostDate,Source,Destination,0,1,2,3,4,...,990,991,992,993,994,995,996,997,998,999
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
ds2.tail(3)

Unnamed: 0,Depression_Level,SNo,PostDate,Source,Destination,0,1,2,3,4,...,990,991,992,993,994,995,996,997,998,999
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
497,0,14076,Sun Jun 14 21:36:17 UTC 2009,iran,captain_pete,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
ds2.isnull().sum()

Depression_Level    0
SNo                 0
PostDate            0
Source              0
Destination         0
                   ..
995                 0
996                 0
997                 0
998                 0
999                 0
Length: 1005, dtype: int64

In [93]:
ds2.Depression_Level.value_counts()

4    182
0    177
2    139
Name: Depression_Level, dtype: int64

# Splitting the Dataset into Training and Test sets

In [95]:
# Features: the bag-of-words count table. Target: the per-tweet label column.
X = bag_of_words
y = ds.Depression_Level

In [96]:
# `sklearn.cross_validation` no longer exists in scikit-learn; `model_selection`
# (the same module this notebook imports GridSearchCV from) provides train_test_split.
from sklearn.model_selection import train_test_split

In [97]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split


In [98]:
X_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
448,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
y_train.head(2)

265    4
448    2
Name: Depression_Level, dtype: int64

In [101]:
print('Shape of X_train', X_train.shape)
print('Shape of X_test', X_test.shape)

Shape of X_train (448, 1000)
Shape of X_test (50, 1000)


# Building the Models

In [106]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

## Naive Bayesian Model

In [103]:
nb = GaussianNB()

In [104]:
nb.fit(X_train,y_train)

GaussianNB(priors=None)

In [105]:
nb_prediction = nb.predict(X_test)

## Decision Tree Model

In [108]:
dtree = DecisionTreeClassifier()

In [109]:
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [110]:
dtree_predictions = dtree.predict(X_test)

## KNN Model

In [111]:
from sklearn.model_selection import GridSearchCV

In [113]:
# Search k = 2..49 for the KNN classifier, ranking candidates by accuracy.
knn_param = {'n_neighbors': [k for k in range(2, 50)]}
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param,
    scoring='accuracy',
)

In [114]:
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [115]:
print('Best Parameters: ', grid_knn.best_params_)

Best Parameters:  {'n_neighbors': 23}


In [116]:
knn_predictions = grid_knn.predict(X_test)

# Model Evaluations

In [117]:
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, classification_report

In [119]:
# Print the same evaluation bundle for each model: confusion matrix, per-class
# report, R^2 and accuracy, followed by a blank-line spacer.
# NOTE(review): r2_score is a regression metric and is of limited meaning on
# class labels; it is kept here only so the printed output stays the same.
model_outputs = (
    ('GaussianNB', '----------', nb_prediction),
    ('Decision Tree', '--------------', dtree_predictions),
    ('KNN', '---', knn_predictions),
)
for heading, underline, predicted in model_outputs:
    print(heading)
    print(underline)
    for metric in (confusion_matrix, classification_report, r2_score, accuracy_score):
        print(metric(y_test, predicted))
    print('\n\n')


GaussianNB
----------
[[11  1  3]
 [ 0 12  5]
 [ 2  8  8]]
             precision    recall  f1-score   support

          0       0.85      0.73      0.79        15
          2       0.57      0.71      0.63        17
          4       0.50      0.44      0.47        18

avg / total       0.63      0.62      0.62        50

-0.03595368677635591
0.62



Decision Tree
--------------
[[ 9  2  4]
 [ 2 11  4]
 [ 2  4 12]]
             precision    recall  f1-score   support

          0       0.69      0.60      0.64        15
          2       0.65      0.65      0.65        17
          4       0.60      0.67      0.63        18

avg / total       0.64      0.64      0.64        50

-0.09689213893967086
0.64



KNN
---
[[ 5  5  5]
 [ 1 10  6]
 [ 0  7 11]]
             precision    recall  f1-score   support

          0       0.83      0.33      0.48        15
          2       0.45      0.59      0.51        17
          4       0.50      0.61      0.55        18

avg / total       0.58

# PART 2: Using Dimensionality Reduction Methods

# Dimensionality Reduction Techniques
    Most Used Techniques are PCA, LDA, and Kernel PCA:
    Principal Component Analysis (PCA):
        Builds new components (linear combinations of the features) and keeps those with the highest explained variance.
        It does not consider the dependent variable, y, and is therefore called an unsupervised dimensionality-reduction method.
        
    Linear Discriminant Analysis (LDA):
        Finds the directions that best separate the classes of the dependent variable, y.
        It uses the class labels of the dependent variable, so it is a supervised dimensionality-reduction method.
        
    Kernel PCA:
        A non-linear extension of PCA for datasets that are not linearly separable, i.e. when X and y do not have a linear relationship.
        Used when many dimensions are involved.
        

## Principal Component Analysis (PCA)

In [137]:
# Principal Component Analysis 
from sklearn.decomposition import PCA

In [169]:
# Workflow: first fit with n_components=None to inspect the explained-variance
# ratio of every component, e.g.
#   pca = PCA(n_components=None)
# then re-fit with the number of components chosen from that inspection.

# Keep 400 components. The projection is fitted on the training split and then
# applied unchanged to the test split.
pca = PCA(n_components=400)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Fraction of the total variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_


In [170]:
# Total share of variance captured by the retained components. The PCA above
# keeps 400 components, so the earlier `[:450]` slice already spanned the whole
# array; summing it directly gives the same value without a misleading bound.
explained_variance.sum()

0.9958738341068587

In [171]:
len(X_train.columns)

1000

## Naive Bayesian Model

In [172]:
nb = GaussianNB()

In [173]:
nb.fit(X_train_pca,y_train)

GaussianNB(priors=None)

In [174]:
nb_prediction = nb.predict(X_test_pca)

## Decision Tree Model

In [175]:
dtree = DecisionTreeClassifier()

In [176]:
dtree.fit(X_train_pca,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [177]:
dtree_predictions = dtree.predict(X_test_pca)

## KNN Model

In [178]:
from sklearn.model_selection import GridSearchCV

In [179]:
# Search k = 2..49 for the KNN classifier, ranking candidates by accuracy.
knn_param = {'n_neighbors': [k for k in range(2, 50)]}
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param,
    scoring='accuracy',
)

In [180]:
grid_knn.fit(X_train_pca, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [181]:
print('Best Parameters: ', grid_knn.best_params_)

Best Parameters:  {'n_neighbors': 24}


In [182]:
knn_predictions = grid_knn.predict(X_test_pca)

# Model Evaluations

In [183]:
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, classification_report

In [184]:
# Evaluation of the models trained on the PCA-projected features: confusion
# matrix, per-class report, R^2 and accuracy per model, then a blank-line spacer.
# NOTE(review): r2_score is a regression metric and is of limited meaning on
# class labels; it is kept here only so the printed output stays the same.
model_outputs = (
    ('GaussianNB', '----------', nb_prediction),
    ('Decision Tree', '--------------', dtree_predictions),
    ('KNN', '---', knn_predictions),
)
for heading, underline, predicted in model_outputs:
    print(heading)
    print(underline)
    for metric in (confusion_matrix, classification_report, r2_score, accuracy_score):
        print(metric(y_test, predicted))
    print('\n\n')


GaussianNB
----------
[[ 6  3  6]
 [ 0  6 11]
 [ 2  3 13]]
             precision    recall  f1-score   support

          0       0.75      0.40      0.52        15
          2       0.50      0.35      0.41        17
          4       0.43      0.72      0.54        18

avg / total       0.55      0.50      0.49        50

-0.4929920780012187
0.5



Decision Tree
--------------
[[ 9  5  1]
 [ 2  8  7]
 [ 4  4 10]]
             precision    recall  f1-score   support

          0       0.60      0.60      0.60        15
          2       0.47      0.47      0.47        17
          4       0.56      0.56      0.56        18

avg / total       0.54      0.54      0.54        50

-0.15783059110298603
0.54



KNN
---
[[ 4  6  5]
 [ 0 12  5]
 [ 0 13  5]]
             precision    recall  f1-score   support

          0       1.00      0.27      0.42        15
          2       0.39      0.71      0.50        17
          4       0.33      0.28      0.30        18

avg / total       0.55  

## Linear Discriminant Analysis (LDA)

In [185]:
# Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [186]:
# LDA can produce at most min(n_classes - 1, n_features) discriminant axes.
# With the three label values in this dataset that is 2, so a request for 400
# components can never be honoured (recent scikit-learn releases reject it with
# a ValueError). n_components=None selects the maximum allowed number.
#
# LDA is supervised: the labels are needed for fitting, and the projection
# learned on the training split is reused for the test split.
lda = LDA(n_components=None)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)



In [187]:
len(X_train.columns)

1000

## Naive Bayesian Model

In [188]:
nb = GaussianNB()

In [189]:
nb.fit(X_train_lda,y_train)

GaussianNB(priors=None)

In [190]:
nb_prediction = nb.predict(X_test_lda)

## Decision Tree Model

In [191]:
dtree = DecisionTreeClassifier()

In [192]:
dtree.fit(X_train_lda,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [193]:
dtree_predictions = dtree.predict(X_test_lda)

## KNN Model

In [194]:
from sklearn.model_selection import GridSearchCV

In [195]:
# Search k = 2..49 for the KNN classifier, ranking candidates by accuracy.
knn_param = {'n_neighbors': [k for k in range(2, 50)]}
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param,
    scoring='accuracy',
)

In [196]:
grid_knn.fit(X_train_lda, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [197]:
print('Best Parameters: ', grid_knn.best_params_)

Best Parameters:  {'n_neighbors': 3}


In [198]:
knn_predictions = grid_knn.predict(X_test_lda)

# Model Evaluations

In [199]:
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, classification_report

In [200]:
# Evaluation of the models trained on the LDA-projected features: confusion
# matrix, per-class report, R^2 and accuracy per model, then a blank-line spacer.
# NOTE(review): r2_score is a regression metric and is of limited meaning on
# class labels; it is kept here only so the printed output stays the same.
model_outputs = (
    ('GaussianNB', '----------', nb_prediction),
    ('Decision Tree', '--------------', dtree_predictions),
    ('KNN', '---', knn_predictions),
)
for heading, underline, predicted in model_outputs:
    print(heading)
    print(underline)
    for metric in (confusion_matrix, classification_report, r2_score, accuracy_score):
        print(metric(y_test, predicted))
    print('\n\n')


GaussianNB
----------
[[6 6 3]
 [3 9 5]
 [3 8 7]]
             precision    recall  f1-score   support

          0       0.50      0.40      0.44        15
          2       0.39      0.53      0.45        17
          4       0.47      0.39      0.42        18

avg / total       0.45      0.44      0.44        50

-0.40158439975624627
0.44



Decision Tree
--------------
[[6 4 5]
 [3 7 7]
 [5 5 8]]
             precision    recall  f1-score   support

          0       0.43      0.40      0.41        15
          2       0.44      0.41      0.42        17
          4       0.40      0.44      0.42        18

avg / total       0.42      0.42      0.42        50

-0.7976843388177941
0.42



KNN
---
[[ 6  6  3]
 [ 3 10  4]
 [ 4  8  6]]
             precision    recall  f1-score   support

          0       0.46      0.40      0.43        15
          2       0.42      0.59      0.49        17
          4       0.46      0.33      0.39        18

avg / total       0.45      0.44      0.4

## Kernel Principal Component Analysis (Kernel PCA)
    Used for datasets that are not linearly separable

In [201]:
# Kernel Principal Component Analysis
from sklearn.decomposition import KernelPCA

In [202]:
# Project onto 400 kernel principal components using an RBF kernel.
# NOTE(review): 400 mirrors the value used for linear PCA; the original note here
# suggested first fitting with n_components=None to decide how many components to
# keep. Confirm that this was re-checked for the kernel variant.

# The projection is fitted on the training split and then applied unchanged to
# the test split.
kpca = KernelPCA(n_components=400, kernel='rbf')
X_train_kpca = kpca.fit_transform(X_train)
X_test_kpca = kpca.transform(X_test)

## Naive Bayesian Model

In [203]:
nb = GaussianNB()

In [204]:
nb.fit(X_train_kpca,y_train)

GaussianNB(priors=None)

In [205]:
nb_prediction = nb.predict(X_test_kpca)

## Decision Tree Model

In [206]:
dtree = DecisionTreeClassifier()

In [207]:
dtree.fit(X_train_kpca,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [208]:
dtree_predictions = dtree.predict(X_test_kpca)

## KNN Model

In [178]:
from sklearn.model_selection import GridSearchCV

In [209]:
# Search k = 2..49 for the KNN classifier, ranking candidates by accuracy.
knn_param = {'n_neighbors': [k for k in range(2, 50)]}
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param,
    scoring='accuracy',
)

In [210]:
grid_knn.fit(X_train_kpca, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [211]:
print('Best Parameters: ', grid_knn.best_params_)

Best Parameters:  {'n_neighbors': 24}


In [212]:
knn_predictions = grid_knn.predict(X_test_kpca)

# Model Evaluations

In [183]:
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, classification_report

In [213]:
# Evaluation of the models trained on the Kernel-PCA-projected features: confusion
# matrix, per-class report, R^2 and accuracy per model, then a blank-line spacer.
# NOTE(review): r2_score is a regression metric and is of limited meaning on
# class labels; it is kept here only so the printed output stays the same.
model_outputs = (
    ('GaussianNB', '----------', nb_prediction),
    ('Decision Tree', '--------------', dtree_predictions),
    ('KNN', '---', knn_predictions),
)
for heading, underline, predicted in model_outputs:
    print(heading)
    print(underline)
    for metric in (confusion_matrix, classification_report, r2_score, accuracy_score):
        print(metric(y_test, predicted))
    print('\n\n')


GaussianNB
----------
[[ 8  0  7]
 [ 0  3 14]
 [ 2  2 14]]
             precision    recall  f1-score   support

          0       0.80      0.53      0.64        15
          2       0.60      0.18      0.27        17
          4       0.40      0.78      0.53        18

avg / total       0.59      0.50      0.47        50

-0.5843997562461913
0.5



Decision Tree
--------------
[[ 7  5  3]
 [ 1  9  7]
 [ 2  6 10]]
             precision    recall  f1-score   support

          0       0.70      0.47      0.56        15
          2       0.45      0.53      0.49        17
          4       0.50      0.56      0.53        18

avg / total       0.54      0.52      0.52        50

-0.1882998171846435
0.52



KNN
---
[[ 5  6  4]
 [ 0 12  5]
 [ 0 13  5]]
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        15
          2       0.39      0.71      0.50        17
          4       0.36      0.28      0.31        18

avg / total       0.56   