In [1]:
import math
import random
from collections import defaultdict
from pprint import pprint

# Prevent future/deprecation warnings from showing in output
import warnings
warnings.filterwarnings(action='ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Set global styles for plots
sns.set_style(style='white')
sns.set_context(context='notebook', font_scale=1.3, rc={'figure.figsize': (16,9)})

In [14]:
df = pd.read_csv('reddit_posts_labels.csv', encoding='utf-8')
df.head()

Unnamed: 0,post,label
0,WPP’s Wavemaker with Acceleration adds in-hous...,0
1,NXTP💰🚀nextplaytechnologies,0
2,The National Dental Association and SmileDirec...,0
3,PHUN just the cool off before the real blast off🚀,1
4,#premarket #watchlist 10/29 $GFAI - Guardforce...,1


### We require only the positive and negative reviews for classification, so ignor

In [17]:
df = df[df.label != 0]
df.label.value_counts()

 1    295
-1    104
Name: label, dtype: int64

### Vectorization to convert posts to features

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=1000, binary=True)
X = vect.fit_transform(df.post)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Splitting training and testing data
30% of the data converted to testing data

In [19]:
from sklearn.model_selection import train_test_split

X = df.post
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(max_features=1000, binary=True)

X_train_vect = vect.fit_transform(X_train)

In [21]:
counts = df.label.value_counts()
print(counts)

print("\nPredicting only -1 = {:.2f}% accuracy".format(counts[-1] / sum(counts) * 100))

 1    295
-1    104
Name: label, dtype: int64

Predicting only -1 = 26.07% accuracy


In [24]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()

X_train_res, y_train_res = sm.fit_resample(X_train_vect, y_train)

In [32]:
unique, counts = np.unique(y_train_res, return_counts=True)
print("Counts of -1 and +1 labels : ",counts)

Counts of -1 and +1 labels :  [203 203]


## Naive Bayes

### First testing for Multinomial Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

mnb = MultinomialNB()

mnb.fit(X_train_res, y_train_res)

mnb.score(X_train_res, y_train_res)

0.9679802955665024

In [42]:
X_test_vect = vect.transform(X_test)

y_pred = mnb.predict(X_test_vect)

y_pred

array([-1,  1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1,  1,
        1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
       -1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
       -1], dtype=int64)

In [60]:
X_test

86     China Hints Its Crackdown on Tech Giants Is Co...
828    First option, how did I do lol? (Suoer noob, I...
146    ROOT 47% SI i bought 19,100 shares, could this...
194    Here is a Market Recap for today Tuesday, Octo...
456    Translation: "Be patient till Wall Street find...
                             ...                        
77     Malls Are Not Dead, Based on Simon Property's ...
447                                      Manipulation???
221                                      Thank you $PHUN
518    And Another Website Gone - Benzinga Media, Val...
686                 BRK/A Gains to whoever sold at $661k
Name: post, Length: 120, dtype: object

In [61]:
y_test

86     1
828    1
146    1
194    1
456    1
      ..
77     1
447   -1
221    1
518    1
686    1
Name: label, Length: 120, dtype: int64

In [35]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred) * 100))
print("\nCOnfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 77.50%

F1 Score: 85.56

COnfusion Matrix:
 [[13 15]
 [12 80]]


### Now testing for Bernoulli Naive Bayes

In [38]:
bnb = BernoulliNB(binarize = 0.0)
bnb.fit(X_train_res, y_train_res)
bnb.score(X_train_res, y_train_res)

0.8004926108374384

In [None]:
### SO this doesn't give us as mush accuracy as Multinomial NB, so we're gonna go with MultinomialNb only