# Baseline Naive Bayes Classifier

This baseline is a really basic pipeline:
- Convert sentences to bag of word, vector with columns representing occurences of words
- Naive bayes classifier

### Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

In [2]:
np.random.seed(0) # for reproducibility with sklearn

In [3]:
sent_col = "opinion"
brand_col = "brand"
text_col = "body"

### Get the data

In [4]:
df = pd.read_csv("tweet_sent_predictor/data/train_proper.csv")

In [5]:
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
4168,neg,apl,fuck this see you hoes @ work @WeakTwip @Munnn...
4169,neg,msf,"#Microsoft, #Adobe lose $13.5bn to piracy: Rep..."
4170,neu,twt,"I tried to explain why you would do """"The #Twi..."
4171,neg,apl,Installed io5 - fine on ipad but wiped wife's ...


### Change sentences into count vectors

In [6]:
count_vectorizer = CountVectorizer()

use example

In [7]:
x = count_vectorizer.fit_transform(df[text_col])

In [8]:
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Baseline classifier

In [9]:
clf = MultinomialNB()

### Pipeline

In [10]:
naive_pipeline = Pipeline([
    ("Count vectorizer", count_vectorizer),
    ("Naive bayesian classifier", clf)
])

### Train

In [11]:
X = df[text_col]
y = df[sent_col].values

In [12]:
naive_pipeline.fit(X, y)

Pipeline(steps=[('Count vectorizer', CountVectorizer()),
                ('Naive bayesian classifier', MultinomialNB())])

### Score on train

In [13]:
naive_pipeline.score(X, y)

0.8734723220704529

In [14]:
y_pred = naive_pipeline.predict(X)
y_true = y

In [15]:
labels=["pos", "neg", "neu", "irr"]
index=labels
conf = confusion_matrix(y_pred, y_true, labels=labels, normalize='true')
confusion_df = pd.DataFrame(conf, index=labels, columns=labels)

In [16]:
confusion_df

Unnamed: 0,pos,neg,neu,irr
pos,0.973856,0.006536,0.006536,0.013072
neg,0.029333,0.922667,0.034667,0.013333
neu,0.111831,0.059154,0.7962,0.032815
irr,0.00602,0.001505,0.009782,0.982694


### Cross validation score

In [17]:
scores = cross_val_score(naive_pipeline, X, y, cv=5)

In [18]:
scores

array([0.76526946, 0.73652695, 0.73652695, 0.74220624, 0.76258993])

In [19]:
scores.mean(), scores.var()

(0.7486239032725917, 0.00016119673082793566)