In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from string import digits

## Reading in the train and validation set

In [2]:
# Load the training and validation splits from CSV files under data/.
# Later cells read the 'text' and 'class' columns from both frames.
train = pd.read_csv('data/train.csv')
valid = pd.read_csv('data/valid.csv')

## Removing digits from the text

In [3]:
def remove_digits(s: str) -> str:
    """Return a copy of ``s`` with every ASCII digit (0-9) deleted.

    All other characters, including whitespace and non-ASCII numerals,
    are left untouched and keep their original order.
    """
    # A translate() table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, digits))
    return s.translate(deletion_table)

In [4]:
# Strip digits from every document in both splits, overwriting the 'text'
# column in place. Re-running this cell is harmless: text that has already
# been stripped contains no digits, so a second pass leaves it unchanged.
# NOTE(review): assumes every 'text' value is a str (no missing values) — confirm.
train['text'] = train['text'].apply(remove_digits)
valid['text'] = valid['text'].apply(remove_digits)

## Bag of words representation (binary)

In [5]:
# Binary bag-of-words over lowercased unigrams: each feature records only
# whether a token occurs in a document, not how many times.
vectorizer = CountVectorizer(
    binary=True,
    lowercase=True,
    min_df=2,
    ngram_range=(1, 1),
    stop_words=None,
)

# Learn the vocabulary on the training split only ...
train_labels = train['class']
train_features = vectorizer.fit_transform(train['text'])

# ... and reuse that fitted vocabulary to encode the validation split.
valid_labels = valid['class']
valid_features = vectorizer.transform(valid['text'])

## Bernoulli Naive Bayes model

In [6]:
# Fit Bernoulli Naive Bayes on the binary presence/absence features.
model = BernoulliNB(fit_prior=True)
model.fit(train_features, train_labels)

# Score the held-out validation split: per-class report, then overall accuracy.
valid_preds = model.predict(valid_features)
valid_accuracy = accuracy_score(valid_labels, valid_preds)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{valid_accuracy}')

              precision    recall  f1-score   support

    negative       0.73      0.75      0.74       403
    positive       0.86      0.84      0.85       739

   micro avg       0.81      0.81      0.81      1142
   macro avg       0.79      0.80      0.80      1142
weighted avg       0.81      0.81      0.81      1142

Accuracy:0.8126094570928196


# Can we do better?

## Bag of words representation (count)

In [7]:
# Count-based bag-of-words over lowercased unigrams: no `binary` flag is
# passed here, so features hold per-document token counts.
vectorizer = CountVectorizer(
    lowercase=True,
    min_df=2,
    ngram_range=(1, 1),
    stop_words=None,
)

# Learn the vocabulary on the training split only ...
train_labels = train['class']
train_features = vectorizer.fit_transform(train['text'])

# ... and reuse that fitted vocabulary to encode the validation split.
valid_labels = valid['class']
valid_features = vectorizer.transform(valid['text'])

## Multinomial Naive Bayes model

In [8]:
# Fit Multinomial Naive Bayes on the token-count features.
model = MultinomialNB(fit_prior=True)
model.fit(train_features, train_labels)

# Score the held-out validation split: per-class report, then overall accuracy.
valid_preds = model.predict(valid_features)
valid_accuracy = accuracy_score(valid_labels, valid_preds)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy: {valid_accuracy}')

              precision    recall  f1-score   support

    negative       0.76      0.74      0.75       403
    positive       0.86      0.87      0.87       739

   micro avg       0.83      0.83      0.83      1142
   macro avg       0.81      0.81      0.81      1142
weighted avg       0.82      0.83      0.83      1142

Accuracy: 0.8257443082311734


## Obtaining performance on our test set

In [9]:
# Load the held-out test split; later cells read its 'text' and 'class' columns.
test = pd.read_csv('data/test.csv')

In [10]:
# Apply the same digit-stripping used on train/valid so the test text is
# preprocessed identically before vectorization.
test['text'] = test['text'].apply(remove_digits)

In [11]:
# Encode the test split with the already-fitted vectorizer (transform only, no
# refit). When cells run in order, `vectorizer` here is the count-based one
# fitted on the training split.
test_features = vectorizer.transform(test['text'])
test_labels = test['class']

In [12]:
# Evaluate the current model on the test split: per-class report, then accuracy.
test_preds = model.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_preds)
print(classification_report(test_labels, test_preds))
print(f'Accuracy: {test_accuracy}')

              precision    recall  f1-score   support

    negative       0.72      0.69      0.71       394
    positive       0.84      0.86      0.85       747

   micro avg       0.80      0.80      0.80      1141
   macro avg       0.78      0.77      0.78      1141
weighted avg       0.80      0.80      0.80      1141

Accuracy: 0.8019281332164768


## Training on train + valid

In [13]:
# Stack the train and validation rows into one frame for the final fit
# (row-wise stacking is pd.concat's default axis).
data = pd.concat([train, valid])

# Count-based bag-of-words with the same settings as the earlier count
# vectorizer, but with the vocabulary re-learned on the merged data.
vectorizer = CountVectorizer(
    lowercase=True,
    min_df=2,
    ngram_range=(1, 1),
    stop_words=None,
)

labels = data['class']
features = vectorizer.fit_transform(data['text'])

# Re-encode the test split against the newly fitted vocabulary.
test_labels = test['class']
test_features = vectorizer.transform(test['text'])

In [14]:
# Refit Multinomial Naive Bayes on the merged train + validation features.
model = MultinomialNB(fit_prior=True)
model.fit(features, labels)

# Report test-set performance of the refitted model.
test_preds = model.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_preds)
print(classification_report(test_labels, test_preds))
print(f'Accuracy: {test_accuracy}')

              precision    recall  f1-score   support

    negative       0.73      0.69      0.71       394
    positive       0.84      0.86      0.85       747

   micro avg       0.80      0.80      0.80      1141
   macro avg       0.79      0.78      0.78      1141
weighted avg       0.80      0.80      0.80      1141

Accuracy: 0.8045574057843996


## Saving our trained model

In [15]:
from sklearn.pipeline import Pipeline

# `sklearn.externals.joblib` was a vendored copy of joblib that scikit-learn
# deprecated in 0.21 and removed in 0.23, so importing it fails on current
# releases. Prefer the standalone `joblib` package and fall back to the
# vendored copy only on old installations where the standalone one is absent.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [16]:
# Bundle the fitted vectorizer and the fitted classifier into one Pipeline so
# raw text can be passed straight to a single object.
# Note: this rebinds `model` from the bare classifier to the Pipeline that
# wraps it; the right-hand side still refers to the classifier.
model = Pipeline([('feature_transformer', vectorizer),
                  ('classifier', model)])

In [17]:
# Persist the pipeline to disk; the cell output is the list of written
# file paths returned by dump().
joblib.dump(model, 'data/model.pkl')

['data/model.pkl']

In [18]:
# Sanity check: reload the file just written and display the restored pipeline.
# joblib files are pickle-based, so only load files from a trusted source.
joblib.load('data/model.pkl')

Pipeline(memory=None,
     steps=[('feature_transformer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])