In [1]:
import pandas as pd
import glob
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

Function to read the full text of a single input file:

In [2]:
def reader(path):
    """Return the entire contents of the text file at ``path``.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read. The file is decoded as UTF-8.

    Returns
    -------
    str
        The decoded text of the whole file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # e.g. on a decoding error part-way through the file.
    with open(path, "r", encoding="utf8") as handle:
        return handle.read()

Function to return paths of all the training and testing files:

In [3]:
def all_paths():
    """Collect the ``.txt`` file paths of the train/test, pos/neg folders.

    The folders are looked up relative to the current working directory:
    ``train/pos``, ``train/neg``, ``test/pos`` and ``test/neg``.

    Returns
    -------
    list of list of str
        ``[train_pos, train_neg, test_pos, test_neg]``; each entry is the
        sorted list of matching file paths (empty if nothing matches).
    """
    # Escape the working directory so characters such as "[" or "*" in its
    # name are matched literally instead of being read as glob wildcards.
    base = glob.escape(os.getcwd())
    patterns = [
        base + '/train/pos/*.txt',
        base + '/train/neg/*.txt',
        base + '/test/pos/*.txt',
        base + '/test/neg/*.txt',
    ]
    # glob returns matches in arbitrary order; sorting keeps the row order
    # of everything built from these lists stable between runs.
    return [sorted(glob.glob(pattern)) for pattern in patterns]

Function that applies `reader` to every file path and returns the texts as a pandas Series:

In [4]:
def create_df(paths):
    """Load the text of every file listed in ``paths``.

    Each path is passed to ``reader``. Despite the function's name, the
    result is a pandas Series (one entry per path, in the order given),
    not a DataFrame.
    """
    path_series = pd.Series(paths)
    texts = path_series.apply(reader)
    return texts

Finally, create the training and testing DataFrames:

In [5]:
def merge_pos_neg(pos, neg, ignore_index=False):
    """Stack negative and positive reviews into one labelled DataFrame.

    Parameters
    ----------
    pos, neg : pandas.Series or DataFrame-like
        Review texts of the positive and negative class. Each must convert
        to a single-column DataFrame.
    ignore_index : bool, default False
        With the default, the index labels of ``neg`` and ``pos`` are kept
        as they are, so labels repeat when both inputs are numbered from 0.
        Pass True to renumber the rows 0..n-1.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` and ``label``; all ``neg`` rows come first,
        followed by all ``pos`` rows.

    Raises
    ------
    ValueError
        If the stacked frame does not have exactly two columns.
    """
    # assign() returns a new frame, so a DataFrame passed in by the caller
    # never gains a "label" column as a side effect.
    pos_frame = pd.DataFrame(pos).assign(label="pos")
    neg_frame = pd.DataFrame(neg).assign(label="neg")
    merged = pd.concat([neg_frame, pos_frame], ignore_index=ignore_index)
    merged.columns = ["review", "label"]
    return merged

In [6]:
train_pos, train_neg, test_pos, test_neg = all_paths()

In [7]:
train_pos = create_df(train_pos)
train_neg = create_df(train_neg)

In [8]:
train = merge_pos_neg(train_pos, train_neg)

In [9]:
test_pos = create_df(test_pos)
test_neg = create_df(test_neg)

In [10]:
test = merge_pos_neg(test_pos, test_neg)

In [11]:
# Peek at the first five training rows (head() shows five rows by default).
train.head()

Unnamed: 0,review,label
0,Story of a man who has unnatural feelings for ...,neg
1,Airport '77 starts as a brand new luxury 747 p...,neg
2,This film lacked something I couldn't put my f...,neg
3,"Sorry everyone,,, I know this is supposed to b...",neg
4,When I was little my parents took me along to ...,neg


In [12]:
# Peek at the last five test rows (tail() shows five rows by default).
test.tail()

Unnamed: 0,review,label
12495,I was extraordinarily impressed by this film. ...,pos
12496,"Although I'm not a golf fan, I attended a snea...",pos
12497,"From the start of ""The Edge Of Love"", the view...",pos
12498,"This movie, with all its complexity and subtle...",pos
12499,I've seen this story before but my kids haven'...,pos


In [13]:
# Number of missing values in each training column (isna is the same check as isnull).
train.isna().sum()

review    0
label     0
dtype: int64

In [14]:
# Number of missing values in each test column (isna is the same check as isnull).
test.isna().sum()

review    0
label     0
dtype: int64

In [15]:
# Class balance check: number of training reviews per label.
train["label"].value_counts()

neg    12500
pos    12500
Name: label, dtype: int64

In [16]:
# Class balance check: number of test reviews per label.
test["label"].value_counts()

neg    12500
pos    12500
Name: label, dtype: int64

Fitting a Naïve Bayes classifier:

In [17]:
# Two-step model: TF-IDF features feed a multinomial Naive Bayes classifier.
# Both steps are constructed with no arguments.
nb_classifier = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

In [18]:
# The review text is the model input; the "pos"/"neg" label is the target.
X_train = train["review"]
y_train = train["label"]
x_test = test["review"]
y_test = test["label"]

In [19]:
# Fit both pipeline steps (TF-IDF, then Naive Bayes) on the training reviews.
nb_classifier.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [20]:
# Predict a label for every test review with the fitted pipeline.
y_pred = nb_classifier.predict(x_test)

In [21]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         neg       0.79      0.89      0.84     12500
         pos       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

