In [1]:
import pandas as pd 
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Filter set used by the stopword-removing preprocessors: NLTK's English
# stopwords plus every character from string.punctuation.
stop = set(stopwords.words('english'))
# Rebinds the imported `punctuation` string as a list of single characters.
punctuation = [symbol for symbol in punctuation]
stop = stop.union(punctuation)

In [3]:
def count_acc(y_test, y_pred):
    """Return the fraction of positions where the prediction equals the label.

    The two iterables are paired with ``zip``, so the comparison stops at the
    end of the shorter one.

    Raises:
        ZeroDivisionError: if no (label, prediction) pairs are produced.
    """
    matches = [y_t == y_p for y_t, y_p in zip(y_test, y_pred)]
    return sum(1 for hit in matches if hit) / len(matches)

### 1. With stopwords and original word forms - 85.04% acc

In [4]:
def process_remove_punctuation(text):
    """Lower-case ``text``, tokenize it, and keep only alphabetic tokens.

    Any token for which ``str.isalpha()`` is false (digits, punctuation,
    tokens containing an apostrophe, ...) is dropped. Stopwords are kept.

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(token for token in tokens if token.isalpha())

In [5]:
# Load the IMDB reviews CSV (path is relative to the notebook's working
# directory); later cells read its `review` and `sentiment` columns.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [6]:
# Normalize every review: lower-case, alphabetic tokens only, stopwords kept.
df['review'] = df['review'].apply(process_remove_punctuation)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which, under Copy-on-Write, modifies a
# temporary Series and leaves `df` unchanged.
# `map` yields NaN for any other label, so `astype(int)` makes an unexpected
# label (or a second run on already-encoded data) fail loudly.
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [7]:
X = df['review']
Y = df['sentiment']

# Stratified 80/20 split with a fixed seed; each of the four resulting Series
# gets a fresh 0..n-1 index (the old index is discarded).
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [8]:
# Fit the vocabulary on the training reviews only and encode them as a
# document-term count matrix; the printed shape is (documents, vocabulary size).
vectorizer = CountVectorizer()
x_train_cv= vectorizer.fit_transform(x_train)
print(x_train_cv.shape)

(40000, 88261)


In [12]:
# Train a multinomial Naive Bayes classifier (default hyper-parameters) on the
# training count matrix.
model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [13]:
# Encode the test reviews with the vocabulary fitted on the training set
# (transform only, no re-fit), so the column count matches x_train_cv.
x_test_cv = vectorizer.transform(x_test)
x_test_cv.shape

(10000, 88261)

In [14]:
# Predicted labels for the held-out test reviews.
y_pred = model.predict(x_test_cv)

In [15]:
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      5000
           1       0.87      0.82      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [16]:
# Overall test accuracy as an unrounded fraction (the report above rounds to two decimals).
count_acc(y_test, y_pred)

0.8504

### 2. With stopwords and stemming - 84.33% acc

In [18]:
def remove_punctuation_and_stem(text):
    """Lower-case and tokenize ``text``, keep alphabetic tokens, and stem them.

    Tokens for which ``str.isalpha()`` is false are dropped; every remaining
    token is passed through a ``PorterStemmer``. Stopwords are kept.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        if token.isalpha()
    )

In [19]:
# Reload the raw CSV so this experiment starts from unprocessed reviews
# (the previous section overwrote df's columns).
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [20]:
# Normalize every review: lower-case, alphabetic tokens only, Porter-stemmed,
# stopwords kept.
df['review'] = df['review'].apply(remove_punctuation_and_stem)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which, under Copy-on-Write, modifies a
# temporary Series and leaves `df` unchanged.
# `map` yields NaN for any other label, so `astype(int)` makes an unexpected
# label (or a second run on already-encoded data) fail loudly.
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [21]:
X = df['review']
Y = df['sentiment']

# Stratified 80/20 split with a fixed seed; each of the four resulting Series
# gets a fresh 0..n-1 index (the old index is discarded).
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [22]:
# Fit a fresh vocabulary on the stemmed training reviews and encode them as a
# document-term count matrix; the printed shape is (documents, vocabulary size).
vectorizer = CountVectorizer()
x_train_cv= vectorizer.fit_transform(x_train)
print(x_train_cv.shape)

(40000, 60951)


In [23]:
# Train a new multinomial Naive Bayes classifier (default hyper-parameters)
# on this section's training count matrix.
model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [24]:
# Encode the test reviews with the vocabulary fitted on the training set
# (transform only, no re-fit), so the column count matches x_train_cv.
x_test_cv = vectorizer.transform(x_test)
x_test_cv.shape

(10000, 60951)

In [25]:
# Predicted labels for the held-out test reviews.
y_pred = model.predict(x_test_cv)

In [26]:
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      5000
           1       0.86      0.81      0.84      5000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [27]:
# Overall test accuracy as an unrounded fraction (the report above rounds to two decimals).
count_acc(y_test, y_pred)

0.8433

### 3. Without stopwords and original word forms - 86.11% acc

In [4]:
def remove_punctuation_and_stopwords(text):
    """Lower-case and tokenize ``text``, dropping stopwords and non-alphabetic tokens.

    A token is discarded when it is a member of the module-level ``stop`` set
    or when ``str.isalpha()`` is false for it.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip filtered tokens up front; membership is checked before isalpha.
        if token in stop or not token.isalpha():
            continue
        kept.append(token)
    return " ".join(kept)

In [5]:
# Reload the raw CSV so this experiment starts from unprocessed reviews.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [6]:
# Normalize every review: lower-case, alphabetic tokens only, stopwords removed.
df['review'] = df['review'].apply(remove_punctuation_and_stopwords)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which, under Copy-on-Write, modifies a
# temporary Series and leaves `df` unchanged.
# `map` yields NaN for any other label, so `astype(int)` makes an unexpected
# label (or a second run on already-encoded data) fail loudly.
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [7]:
X = df['review']
Y = df['sentiment']

# Stratified 80/20 split with a fixed seed; each of the four resulting Series
# gets a fresh 0..n-1 index (the old index is discarded).
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [8]:
# Fit a fresh vocabulary on the stopword-free training reviews and encode them
# as a document-term count matrix; the printed shape is (documents, vocabulary size).
vectorizer = CountVectorizer()
x_train_cv= vectorizer.fit_transform(x_train)
print(x_train_cv.shape)

(40000, 88119)


In [9]:
# Train a new multinomial Naive Bayes classifier (default hyper-parameters)
# on this section's training count matrix.
model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [10]:
# Encode the test reviews with the vocabulary fitted on the training set
# (transform only, no re-fit), so the column count matches x_train_cv.
x_test_cv = vectorizer.transform(x_test)
x_test_cv.shape

(10000, 88119)

In [11]:
# Predicted labels for the held-out test reviews.
y_pred = model.predict(x_test_cv)

In [12]:
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86      5000
           1       0.88      0.84      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [13]:
# Overall test accuracy as an unrounded fraction (the report above rounds to two decimals).
count_acc(y_test, y_pred)

0.8611

In [14]:
import pickle

In [15]:
# Persist the fitted classifier currently bound to `model`.
# The context manager closes the file handle even if pickling raises; the
# original passed a bare open(...) to pickle.dump and never closed it.
# NOTE: only load this file from a trusted location - unpickling executes code.
filename = 'naive_bayes_cv.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [16]:
# Persist the fitted CountVectorizer currently bound to `vectorizer`; the
# saved classifier needs the same vocabulary to encode new text.
# The context manager closes the file handle even if pickling raises; the
# original passed a bare open(...) to pickle.dump and never closed it.
filename = 'naive_bayes_cv_vectorizer.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

### 4. Without stopwords and with stemming - 85.70% acc

In [38]:
def remove_stopwords_and_stem(text):
    """Lower-case and tokenize ``text``, filter it, and stem what remains.

    A token is kept only when it is not in the module-level ``stop`` set and
    ``str.isalpha()`` is true for it; kept tokens are passed through a
    ``PorterStemmer``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(text.lower())
    content_tokens = (
        token for token in tokens if token not in stop and token.isalpha()
    )
    return " ".join(map(stemmer.stem, content_tokens))

In [39]:
# Reload the raw CSV so this experiment starts from unprocessed reviews.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [40]:
# Normalize every review: lower-case, alphabetic tokens only, stopwords
# removed, Porter-stemmed.
df['review'] = df['review'].apply(remove_stopwords_and_stem)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which, under Copy-on-Write, modifies a
# temporary Series and leaves `df` unchanged.
# `map` yields NaN for any other label, so `astype(int)` makes an unexpected
# label (or a second run on already-encoded data) fail loudly.
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [41]:
X = df['review']
Y = df['sentiment']

# Stratified 80/20 split with a fixed seed; each of the four resulting Series
# gets a fresh 0..n-1 index (the old index is discarded).
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [42]:
# Fit a fresh vocabulary on the stopword-free, stemmed training reviews and
# encode them as a document-term count matrix; the printed shape is
# (documents, vocabulary size).
vectorizer = CountVectorizer()
x_train_cv= vectorizer.fit_transform(x_train)
print(x_train_cv.shape)

(40000, 60891)


In [43]:
# Train a new multinomial Naive Bayes classifier (default hyper-parameters)
# on this section's training count matrix.
model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [44]:
# Encode the test reviews with the vocabulary fitted on the training set
# (transform only, no re-fit), so the column count matches x_train_cv.
x_test_cv = vectorizer.transform(x_test)
x_test_cv.shape

(10000, 60891)

In [45]:
# Predicted labels for the held-out test reviews.
y_pred = model.predict(x_test_cv)

In [46]:
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      5000
           1       0.87      0.84      0.85      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [47]:
# Overall test accuracy as an unrounded fraction (the report above rounds to two decimals).
count_acc(y_test, y_pred)

0.857