In [1]:
import pandas as pd 
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Exclusion set used by the stopword-removing tokenizers below:
# NLTK's English stopwords plus every character of string.punctuation.
# NOTE: this rebinds the imported name `punctuation` from str to list.
punctuation = list(punctuation)
stop = {*stopwords.words('english'), *punctuation}

In [3]:
def count_acc(y_test, y_pred):
    """Return the fraction of positions where ``y_test`` and ``y_pred`` agree.

    Parameters
    ----------
    y_test : iterable
        Ground-truth labels.
    y_pred : iterable
        Predicted labels, aligned position-by-position with ``y_test``.

    Returns
    -------
    float
        ``matches / pairs_compared``, or ``0.0`` when there is nothing to
        compare (previously this raised ``ZeroDivisionError``).

    Notes
    -----
    Pairs are formed with ``zip``, so if the inputs differ in length the
    surplus items of the longer one are ignored.
    """
    total = 0
    matches = 0
    for truth, guess in zip(y_test, y_pred):
        total += 1
        if truth == guess:
            matches += 1
    # Guard the division: empty input has no defined accuracy, report 0.0.
    if total == 0:
        return 0.0
    return matches / total

### 1. With stopwords and original word forms - 86.56% acc

In [4]:
def process_remove_punctuation(text):
    """Lower-case ``text``, tokenize it and keep only alphabetic tokens.

    Any token for which ``str.isalpha`` is false (punctuation, numbers,
    mixed tokens) is dropped; the remaining tokens are joined with single
    spaces and returned as one string.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(tok for tok in tokens if tok.isalpha())

In [5]:
# Load the IMDB reviews CSV; the cells below read its 'review' and 'sentiment' columns.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [6]:
# Clean the review text: lower-case, tokenize, keep alphabetic tokens only.
df['review'] = df['review'].apply(process_remove_punctuation)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: an in-place method on a column
# reached through the frame is chained assignment, which pandas warns about
# and which no longer updates `df` under copy-on-write.
# `.astype(int)` fails loudly if any label is neither "positive" nor
# "negative" (for example when this cell is re-run on already-encoded labels).
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [7]:
# Features are the cleaned review strings, targets the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed; each resulting piece gets a fresh
# 0..n-1 index so positions line up between features and labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [8]:
# Fit the TF-IDF vocabulary on the training reviews only; the fitted
# vectorizer is reused (transform, not fit) on the test split further down.
vectorizer = TfidfVectorizer()
x_train_tfidf= vectorizer.fit_transform(x_train)
# Printed shape: one row per training review, one column per vocabulary term.
print(x_train_tfidf.shape)

(40000, 88261)


In [9]:
# Train a multinomial Naive Bayes classifier (default settings) on the TF-IDF features.
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)

In [10]:
# Project the test reviews onto the vocabulary learned from the training
# split; only transform is called here, so nothing is fitted on test data.
x_test_tfidf = vectorizer.transform(x_test)
x_test_tfidf.shape

(10000, 88261)

In [11]:
# Predicted sentiment label for every test review.
y_pred = model.predict(x_test_tfidf)

In [12]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      5000
           1       0.88      0.84      0.86      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [13]:
# Cross-check the accuracy with the hand-written count_acc helper.
count_acc(y_test, y_pred)

0.8656

### 2. With stopwords and stemming - 85.64% acc

In [14]:
def remove_punctuation_and_stem(text):
    """Lower-case and tokenize ``text``, keep alphabetic tokens, Porter-stem them.

    Stopwords are NOT removed here. The stemmed tokens are joined with
    single spaces and returned as one string.
    """
    porter = PorterStemmer()
    alphabetic = (tok for tok in word_tokenize(text.lower()) if tok.isalpha())
    return " ".join(porter.stem(tok) for tok in alphabetic)

In [15]:
# Reload the raw CSV so this experiment starts from unprocessed text
# (the previous experiment overwrote df['review'] with its cleaned version).
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [17]:
# Clean the review text: lower-case, tokenize, keep alphabetic tokens, Porter-stem.
df['review'] = df['review'].apply(remove_punctuation_and_stem)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: an in-place method on a column
# reached through the frame is chained assignment, which pandas warns about
# and which no longer updates `df` under copy-on-write.
# `.astype(int)` fails loudly if any label is neither "positive" nor
# "negative" (for example when this cell is re-run on already-encoded labels).
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [18]:
# Features are the stemmed review strings, targets the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed; each resulting piece gets a fresh
# 0..n-1 index so positions line up between features and labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [19]:
# Fresh TF-IDF vectorizer for this experiment, fitted on the training
# reviews only and reused (transform) for the test split below.
vectorizer = TfidfVectorizer()
x_train_tfidf= vectorizer.fit_transform(x_train)
# Printed shape: one row per training review, one column per vocabulary term.
print(x_train_tfidf.shape)

(40000, 60951)


In [20]:
# New multinomial Naive Bayes classifier (default settings) for this experiment.
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)

In [21]:
# Vectorize the test reviews with the vocabulary fitted on the training split
# (transform only, nothing is fitted on test data).
x_test_tfidf = vectorizer.transform(x_test)
x_test_tfidf.shape

(10000, 60951)

In [22]:
# Predicted sentiment label for every test review.
y_pred = model.predict(x_test_tfidf)

In [23]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      5000
           1       0.87      0.84      0.85      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [24]:
# Cross-check the accuracy with the hand-written count_acc helper.
count_acc(y_test, y_pred)

0.8564

### 3. Without stopwords and original word forms - 86.78% acc

In [45]:
def remove_punctuation_and_stopwords(text):
    """Lower-case and tokenize ``text``, dropping stopwords and non-alphabetic tokens.

    A token is kept only if it is not in the module-level ``stop`` set and
    ``str.isalpha`` is true for it. The kept tokens are joined with single
    spaces and returned as one string; no stemming is applied.
    """
    kept = [
        tok
        for tok in word_tokenize(text.lower())
        if tok not in stop and tok.isalpha()
    ]
    return " ".join(kept)

In [46]:
# Reload the raw CSV so this experiment starts from unprocessed text
# (earlier experiments overwrite df['review'] with their cleaned version).
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [47]:
# Clean the review text: lower-case, tokenize, drop stopwords and non-alphabetic tokens.
df['review'] = df['review'].apply(remove_punctuation_and_stopwords)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: an in-place method on a column
# reached through the frame is chained assignment, which pandas warns about
# and which no longer updates `df` under copy-on-write.
# `.astype(int)` fails loudly if any label is neither "positive" nor
# "negative" (for example when this cell is re-run on already-encoded labels).
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [48]:
# Features are the stopword-free review strings, targets the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed; each resulting piece gets a fresh
# 0..n-1 index so positions line up between features and labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [49]:
# Fresh TF-IDF vectorizer for this experiment, fitted on the training
# reviews only. This `vectorizer` object is also pickled further down.
vectorizer = TfidfVectorizer()
x_train_tfidf= vectorizer.fit_transform(x_train)
# Printed shape: one row per training review, one column per vocabulary term.
print(x_train_tfidf.shape)

(40000, 88119)


In [50]:
# New multinomial Naive Bayes classifier (default settings) for this
# experiment. This `model` object is also pickled further down.
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)

In [51]:
# Vectorize the test reviews with the vocabulary fitted on the training split
# (transform only, nothing is fitted on test data).
x_test_tfidf = vectorizer.transform(x_test)
x_test_tfidf.shape

(10000, 88119)

In [52]:
# Predicted sentiment label for every test review.
y_pred = model.predict(x_test_tfidf)

In [53]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      5000
           1       0.88      0.85      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [54]:
# Cross-check the accuracy with the hand-written count_acc helper.
count_acc(y_test, y_pred)

0.8678

In [56]:
import pickle

In [57]:
# Persist the classifier currently bound to `model`.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the bare `open(...)` passed to pickle.dump was never closed.
# NOTE: only unpickle this file from a trusted location, because pickle.load
# can execute arbitrary code.
filename = 'naive_bayes_tfidf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [58]:
# Persist the fitted TF-IDF vectorizer currently bound to `vectorizer`; it is
# needed together with the saved classifier to vectorize new text the same way.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the bare `open(...)` passed to pickle.dump was never closed.
filename = 'naive_bayes_tfidf_vectorizer.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

### 4. Without stopwords and with stemming - 86.07% acc

In [35]:
def remove_stopwords_and_stem(text):
    """Lower-case and tokenize ``text``, drop stopwords/non-alphabetic tokens, then stem.

    A token is kept only if it is not in the module-level ``stop`` set and
    ``str.isalpha`` is true for it; each kept token is Porter-stemmed. The
    stems are joined with single spaces and returned as one string.
    """
    porter = PorterStemmer()
    kept = (
        tok
        for tok in word_tokenize(text.lower())
        if tok not in stop and tok.isalpha()
    )
    return " ".join(porter.stem(tok) for tok in kept)

In [36]:
# Reload the raw CSV so this experiment starts from unprocessed text
# (the other experiments overwrite df['review'] with their cleaned version).
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [37]:
# Clean the review text: lower-case, tokenize, drop stopwords and
# non-alphabetic tokens, then Porter-stem what remains.
df['review'] = df['review'].apply(remove_stopwords_and_stem)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: an in-place method on a column
# reached through the frame is chained assignment, which pandas warns about
# and which no longer updates `df` under copy-on-write.
# `.astype(int)` fails loudly if any label is neither "positive" nor
# "negative" (for example when this cell is re-run on already-encoded labels).
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0}).astype(int)

In [38]:
# Features are the stopword-free, stemmed review strings; targets the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed; each resulting piece gets a fresh
# 0..n-1 index so positions line up between features and labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [39]:
# Fresh TF-IDF vectorizer for this experiment, fitted on the training
# reviews only and reused (transform) for the test split below.
vectorizer = TfidfVectorizer()
x_train_tfidf= vectorizer.fit_transform(x_train)
# Printed shape: one row per training review, one column per vocabulary term.
print(x_train_tfidf.shape)

(40000, 60891)


In [40]:
# New multinomial Naive Bayes classifier (default settings) for this experiment.
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)

In [41]:
# Vectorize the test reviews with the vocabulary fitted on the training split
# (transform only, nothing is fitted on test data).
x_test_tfidf = vectorizer.transform(x_test)
x_test_tfidf.shape

(10000, 60891)

In [42]:
# Predicted sentiment label for every test review.
y_pred = model.predict(x_test_tfidf)

In [43]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      5000
           1       0.87      0.85      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [44]:
# Cross-check the accuracy with the hand-written count_acc helper.
count_acc(y_test, y_pred)

0.8607