In [1]:
import numpy as np
import pandas as pd 
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

### 1. Without stopwords and with stemming - 87.92% acc

In [2]:
# Raw IMDB reviews; the 'review' and 'sentiment' columns are used below.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

# Tokens to discard during cleaning: NLTK's English stopwords plus every
# single punctuation character from string.punctuation.
punctuation = list(punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [3]:
def remove_stopwords_and_stem(text):
    """Clean one review: drop stopwords/non-alphabetic tokens and stem the rest.

    The text is lowercased and tokenized with ``word_tokenize``. Tokens found
    in the module-level ``stop`` set or not purely alphabetic are discarded;
    the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    kept_tokens = (
        token
        for token in word_tokenize(text.lower())
        if token not in stop and token.isalpha()
    )
    return " ".join(porter.stem(token) for token in kept_tokens)

In [5]:
# Clean every review in place: lowercase, tokenize, drop stopwords and
# non-alphabetic tokens, then stem.
df['review'] = df['review'].apply(remove_stopwords_and_stem)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which no longer modifies `df` once
# Copy-on-Write is enabled.
# infer_objects() keeps an integer dtype on pandas versions where replace()
# no longer downcasts object columns.
df['sentiment'] = (
    df['sentiment']
    .replace({"positive": 1, "negative": 0})
    .infer_objects()
)


In [6]:
# Preview the first 10 rows to sanity-check the cleaned reviews and encoded labels.
df.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,1
1,wonder littl product br br film techniqu fashi...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1
5,probabl favorit movi stori selfless sacrific d...,1
6,sure would like see resurrect date seahunt ser...,1
7,show amaz fresh innov idea first air first yea...,0
8,encourag posit comment film look forward watch...,0
9,like origin gut wrench laughter like movi youn...,1


In [7]:
# Features are the cleaned review strings, targets the 0/1 sentiment labels.
X = df['review']
Y = df['sentiment']

# 80/20 split, stratified on the label, with a fixed seed. Each resulting
# Series gets a fresh 0..n-1 index.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [8]:
# Bag-of-words features: the vocabulary is learned from the training reviews only.
vectorized = CountVectorizer()
x_train_cv= vectorized.fit_transform(x_train)
# Shape of the document-term matrix (training reviews x vocabulary terms).
print(x_train_cv.shape)

(40000, 60891)


In [9]:
# LogisticRegression is already imported in the notebook's first cell, so the
# duplicate mid-notebook import that used to sit here was dropped.
# max_iter=2000 sets the solver's iteration cap for fitting on the word counts.
model = LogisticRegression( max_iter=2000)
model.fit(x_train_cv, y_train)

In [10]:
def count_acc(y_test, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_test : iterable
        Ground-truth labels.
    y_pred : iterable
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    float
        ``matches / total`` over the paired labels.

    Raises
    ------
    ValueError
        If the two inputs have different lengths. A plain ``zip`` would
        silently ignore the surplus items and report a misleading accuracy.
    ZeroDivisionError
        If both inputs are empty. The exception type is the one the plain
        division always raised; it now carries an explicit message.
    """
    from itertools import zip_longest

    # Unique sentinel: marks the position where one input ran out first.
    missing = object()
    count = 0
    accurate = 0
    for y_t, y_p in zip_longest(y_test, y_pred, fillvalue=missing):
        if y_t is missing or y_p is missing:
            raise ValueError("y_test and y_pred must have the same length")
        if y_t == y_p:
            accurate += 1
        count += 1
    if count == 0:
        raise ZeroDivisionError("cannot compute accuracy of zero samples")
    return accurate / count

In [11]:
# Reuse the vectorizer fitted on the training set (transform only, no refit).
x_test_cv= vectorized.transform(x_test)
y_pred = model.predict(x_test_cv)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.88      5000
           1       0.87      0.89      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [12]:
# Overall test accuracy via the hand-written count_acc helper.
count_acc(y_test, y_pred)

0.8792

### 2. With stopwords and words in original form - 88.8% acc

In [13]:
def process_remove_punctuation(text):
    """Lowercase and tokenize ``text``, keeping only purely alphabetic tokens.

    Stopwords are kept and no stemming is applied. Punctuation, numbers and
    other tokens with non-letter characters are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(filter(str.isalpha, tokens))

In [14]:
# Reload the raw CSV for experiment 2: df['review'] was overwritten by experiment 1's cleaning.
df2=pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [15]:
# Clean every review in place: lowercase, tokenize, keep alphabetic tokens only
# (stopwords are kept, no stemming).
df2['review'] = df2['review'].apply(process_remove_punctuation)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df2.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which no longer modifies `df2` once
# Copy-on-Write is enabled.
# infer_objects() keeps an integer dtype on pandas versions where replace()
# no longer downcasts object columns.
df2['sentiment'] = (
    df2['sentiment']
    .replace({"positive": 1, "negative": 0})
    .infer_objects()
)

In [16]:
# Preview the first 10 rows: stopwords are kept and words are unstemmed in this variant.
df2.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there a family where a little boy ja...,0
4,petter mattei love in the time of money is a v...,1
5,probably my favorite movie a story of selfless...,1
6,i sure would like to see a resurrection of a u...,1
7,this show was an amazing fresh innovative idea...,0
8,encouraged by the positive comments about this...,0
9,if you like original gut wrenching laughter yo...,1


In [17]:
# Features are the cleaned review strings, targets the 0/1 sentiment labels.
X2 = df2['review']
Y2 = df2['sentiment']

# 80/20 split, stratified on the label, with a fixed seed. Each resulting
# Series gets a fresh 0..n-1 index.
x_train2, x_test2, y_train2, y_test2 = (
    part.reset_index(drop=True)
    for part in train_test_split(X2, Y2, test_size=0.2, stratify=Y2, random_state=42)
)

In [18]:
# Bag-of-words features: the vocabulary is learned from the training reviews only.
vectorized2 = CountVectorizer()
x_train_cv2 = vectorized2.fit_transform(x_train2)
# Shape of the document-term matrix (training reviews x vocabulary terms).
print(x_train_cv2.shape)

(40000, 88261)


In [19]:
# Same classifier settings as experiment 1; max_iter=2000 sets the solver's iteration cap.
model2 = LogisticRegression( max_iter=2000)
model2.fit(x_train_cv2, y_train2)

In [21]:
# Reuse the vectorizer fitted on the training set (transform only, no refit).
x_test_cv2 = vectorized2.transform(x_test2)
y_pred2 = model2.predict(x_test_cv2)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      5000
           1       0.89      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [22]:
# Overall test accuracy via the hand-written count_acc helper.
count_acc(y_test2, y_pred2)

0.888

In [50]:
import pickle

# Persist the experiment-2 classifier and its fitted vectorizer; both are
# needed together to score new text later.
# `with` closes each file handle deterministically, so the pickles are fully
# flushed to disk (a bare open() left the handles unclosed).
filename = 'logistic_regresion_cv.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model2, model_file)

filename = 'logistic_regresion_cv_vectorizer.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorized2, vectorizer_file)

### 3. With stopwords and stemming - 88.1% acc

In [23]:
# Fresh copy of the raw CSV so experiment 3 starts from unprocessed review text.
df3 = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [24]:
def remove_punctuation_and_stem(text):
    """Lowercase and tokenize ``text``, keep alphabetic tokens, and stem them.

    Stopwords are kept. Tokens that are not purely alphabetic (punctuation,
    numbers, ...) are dropped before Porter stemming.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    alphabetic = filter(str.isalpha, word_tokenize(text.lower()))
    return " ".join(map(porter.stem, alphabetic))

In [25]:
# Clean every review in place: lowercase, tokenize, keep alphabetic tokens and
# stem them (stopwords are kept).
df3['review'] = df3['review'].apply(remove_punctuation_and_stem)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df3.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which no longer modifies `df3` once
# Copy-on-Write is enabled.
# infer_objects() keeps an integer dtype on pandas versions where replace()
# no longer downcasts object columns.
df3['sentiment'] = (
    df3['sentiment']
    .replace({"positive": 1, "negative": 0})
    .infer_objects()
)

In [30]:
# Features are the cleaned review strings, targets the 0/1 sentiment labels.
X3 = df3['review']
Y3 = df3['sentiment']

# 80/20 split, stratified on the label, with a fixed seed. Each resulting
# Series gets a fresh 0..n-1 index.
x_train3, x_test3, y_train3, y_test3 = (
    part.reset_index(drop=True)
    for part in train_test_split(X3, Y3, test_size=0.2, stratify=Y3, random_state=42)
)

In [31]:
# Bag-of-words features: the vocabulary is learned from the training reviews only.
vectorized3 = CountVectorizer()
x_train_cv3 = vectorized3.fit_transform(x_train3)
# Shape of the document-term matrix (training reviews x vocabulary terms).
print(x_train_cv3.shape)

(40000, 60951)


In [32]:
# Same classifier settings as the other experiments; max_iter=2000 sets the solver's iteration cap.
model3 = LogisticRegression( max_iter=2000)
model3.fit(x_train_cv3, y_train3)

In [34]:
# Reuse the vectorizer fitted on the training set (transform only, no refit).
x_test_cv3 = vectorized3.transform(x_test3)
y_pred3 = model3.predict(x_test_cv3)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test3, y_pred3))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      5000
           1       0.88      0.88      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [35]:
# Overall test accuracy via the hand-written count_acc helper.
count_acc(y_test3, y_pred3)

0.881

### 4. Without stopwords and words in original form - 88.62% acc


In [38]:
# Fresh copy of the raw CSV so experiment 4 starts from unprocessed review text.
df4 = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [41]:
def remove_punctuation_and_stopwords(text):
    """Lowercase and tokenize ``text``, dropping stopwords and non-alphabetic tokens.

    Words are kept in their original (unstemmed) form. Tokens found in the
    module-level ``stop`` set or not purely alphabetic are discarded.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop and token.isalpha()
    ]
    return " ".join(kept_tokens)

In [42]:
# Clean every review in place: lowercase, tokenize, drop stopwords and
# non-alphabetic tokens (no stemming).
df4['review'] = df4['review'].apply(remove_punctuation_and_stopwords)

# Encode the labels as integers (positive -> 1, negative -> 0).
# The result is assigned back to the column instead of calling
# `df4.sentiment.replace(..., inplace=True)`: that form is a chained in-place
# update, which pandas warns about and which no longer modifies `df4` once
# Copy-on-Write is enabled.
# infer_objects() keeps an integer dtype on pandas versions where replace()
# no longer downcasts object columns.
df4['sentiment'] = (
    df4['sentiment']
    .replace({"positive": 1, "negative": 0})
    .infer_objects()
)

In [43]:
# Features are the cleaned review strings, targets the 0/1 sentiment labels.
X4 = df4['review']
Y4 = df4['sentiment']

# 80/20 split, stratified on the label, with a fixed seed. Each resulting
# Series gets a fresh 0..n-1 index.
x_train4, x_test4, y_train4, y_test4 = (
    part.reset_index(drop=True)
    for part in train_test_split(X4, Y4, test_size=0.2, stratify=Y4, random_state=42)
)

In [44]:
# Bag-of-words features: the vocabulary is learned from the training reviews only.
vectorized4 = CountVectorizer()
x_train_cv4 = vectorized4.fit_transform(x_train4)
# Shape of the document-term matrix (training reviews x vocabulary terms).
print(x_train_cv4.shape)

(40000, 88119)


In [45]:
# Same classifier settings as the other experiments; max_iter=2000 sets the solver's iteration cap.
model4 = LogisticRegression( max_iter=2000)
model4.fit(x_train_cv4, y_train4)

In [48]:
# Reuse the vectorizer fitted on the training set (transform only, no refit).
x_test_cv4 = vectorized4.transform(x_test4)
y_pred4 = model4.predict(x_test_cv4)

# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test4, y_pred4))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89      5000
           1       0.88      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [49]:
# Overall test accuracy via the hand-written count_acc helper.
count_acc(y_test4, y_pred4)

0.8862