In [1]:
import pandas as pd 
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
# Tokens to discard in the "without stopwords" experiments: NLTK's English
# stop words plus every character of string.punctuation.
stop = set(stopwords.words('english'))
# The imported punctuation string is rebound to a list of its characters.
punctuation = [symbol for symbol in punctuation]
stop |= set(punctuation)

In [1]:
def count_acc(y_test, y_pred):
    """Return the fraction of predictions that equal their true label.

    Args:
        y_test: Iterable of true labels.
        y_pred: Iterable of predicted labels, aligned position-by-position
            with ``y_test``.

    Returns:
        ``matches / compared`` as a float. The labels are paired with
        ``zip``, so when the two inputs differ in length only the common
        prefix is compared. If there is nothing to compare, 0.0 is returned
        instead of raising ``ZeroDivisionError``.
    """
    # Materialise the pairs once so one-shot iterators can be both counted
    # and compared.
    pairs = list(zip(y_test, y_pred))
    if not pairs:
        return 0.0
    accurate = sum(1 for y_t, y_p in pairs if y_t == y_p)
    return accurate / len(pairs)

### 1. With stopwords and original form of words - 89.8% acc

In [56]:
def process_remove_punctuation(text):
    """Lower-case ``text`` and keep only its purely alphabetic tokens.

    The lowered text is split with ``word_tokenize``; every token for which
    ``str.isalpha()`` is false is dropped, and the surviving tokens are
    joined back together with single spaces.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(token for token in tokens if token.isalpha())

In [57]:
# Load the raw IMDB reviews CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [58]:
# Normalise every review: lower-case, alphabetic tokens only.
df['review'] = df['review'].apply(process_remove_punctuation)

# Encode the labels as integers by assigning the column explicitly.
# `df.sentiment.replace(..., inplace=True)` operates on a temporary Series:
# recent pandas versions warn about this chained in-place call, and with
# Copy-on-Write enabled it leaves `df` untouched.
# Values other than "positive"/"negative" pass through unchanged, so
# re-running the cell on already encoded labels is harmless.
sentiment_codes = {"positive": 1, "negative": 0}
df['sentiment'] = df['sentiment'].map(lambda label: sentiment_codes.get(label, label))

In [59]:
# Preview the first five rows after text cleaning and label encoding.
df.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there a family where a little boy ja...,0
4,petter mattei love in the time of money is a v...,1


In [60]:
# Features are the cleaned review strings; targets are the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed. Each resulting piece gets a fresh
# 0..n-1 index in place of the shuffled original row labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [61]:
# Fit the TF-IDF vectorizer on the training reviews only and transform them,
# then print the shape of the resulting matrix (documents, vocabulary terms).
vectorizer = TfidfVectorizer()
x_train_tfidf  = vectorizer.fit_transform(x_train)
print(x_train_tfidf.shape)

(40000, 88261)


In [62]:
# Train logistic regression on the TF-IDF features.
# max_iter=2000 is set explicitly — presumably to avoid solver convergence
# warnings on this many features; TODO confirm.
model = LogisticRegression( max_iter=2000)
model.fit(x_train_tfidf, y_train)

In [63]:
# Vectorise the held-out reviews with the already fitted vectorizer
# (transform only, no refit) and predict their labels.
x_test_tfidf = vectorizer.transform(x_test)
y_pred = model.predict(x_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [64]:
# Overall test accuracy, computed with the notebook's own count_acc helper.
count_acc(y_test, y_pred)

0.898

In [65]:
filename = 'logistic_regresion_tfidf.sav'
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails, instead of leaving an open
# handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [66]:
filename = 'logistic_regresion_tfidf_vectorizer.sav'
# Persist the fitted TF-IDF vectorizer. The context manager guarantees the
# file is flushed and closed even if pickling fails, instead of leaving an
# open handle behind.
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

### 2. With stopwords and stemming - 89.19% acc

In [16]:
def remove_punctuation_and_stem(text):
    """Lower-case ``text``, keep alphabetic tokens and Porter-stem them.

    The lowered text is split with ``word_tokenize``; tokens for which
    ``str.isalpha()`` is false are dropped, each remaining token is reduced
    with ``PorterStemmer.stem`` and the stems are joined with single spaces.
    """
    porter = PorterStemmer()
    alphabetic = (token for token in word_tokenize(text.lower()) if token.isalpha())
    return " ".join(porter.stem(token) for token in alphabetic)

In [17]:
# Reload the raw CSV so this experiment starts from unprocessed reviews and labels.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [18]:
# Normalise every review: lower-case, alphabetic tokens only, Porter-stemmed.
df['review'] = df['review'].apply(remove_punctuation_and_stem)

# Encode the labels as integers by assigning the column explicitly.
# `df.sentiment.replace(..., inplace=True)` operates on a temporary Series:
# recent pandas versions warn about this chained in-place call, and with
# Copy-on-Write enabled it leaves `df` untouched.
# Values other than "positive"/"negative" pass through unchanged, so
# re-running the cell on already encoded labels is harmless.
sentiment_codes = {"positive": 1, "negative": 0}
df['sentiment'] = df['sentiment'].map(lambda label: sentiment_codes.get(label, label))

In [19]:
# Features are the stemmed review strings; targets are the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed. Each resulting piece gets a fresh
# 0..n-1 index in place of the shuffled original row labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [20]:
# Fit a fresh TF-IDF vectorizer on the stemmed training reviews and transform
# them, then print the shape of the resulting matrix (documents, vocabulary terms).
vectorizer = TfidfVectorizer()
x_train_tfidf  = vectorizer.fit_transform(x_train)
print(x_train_tfidf.shape)

(40000, 60951)


In [21]:
# Train a new logistic regression on the stemmed TF-IDF features.
# max_iter=2000 is set explicitly — presumably to avoid solver convergence
# warnings; TODO confirm.
model = LogisticRegression( max_iter=2000)
model.fit(x_train_tfidf, y_train)

In [22]:
# Vectorise the held-out reviews with the already fitted vectorizer
# (transform only, no refit) and predict their labels.
x_test_tfidf = vectorizer.transform(x_test)
y_pred = model.predict(x_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89      5000
           1       0.89      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [23]:
# Overall test accuracy for the stemmed variant, via the count_acc helper.
count_acc(y_test, y_pred)

0.8919

### 3. Without stopwords and original form of words - 89.65% acc

In [27]:
def remove_punctuation_and_stopwords(text):
    """Lower-case ``text`` and keep alphabetic tokens that are not in ``stop``.

    The lowered text is split with ``word_tokenize``; a token survives only
    if ``str.isalpha()`` is true for it and it is absent from the
    module-level ``stop`` set. Survivors are joined with single spaces.
    """
    kept = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop
    ]
    return " ".join(kept)

In [25]:
# Reload the raw CSV so this experiment starts from unprocessed reviews and labels.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [28]:
# Normalise every review: lower-case, alphabetic tokens only, stop words removed.
df['review'] = df['review'].apply(remove_punctuation_and_stopwords)

# Encode the labels as integers by assigning the column explicitly.
# `df.sentiment.replace(..., inplace=True)` operates on a temporary Series:
# recent pandas versions warn about this chained in-place call, and with
# Copy-on-Write enabled it leaves `df` untouched.
# Values other than "positive"/"negative" pass through unchanged, so
# re-running the cell on already encoded labels is harmless.
sentiment_codes = {"positive": 1, "negative": 0}
df['sentiment'] = df['sentiment'].map(lambda label: sentiment_codes.get(label, label))

In [29]:
# Features are the stop-word-free review strings; targets are the sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed. Each resulting piece gets a fresh
# 0..n-1 index in place of the shuffled original row labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [30]:
# Fit a fresh TF-IDF vectorizer on the stop-word-free training reviews and
# transform them, then print the matrix shape (documents, vocabulary terms).
vectorizer = TfidfVectorizer()
x_train_tfidf  = vectorizer.fit_transform(x_train)
print(x_train_tfidf.shape)

(40000, 88119)


In [43]:
# Train a new logistic regression on the stop-word-free TF-IDF features.
# max_iter=2000 is set explicitly — presumably to avoid solver convergence
# warnings; TODO confirm.
model = LogisticRegression( max_iter=2000)
model.fit(x_train_tfidf, y_train)

In [44]:
# Vectorise the held-out reviews with the already fitted vectorizer
# (transform only, no refit) and predict their labels.
x_test_tfidf = vectorizer.transform(x_test)
y_pred = model.predict(x_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [45]:
# Overall test accuracy for the stop-word-free variant, via the count_acc helper.
count_acc(y_test, y_pred)

0.8965

### 4. Without stopwords and with stemming - 89.18% acc

In [46]:
def remove_stopwords_and_stem(text):
    """Lower-case ``text``, drop stop words / non-alphabetic tokens, then stem.

    The lowered text is split with ``word_tokenize``; tokens found in the
    module-level ``stop`` set or failing ``str.isalpha()`` are skipped, the
    rest are reduced with ``PorterStemmer.stem`` and joined with single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(text.lower()):
        # Guard clause: skip anything that is a stop word or not purely alphabetic.
        if token in stop or not token.isalpha():
            continue
        stems.append(porter.stem(token))
    return " ".join(stems)

In [47]:
# Reload the raw CSV so this experiment starts from unprocessed reviews and labels.
df = pd.read_csv('../IMDB_dataset/IMDB dataset.csv')

In [48]:
# Normalise every review: lower-case, alphabetic tokens only, stop words
# removed, Porter-stemmed.
df['review'] = df['review'].apply(remove_stopwords_and_stem)

# Encode the labels as integers by assigning the column explicitly.
# `df.sentiment.replace(..., inplace=True)` operates on a temporary Series:
# recent pandas versions warn about this chained in-place call, and with
# Copy-on-Write enabled it leaves `df` untouched.
# Values other than "positive"/"negative" pass through unchanged, so
# re-running the cell on already encoded labels is harmless.
sentiment_codes = {"positive": 1, "negative": 0}
df['sentiment'] = df['sentiment'].map(lambda label: sentiment_codes.get(label, label))

In [49]:
# Features are the stemmed, stop-word-free review strings; targets are the
# sentiment labels.
X, Y = df['review'], df['sentiment']

# Stratified 80/20 split with a fixed seed. Each resulting piece gets a fresh
# 0..n-1 index in place of the shuffled original row labels.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
)

In [50]:
# Fit a fresh TF-IDF vectorizer on the stemmed, stop-word-free training reviews
# and transform them, then print the matrix shape (documents, vocabulary terms).
vectorizer = TfidfVectorizer()
x_train_tfidf  = vectorizer.fit_transform(x_train)
print(x_train_tfidf.shape)

(40000, 60891)


In [51]:
# Train a new logistic regression on the stemmed, stop-word-free TF-IDF features.
# max_iter=2000 is set explicitly — presumably to avoid solver convergence
# warnings; TODO confirm.
model = LogisticRegression( max_iter=2000)
model.fit(x_train_tfidf, y_train)

In [52]:
# Vectorise the held-out reviews with the already fitted vectorizer
# (transform only, no refit) and predict their labels.
x_test_tfidf = vectorizer.transform(x_test)
y_pred = model.predict(x_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [53]:
# Overall test accuracy for the stemmed, stop-word-free variant, via the count_acc helper.
count_acc(y_test, y_pred)

0.8918