In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re

In [2]:
# Load the IMDB review dataset from the project data directory.
imdb_reviews_df = pd.read_csv(
    '../../data/imdb_dataset.csv',
)

# Preview the first five rows as the cell output.
imdb_reviews_df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Sanity check: display the (rows, columns) dimensions of the loaded frame.
imdb_reviews_df.shape

(50000, 2)

In [4]:
# Class-balance check: count how many reviews carry each sentiment label.
imdb_reviews_df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
# Encode the textual sentiment labels as integers (positive -> 1, negative -> 0).
#
# The numeric-dtype guard makes this cell safe to re-run: mapping an
# already-encoded 0/1 column through the string dictionary would turn every
# label into NaN.
sentiment_to_label = {'positive': 1, 'negative': 0}

if not pd.api.types.is_numeric_dtype(imdb_reviews_df['sentiment']):
    encoded_sentiment = imdb_reviews_df['sentiment'].map(sentiment_to_label)

    # Series.map yields NaN for any value missing from the dictionary; fail
    # loudly here instead of carrying NaN labels into the split/training cells.
    unmapped_mask = encoded_sentiment.isna()
    if unmapped_mask.any():
        unexpected = imdb_reviews_df.loc[unmapped_mask, 'sentiment'].unique().tolist()
        raise ValueError(f"Unexpected sentiment labels: {unexpected}")

    imdb_reviews_df['sentiment'] = encoded_sentiment.astype('int64')

In [6]:
# Separate the model inputs from the target column.
review_features = imdb_reviews_df.drop('sentiment', axis=1)
sentiment_labels = imdb_reviews_df['sentiment']

# First split: keep 75% for training and hold out 25%, stratified by label.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    review_features,
    sentiment_labels,
    test_size=0.25,
    stratify=sentiment_labels,
    random_state=60,
)

# Second split: halve the hold-out into two stratified test sets.
x_test1, x_test2, y_test1, y_test2 = train_test_split(
    x_holdout,
    y_holdout,
    test_size=0.5,
    stratify=y_holdout,
    random_state=60,
)

# Report the (features, labels) shapes of every partition.
print(f"Train pairs : {x_train.shape, y_train.shape}")
print(f"Test 1 pairs : {x_test1.shape, y_test1.shape}")
print(f"Test 2 pairs : {x_test2.shape, y_test2.shape}")

Train pairs : ((37500, 1), (37500,))
Test 1 pairs : ((6250, 1), (6250,))
Test 2 pairs : ((6250, 1), (6250,))


In [7]:
# Write each held-out test set (features and answers separately) to disk.
# The row index is dropped, so features and answers pair up by row position.
held_out_exports = [
    (x_test1, '../../data/test_set_1.csv'),
    (y_test1, '../../data/test_set_1_answers.csv'),
    (x_test2, '../../data/test_set_2.csv'),
    (y_test2, '../../data/test_set_2_answers.csv'),
]

for held_out_frame, destination in held_out_exports:
    held_out_frame.to_csv(destination, index=False)

In [8]:
def preprocess_reviews(text):
    """Normalise a raw review string before vectorisation.

    The text is lower-cased, every character outside ``[a-z0-9]`` is replaced
    by a space, runs of whitespace are collapsed to a single space, and
    leading/trailing whitespace is removed.

    Note:
        Markup is not parsed. A tag such as ``<br />`` loses its punctuation
        and leaves the literal token ``br`` in the output.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned text, containing only lowercase ASCII letters, digits and
        single spaces.
    """
    text = text.lower()
    # Raw-string patterns: "\s" in a normal string literal is an invalid
    # escape sequence and emits a warning on recent Python versions.
    text = re.sub(r"[^a-z0-9]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [9]:
# Clean every training review element-wise with preprocess_reviews.
x_train['review'] = x_train['review'].map(preprocess_reviews)

In [10]:
# TF-IDF vectorizer limited to at most 200 vocabulary terms, with the
# built-in English stop-word list removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=200,
)

# Learn the vocabulary and IDF weights from the cleaned training reviews only.
tfidf.fit(raw_documents=x_train['review'])

In [11]:
# Persist the fitted vectorizer (compressed) for reuse outside this notebook.
# The context manager flushes and closes the file handle as soon as the dump
# finishes, instead of leaving it open until garbage collection.
with open('../../model/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    joblib.dump(tfidf, vectorizer_file, compress=True)

In [12]:
# Replace the raw-text frame with its dense TF-IDF feature matrix, one column
# per vocabulary term.
# The original row index is carried over so the features stay label-aligned
# with y_train (which keeps the index produced by the split).
# NOTE: this rebinds x_train, so re-running this cell on its own no longer
# operates on the raw review text; re-run from the split cell instead.
x_train = pd.DataFrame(
    tfidf.transform(x_train['review']).toarray(),
    columns=tfidf.get_feature_names_out(),
    index=x_train.index,
)

In [13]:
# Confirm the feature matrix dimensions: rows are training reviews, columns
# are the TF-IDF vocabulary terms.
x_train.shape

(37500, 200)

In [14]:
# Logistic-regression sentiment classifier; the seed is pinned so repeated
# runs are reproducible.
model = LogisticRegression(random_state=60)

# Train on the TF-IDF features against the 0/1 sentiment labels.
model.fit(X=x_train, y=y_train)

In [15]:
# Persist the trained classifier for reuse outside this notebook.
# The context manager flushes and closes the file handle as soon as the dump
# finishes, instead of leaving it open until garbage collection.
with open('../../model/logistic_model.pkl', 'wb') as model_file:
    joblib.dump(model, model_file)