In [1]:
# Importing Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the IMDB reviews CSV (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

In [18]:
# Display the DataFrame.
# NOTE(review): this cell carries execution count 18, i.e. it was last run after the
# later cells, so the recorded output shows `sentiment` already encoded as 1/0
# (presumably the result of the label-mapping cell) — re-run in order to confirm.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [3]:
# Remove fully duplicated rows in place, keeping the first occurrence of each
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [4]:
# Mapping 'positive' and 'negative' sentiment to 1 and 0 respectively
label_to_int = {'positive': 1, 'negative': 0}
# Series.map with a plain dict turns every value missing from the dict into NaN, so
# re-running this cell on already-encoded labels (1/0) would wipe the whole column.
# Falling back to the value itself makes the cell safe to execute more than once.
df['sentiment'] = df['sentiment'].map(lambda label: label_to_int.get(label, label))

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [6]:
# Build the TF-IDF vectorizer: English stop words are removed and max_df is set to 0.7
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

In [7]:
# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to encode the held-out reviews
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [8]:
# Instantiate a logistic-regression classifier with default settings and
# train it on the TF-IDF training matrix
logreg = LogisticRegression()
logreg.fit(X=X_train_tfidf, y=y_train)

In [9]:
# Predict a label for every held-out review
y_pred = logreg.predict(X=X_test_tfidf)

In [10]:
# Accuracy = fraction of test rows whose predicted label equals the true label
accuracy = (y_pred == y_test).mean()

In [11]:
# Hand-written sample reviews to run through the trained model
new_reviews = [
    'The movie was awesome and entertaining!',
    'The plot was predictable and the acting was not good.',
    'I did not like the movie at all.',
]

In [12]:
# Encode the sample reviews with the already-fitted vectorizer (no re-fitting)
new_reviews_tfidf = tfidf_vectorizer.transform(raw_documents=new_reviews)

In [13]:
# Predict a label for each sample review
new_predictions = logreg.predict(X=new_reviews_tfidf)

In [14]:
# Show the predicted labels for the sample reviews, in the same order as new_reviews
print('New Review Predictions:', new_predictions, sep=' ')

New Review Predictions: [1 0 0]


In [15]:
# Saving the Model
import pickle

In [16]:
# Serialize the trained classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises part-way through.
filename = 'IMDB_LogReg_Model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(logreg, model_file)

In [17]:
# Saving the TfidfVectorizer Object
# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises part-way through.
filename = 'TfidfVectorizer_Object.pkl'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)