In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [64]:
df = pd.read_csv("IMDB Dataset.csv.zip")

In [65]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [66]:
df['sentiment']=df['sentiment'].map({'positive':1,'negative':0})

In [67]:
import re

In [68]:
def clean_text(text):
   text = re.sub(r'https?://\S+|www\.\S+', '', text)
   text = re.sub(r'@\w+', '', text)
   text = re.sub(r'#', '', text)
   emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "]+",
        flags=re.UNICODE
    )
   text = emoji_pattern.sub('', text)
   text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
   text = text.lower()
   text = re.sub(r'\s+', ' ', text).strip()

   return text

In [69]:
df['review'] = df['review'].apply(clean_text)

In [70]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [71]:
df['review'] = df['review'].apply(lambda x:x.split())

In [72]:
count = 0
for i in df['review']:
  count += len(i)
print("Number of words in entire corpus :",count)

Number of words in entire corpus : 11509865


In [73]:
df['review'] = df['review'].apply(lambda x:" ".join(x))

In [74]:
import nltk

In [75]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [76]:
def stem(text):
  y=[]
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [77]:
df['review']=df['review'].apply(stem)

In [78]:
df.sample(3)

Unnamed: 0,review,sentiment
9895,a 14 year old girl develop her first seriou cr...,1
11296,i rent zero day from the local video store las...,1
48303,well i gave thi movi a 7 it wa better than thi...,1


In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000,ngram_range=(1,2),stop_words='english')

In [80]:
X = tfidf.fit_transform(df['review'])
Y = df['sentiment']

In [81]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [82]:
model = LogisticRegression()

In [83]:
model.fit(x_train,y_train)

In [84]:
from sklearn.metrics import accuracy_score

In [85]:
y_pred = model.predict(x_test)
print("Accuracy :",accuracy_score(y_test,y_pred))

Accuracy : 0.8867


In [87]:
r = input("Enter you movie review :")
r = clean_text(r)
r = " ".join([ps.stem(w) for w in r.split()])
review = [r]
review_vec = tfidf.transform(review)
prediction = model.predict(review_vec)
print("Sentiment:", "Positive" if prediction[0] == 1 else "Negative")

Enter you movie review : "This movie was so boring i almost slept"


Sentiment: Negative
