In [70]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [29]:
df_reviews = pd.read_json("IMDB_reviews.json", lines=True)

In [30]:
df_reviews.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [31]:
display(df_reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review_date     573913 non-null  object
 1   movie_id        573913 non-null  object
 2   user_id         573913 non-null  object
 3   is_spoiler      573913 non-null  bool  
 4   review_text     573913 non-null  object
 5   rating          573913 non-null  int64 
 6   review_summary  573913 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 26.8+ MB


None

In [32]:
display(df_reviews.is_spoiler.value_counts())

False    422989
True     150924
Name: is_spoiler, dtype: int64

In [33]:
display(df_reviews.movie_id.value_counts())

tt0468569    4845
tt0111161    4361
tt0167260    2729
tt0137523    2480
tt0068646    2137
             ... 
tt0107719      12
tt6294822      11
tt0104014       5
tt0201265       4
tt0114142       2
Name: movie_id, Length: 1572, dtype: int64

In [34]:
df_reviews.drop(["review_date" , "user_id","rating", "review_summary"], axis="columns",inplace=True)

In [35]:
df_reviews.head()

Unnamed: 0,movie_id,is_spoiler,review_text
0,tt0111161,True,"In its Oscar year, Shawshank Redemption (writt..."
1,tt0111161,True,The Shawshank Redemption is without a doubt on...
2,tt0111161,True,I believe that this film is the best story eve...
3,tt0111161,True,"**Yes, there are SPOILERS here**This film has ..."
4,tt0111161,True,At the heart of this extraordinary movie is a ...


In [36]:
movie1 = pd.DataFrame()

In [37]:
movie1 = df_reviews.loc[df_reviews['movie_id'] == "tt0468569"]

In [38]:
movie1.head().reset_index()

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,7068,tt0468569,True,I got to see The Dark Knight on Wednesday nigh...
1,7069,tt0468569,True,I must say I was excited for this movie since ...
2,7070,tt0468569,True,I thought Batman Begins was a very well concei...
3,7071,tt0468569,True,I think the big question...or the question eve...
4,7072,tt0468569,True,(Synopsis) Bruce Wayne/Batman (Christian Bale)...


# Text Preprocessing

## Tokenization

This process divides a large piece of continuous text into distinct units or tokens basically this process is often known as Tokenization.

In [39]:
from nltk.tokenize import word_tokenize

In [40]:
movie1['review_text'] = movie1['review_text'].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie1['review_text'] = movie1['review_text'].apply(word_tokenize)


In [41]:
print(df_reviews.head())

    movie_id  is_spoiler                                        review_text
0  tt0111161        True  In its Oscar year, Shawshank Redemption (writt...
1  tt0111161        True  The Shawshank Redemption is without a doubt on...
2  tt0111161        True  I believe that this film is the best story eve...
3  tt0111161        True  **Yes, there are SPOILERS here**This film has ...
4  tt0111161        True  At the heart of this extraordinary movie is a ...


## Stemming

This is the idea of removing the suffix of a word and reducing different forms of a word to a core root.
eg. waiting , waited , waits -> wait

In [42]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english")

In [43]:
def stem_it(text):
    return[porter.stem(word) for word in text]

In [46]:
movie1['review_text'] = movie1['review_text'].apply(stem_it)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie1['review_text'] = movie1['review_text'].apply(stem_it)


In [51]:
movie1 = movie1.reset_index()

In [52]:
movie1.head()

Unnamed: 0,level_0,index,movie_id,is_spoiler,review_text
0,0,7068,tt0468569,True,"[i, got, to, see, the, dark, knight, on, wedne..."
1,1,7069,tt0468569,True,"[i, must, say, i, was, excit, for, this, movi,..."
2,2,7070,tt0468569,True,"[i, thought, batman, begin, was, a, veri, well..."
3,3,7071,tt0468569,True,"[i, think, the, big, question, ..., or, the, q..."
4,4,7072,tt0468569,True,"[(, synopsi, ), bruce, wayne/batman, (, christ..."


## Stopword Removal

A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore.

In [53]:
def stop_it(text):
    dt = [word for word in text if len(word)>2]
    return dt

In [55]:
movie1['review_text'] = movie1['review_text'].apply(stop_it)

In [57]:
movie1.head()

Unnamed: 0,level_0,index,movie_id,is_spoiler,review_text
0,0,7068,tt0468569,True,"[got, see, the, dark, knight, wednesday, night..."
1,1,7069,tt0468569,True,"[must, say, was, excit, for, this, movi, sinc,..."
2,2,7070,tt0468569,True,"[thought, batman, begin, was, veri, well, conc..."
3,3,7071,tt0468569,True,"[think, the, big, question, ..., the, question..."
4,4,7072,tt0468569,True,"[synopsi, bruce, wayne/batman, christian, bale..."


In [59]:
movie1['review_text'] = movie1['review_text'].apply(' '.join)

In [60]:
movie1.head()

Unnamed: 0,level_0,index,movie_id,is_spoiler,review_text
0,0,7068,tt0468569,True,got see the dark knight wednesday night the re...
1,1,7069,tt0468569,True,must say was excit for this movi sinc the inst...
2,2,7070,tt0468569,True,thought batman begin was veri well conceiv and...
3,3,7071,tt0468569,True,think the big question ... the question everyo...
4,4,7072,tt0468569,True,synopsi bruce wayne/batman christian bale cont...


# Splitting of Data

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(movie1['review_text'], movie1['is_spoiler'], test_size=0.25)
display("X_train", X_train.head())
print('\n')
display("Y_train", y_train.head())

'X_train'

2892    while this film has superb act far heath ledge...
2065    most the time peopl give film they enjoy but n...
3481    saw this movi the 25th juli which was the prem...
3644    sorri entir cast other than heath ledger heath...
1774    humm saw this film befor batman begin and poss...
Name: review_text, dtype: object





'Y_train'

2892    False
2065    False
3481    False
3644    False
1774    False
Name: is_spoiler, dtype: bool

# Vectorization
The Vectorization is a technique used to convert textual data to numerical format.

Using Vectorization, a matrix is created where each column represents a feature and each row represents an individual review

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
my_tfidf = TfidfVectorizer(max_df=0.7)

tfidf_train = my_tfidf.fit_transform(X_train)
tfidf_test = my_tfidf.transform(X_test)

In [79]:
print(tfidf_train.shape, tfidf_test.shape, y_train.shape)

(3633, 14850) (1212, 14850) (3633,)


# Logistic Regression

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [81]:
model_1 = LogisticRegression(max_iter = 900, solver = 'lbfgs')
model_1.fit(tfidf_train, y_train)
pred_1 = model_1.predict(tfidf_test)
cr1 = accuracy_score(y_test, pred_1)
print(cr1*100,"%")

73.51485148514851 %


# Passive Agressive Classifier
Passive: If the prediction is correct, keep the model and do not make any changes. i.e., the data in the example is not enough to cause any changes in the model.

Aggressive: If the prediction is incorrect, make changes to the model. i.e., some change to the model may correct it.

In [82]:
from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier(max_iter=50)
model.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [83]:
y_pred = model.predict(tfidf_test)
acc_score = accuracy_score(y_test, y_pred)
print("The Accuracy of the prediction is : ", acc_score*100)

The Accuracy of the prediction is :  63.20132013201321
