In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_reviews = pd.read_json("IMDB_reviews.json", lines=True)

In [3]:
df_reviews.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [72]:
display(df_reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review_date     573913 non-null  object
 1   movie_id        573913 non-null  object
 2   user_id         573913 non-null  object
 3   is_spoiler      573913 non-null  bool  
 4   review_text     573913 non-null  object
 5   rating          573913 non-null  int64 
 6   review_summary  573913 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 26.8+ MB


None

In [5]:
display(df_reviews.is_spoiler.value_counts())

False    422989
True     150924
Name: is_spoiler, dtype: int64

In [6]:
display(df_reviews.movie_id.value_counts())

tt0468569    4845
tt0111161    4361
tt0167260    2729
tt0137523    2480
tt0068646    2137
             ... 
tt0107719      12
tt6294822      11
tt0104014       5
tt0201265       4
tt0114142       2
Name: movie_id, Length: 1572, dtype: int64

In [4]:
df_reviews.drop(["review_date", "user_id","rating", "review_summary"], axis="columns",inplace=True)

In [5]:
df_reviews.head()

Unnamed: 0,movie_id,is_spoiler,review_text
0,tt0111161,True,"In its Oscar year, Shawshank Redemption (writt..."
1,tt0111161,True,The Shawshank Redemption is without a doubt on...
2,tt0111161,True,I believe that this film is the best story eve...
3,tt0111161,True,"**Yes, there are SPOILERS here**This film has ..."
4,tt0111161,True,At the heart of this extraordinary movie is a ...


In [14]:
movie_id_col = df_reviews['movie_id']

In [17]:
movie_reviews_df = df_reviews.loc[(movie_id_col == "tt0468569") | (movie_id_col == "tt0111161") | (movie_id_col == "tt0167260")]

In [91]:
#movie_reviews_df = df_reviews.iloc[ :25000]
movie_reviews_df = df_reviews.sample(n = 25000)

In [19]:
def reset_idx(df):
    return df.reset_index()

In [92]:
movie_reviews_df = reset_idx(movie_reviews_df)

In [24]:
def get_info(df):
    return df.info()

In [32]:
def get_head(df):
    return df.head()

In [93]:
get_info(movie_reviews_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        25000 non-null  int64 
 1   movie_id     25000 non-null  object
 2   is_spoiler   25000 non-null  bool  
 3   review_text  25000 non-null  object
dtypes: bool(1), int64(1), object(2)
memory usage: 610.5+ KB


In [95]:
def apply_func_2_col(df, col, func):
    df[col] = df[col].apply(func)

# Text Preprocessing

## Tokenization

This process divides a large piece of continuous text into distinct units or tokens basically this process is often known as Tokenization.

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
def tokenize_df(df_col):
    return df_col.apply(word_tokenize)

In [97]:
movie_reviews_df['review_text'] = tokenize_df(movie_reviews_df['review_text'])

In [98]:
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,121626,tt1250777,False,"[Its, been, so, long, since, i, 've, been, to,..."
1,452803,tt0800039,False,"[Oh, well, ,, this, is, one, of, these, movies..."
2,381817,tt0355295,False,"[I, 'm, posting, mostly, because, I, read, one..."
3,411922,tt0389860,True,"[*, *, *, Spoiler, Follows, *, *, *, I, think,..."
4,226083,tt3460252,False,"[It, has, n't, been, a, long, time, since, Tar..."


## Stemming

This is the idea of removing the suffix of a word and reducing different forms of a word to a core root.
eg. waiting , waited , waits -> wait

In [99]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english")

In [36]:
def stem_it(text):
    return[porter.stem(word) for word in text]

In [100]:
movie_reviews_df['review_text'] = movie_reviews_df['review_text'].apply(stem_it)

In [101]:
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,121626,tt1250777,False,"[it, been, so, long, sinc, i, ve, been, to, th..."
1,452803,tt0800039,False,"[oh, well, ,, this, is, one, of, these, movi, ..."
2,381817,tt0355295,False,"[i, 'm, post, most, becaus, i, read, one, too,..."
3,411922,tt0389860,True,"[*, *, *, spoiler, follow, *, *, *, i, think, ..."
4,226083,tt3460252,False,"[it, has, n't, been, a, long, time, sinc, tara..."


## Stopword Removal

A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore.

In [42]:
def stop_it(text):
    dt = [word for word in text if len(word)>2]
    return dt

In [102]:
apply_func_to_col(movie_reviews_df, 'review_text', stop_it)

In [103]:
apply_func_to_col(movie_reviews_df, 'review_text', ' '.join)

In [104]:
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,121626,tt1250777,False,been long sinc been the theater and was just b...
1,452803,tt0800039,False,well this one these movi you watch and when fi...
2,381817,tt0355295,False,post most becaus read one too mani review proc...
3,411922,tt0389860,True,spoiler follow think that the scene outsid the...
4,226083,tt3460252,False,has n't been long time sinc tarantino first tr...


# Splitting of Data

In [105]:
from sklearn.model_selection import train_test_split


In [106]:
def get_train_test(df):
    X_train, X_test, y_train, y_test = train_test_split(df['review_text'], df['is_spoiler'], test_size=0.25)
    display("X_train", X_train.head())
    print('\n')
    display("Y_train", y_train.head())
    return X_train, X_test, y_train, y_test

In [107]:
X_train, X_test, y_train, y_test = get_train_test(movie_reviews_df)

'X_train'

5198     when first saw the trailer for superbad though...
21211    this the movi that pit second-r prize fighter ...
23144    horror can make scare and some n't conjur veri...
15219    kill mock bird worth seen and read cours veri ...
14425    probabl favorit jack black movi date take the ...
Name: review_text, dtype: object





'Y_train'

5198      True
21211     True
23144    False
15219    False
14425     True
Name: is_spoiler, dtype: bool

# Vectorization
The Vectorization is a technique used to convert textual data to numerical format.

Using Vectorization, a matrix is created where each column represents a feature and each row represents an individual review

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [108]:
def get_tfidf():
    return TfidfVectorizer(max_df=0.75, min_df = 0.01, sublinear_tf = True,
                             use_idf = True)

In [109]:
def get_vectors(my_tfidf, train_var, test_var):
    tfidf_train = my_tfidf.fit_transform(train_var)
    tfidf_test = my_tfidf.transform(test_var)
    return tfidf_train, tfidf_test
    

In [110]:
my_tfidf = get_tfidf()

In [111]:
tfidf_train, tfidf_test = get_vectors(my_tfidf, X_train, X_test)

In [112]:
print( tfidf_train.shape, tfidf_test.shape, y_train.shape, y_test.shape)

(18750, 1768) (6250, 1768) (18750,) (6250,)


In [None]:
#SVM method

In [113]:
from sklearn import svm
from sklearn.metrics import classification_report

In [114]:
classifier_linear = svm.SVC(kernel='linear')

In [118]:
classifier_linear.fit(tfidf_train, y_train)

SVC(kernel='linear')

In [119]:
prediction_linear = classifier_linear.predict(tfidf_test)

In [117]:
prediction_linear

array([False,  True, False, ...,  True, False, False])

In [120]:
report = classification_report(y_test, prediction_linear, output_dict=True)
report

{'False': {'precision': 0.7763908701854494,
  'recall': 0.9537787513691128,
  'f1-score': 0.8559913496510372,
  'support': 4565},
 'True': {'precision': 0.6713395638629284,
  'recall': 0.255786350148368,
  'f1-score': 0.37043403523850454,
  'support': 1685},
 'accuracy': 0.7656,
 'macro avg': {'precision': 0.7238652170241888,
  'recall': 0.6047825507587404,
  'f1-score': 0.6132126924447708,
  'support': 6250},
 'weighted avg': {'precision': 0.7480690380008977,
  'recall': 0.7656,
  'f1-score': 0.7250850976854184,
  'support': 6250}}

# Logistic Regression

In [121]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [122]:
model_1 = LogisticRegression(max_iter = 900, solver = 'lbfgs')
model_1.fit(tfidf_train, y_train)
pred_1 = model_1.predict(tfidf_test)
cr1 = accuracy_score(y_test, pred_1)
print(cr1*100,"%")

76.416 %


# Passive Agressive Classifier
Passive: If the prediction is correct, keep the model and do not make any changes. i.e., the data in the example is not enough to cause any changes in the model.

Aggressive: If the prediction is incorrect, make changes to the model. i.e., some change to the model may correct it.

In [124]:
from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier(max_iter=50)
model.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [125]:
y_pred = model.predict(tfidf_test)
acc_score = accuracy_score(y_test, y_pred)
print("The Accuracy of the prediction is : ", acc_score*100)

The Accuracy of the prediction is :  75.104
