In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the review records; lines=True reads the file as one JSON object per line.
df_reviews = pd.read_json("IMDB_reviews.json", lines=True)

In [3]:
# Load the per-movie metadata; lines=True reads the file as one JSON object per line.
df_movies = pd.read_json('IMDB_movie_details.json', lines = True)

In [4]:
def get_info(df):
    """Print the concise summary of ``df`` and hand back whatever ``df.info()`` returns."""
    summary = df.info()
    return summary

In [5]:
def get_head(df):
    """Return the leading rows of ``df`` (``head()`` with its default row count)."""
    preview = df.head()
    return preview

In [6]:
# Discard the review columns that this analysis does not use.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
df_reviews = df_reviews.drop(
    columns=["review_date", "user_id", "rating", "review_summary"],
    errors="ignore",
)

In [7]:
# Discard the movie columns that this analysis does not use.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
df_movies = df_movies.drop(
    columns=["duration", "rating", "plot_summary", "plot_synopsis"],
    errors="ignore",
)

In [8]:
# Check which movie columns remain after the drop and their non-null counts.
get_info(df_movies)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1572 entries, 0 to 1571
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie_id      1572 non-null   object
 1   genre         1572 non-null   object
 2   release_date  1572 non-null   object
dtypes: object(3)
memory usage: 37.0+ KB


In [9]:
# Class balance of the target label (spoiler vs. non-spoiler reviews).
display(df_reviews.is_spoiler.value_counts())

False    422989
True     150924
Name: is_spoiler, dtype: int64

In [6]:
# Number of reviews per movie, most-reviewed first.
display(df_reviews.movie_id.value_counts())

tt0468569    4845
tt0111161    4361
tt0167260    2729
tt0137523    2480
tt0068646    2137
             ... 
tt0107719      12
tt6294822      11
tt0104014       5
tt0201265       4
tt0114142       2
Name: movie_id, Length: 1572, dtype: int64

In [8]:
def add_col(a_df, b_df, on_col):
    """Left-join ``b_df`` onto ``a_df`` and return the merged frame.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Left frame; every one of its rows is kept.
    b_df : pandas.DataFrame
        Frame supplying the additional columns.
    on_col : str or list of str
        Key column(s) present in both frames.

    Returns
    -------
    pandas.DataFrame
        A new frame. Neither input is modified, so the caller must capture
        the return value to see the added columns.
    """
    # The merge result was previously computed and thrown away (implicit None).
    return pd.merge(a_df, b_df, on=on_col, how='left')

In [9]:
# NOTE(review): the value of this call is not assigned to anything, so
# df_reviews is unchanged afterwards (its info() output further down still
# lists only movie_id, is_spoiler and review_text).
add_col(df_reviews, df_movies, 'movie_id')

In [152]:
# Preview the first rows of the trimmed review table.
get_head(df_reviews)

Unnamed: 0,movie_id,is_spoiler,review_text
0,tt0111161,True,"In its Oscar year, Shawshank Redemption (writt..."
1,tt0111161,True,The Shawshank Redemption is without a doubt on...
2,tt0111161,True,I believe that this film is the best story eve...
3,tt0111161,True,"**Yes, there are SPOILERS here**This film has ..."
4,tt0111161,True,At the heart of this extraordinary movie is a ...


In [13]:
# Column dtypes and non-null counts of the trimmed review table.
get_info(df_reviews)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573913 entries, 0 to 573912
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   movie_id     573913 non-null  object
 1   is_spoiler   573913 non-null  bool  
 2   review_text  573913 non-null  object
dtypes: bool(1), object(2)
memory usage: 9.3+ MB


In [153]:
#movie_reviews_df = df_reviews.iloc[ :25000]
def get_sample(df, samp_size, random_state=None):
    """Return ``samp_size`` randomly chosen rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; it is not modified.
    samp_size : int
        Number of rows to draw (without replacement, the pandas default).
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to get the same
        rows on every run; the default ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, still carrying their original index labels.
    """
    return df.sample(n=samp_size, random_state=random_state)

In [154]:
# Work on a 20,000-review random subset of the full review table.
# get_sample() is called without a fixed random_state, so pandas draws from
# NumPy's global RNG; seeding it first makes the subset (and every figure
# derived from it) repeatable across kernel restarts.
np.random.seed(1)
movie_reviews_df = get_sample(df_reviews, 20000)

In [155]:
def reset_idx(df):
    """Return a copy of ``df`` renumbered from 0, keeping the previous index as a column."""
    renumbered = df.reset_index()
    return renumbered

In [156]:
# Renumber the sampled rows 0..n-1; the original row labels are kept in a new 'index' column.
movie_reviews_df = reset_idx(movie_reviews_df)

In [157]:
# Confirm the sample size and the columns present after the index reset.
get_info(movie_reviews_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        20000 non-null  int64 
 1   movie_id     20000 non-null  object
 2   is_spoiler   20000 non-null  bool  
 3   review_text  20000 non-null  object
dtypes: bool(1), int64(1), object(2)
memory usage: 488.4+ KB


In [158]:
# Preview the sampled reviews before any text cleaning.
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,504342,tt0107050,False,It has been twenty years since the original re...
1,220413,tt1371150,True,A family spends a week together on the request...
2,116803,tt3647998,False,"Thick British accents, low volume while the th..."
3,447785,tt0367882,True,"For starters, this isn't a review. I can't bri..."
4,384919,tt0435625,False,Creepy eerie little horror film from England. ...


# Text Preprocessing

In [None]:
#clean the dataset

In [159]:
def apply_func_2_col(df, col, func):
    """Overwrite ``df[col]`` in place with ``func`` applied to each of its values."""
    transformed = df[col].apply(func)
    df[col] = transformed

In [160]:
import re

In [161]:
def lower_df(df, col):
    """Lower-case every string in ``df[col]``, rewriting the column in place."""
    lowered = []
    for text in df[col]:
        lowered.append(text.lower())
    df[col] = lowered
    

In [162]:
# Lower-case the review text (in place) so that matching is case-insensitive.
lower_df(movie_reviews_df, 'review_text')

In [163]:
def remove_punc(df, col):
    """Replace non-word characters in ``df[col]`` with spaces, then squeeze whitespace runs.

    The column is rewritten in place; nothing is returned.
    """
    non_word = re.compile(r'\W')
    whitespace_run = re.compile(r'\s+')
    # First pass: every non-word character becomes a single space.
    spaced = [non_word.sub(' ', text) for text in df[col]]
    # Second pass: collapse the resulting runs of whitespace to one space.
    df[col] = [whitespace_run.sub(' ', text) for text in spaced]

In [164]:
# Strip punctuation and other non-word characters from the review text (in place).
remove_punc(movie_reviews_df, 'review_text')

## Tokenization

Tokenization divides a large piece of continuous text into distinct units, called tokens.

In [165]:
from nltk.tokenize import word_tokenize

In [166]:
def tokenize_df(df_col):
    """Tokenize every entry of ``df_col`` with NLTK's ``word_tokenize``.

    Returns the per-entry results as a new Series.
    """
    tokenized = df_col.apply(word_tokenize)
    return tokenized

In [167]:
# Replace each review string with its list of word tokens.
movie_reviews_df['review_text'] = tokenize_df(movie_reviews_df['review_text'])

In [168]:
# Preview the tokenized reviews (review_text now holds lists of tokens).
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,504342,tt0107050,False,"[it, has, been, twenty, years, since, the, ori..."
1,220413,tt1371150,True,"[a, family, spends, a, week, together, on, the..."
2,116803,tt3647998,False,"[thick, british, accents, low, volume, while, ..."
3,447785,tt0367882,True,"[for, starters, this, isn, t, a, review, i, ca..."
4,384919,tt0435625,False,"[creepy, eerie, little, horror, film, from, en..."


## Stemming

Stemming removes the suffix of a word, reducing its different forms to a common root.
e.g. waiting, waited, waits -> wait

In [169]:
from nltk.stem.snowball import SnowballStemmer
# NOTE: despite the variable name, this is NLTK's Snowball stemmer configured
# for English, not the PorterStemmer class.
porter = SnowballStemmer("english")

In [170]:
def stem_it(text):
    """Stem each word in ``text`` with the module-level ``porter`` stemmer.

    ``text`` is an iterable of tokens; a new list of stems is returned in the
    same order.
    """
    stems = []
    for token in text:
        stems.append(porter.stem(token))
    return stems

In [171]:
# Replace every token list with the list of its stems.
movie_reviews_df['review_text'] = movie_reviews_df['review_text'].apply(stem_it)

In [172]:
# Preview the stemmed token lists.
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,504342,tt0107050,False,"[it, has, been, twenti, year, sinc, the, origi..."
1,220413,tt1371150,True,"[a, famili, spend, a, week, togeth, on, the, r..."
2,116803,tt3647998,False,"[thick, british, accent, low, volum, while, th..."
3,447785,tt0367882,True,"[for, starter, this, isn, t, a, review, i, can..."
4,384919,tt0435625,False,"[creepi, eeri, littl, horror, film, from, engl..."


In [50]:
def drop_col(df, col):
    """Remove the column(s) ``col`` from ``df`` in place; returns nothing."""
    df.drop(columns=col, inplace=True)

## Stopword Removal

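A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore. The cell below uses a simple length filter rather than a stop-word list: it drops every token of two characters or fewer.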

In [173]:
def stop_it(text):
    """Keep only the words of ``text`` that are longer than two characters."""
    return list(filter(lambda token: len(token) > 2, text))

In [174]:
# Drop tokens of one or two characters from every review (length filter in stop_it).
apply_func_2_col(movie_reviews_df, 'review_text', stop_it)

In [56]:
#make bi/trigrams
#from nltk.util import ngrams

In [57]:
#def add_ngrams(df, col, n , new_col):
#    df[new_col] = [list(ngrams(sentence, n)) for sentence in df[col]]

In [58]:
#add bigrams to MOVIE REVIEWS
#add_ngrams(movie_reviews_df, 'review_text', 2, 'bigrams')

In [None]:
#join words as a sentence

In [175]:
# Join each token list back into one space-separated string for the vectorizers.
apply_func_2_col(movie_reviews_df, 'review_text', ' '.join)

In [176]:
# Preview the fully preprocessed review text.
get_head(movie_reviews_df)

Unnamed: 0,index,movie_id,is_spoiler,review_text
0,504342,tt0107050,False,has been twenti year sinc the origin releas gr...
1,220413,tt1371150,True,famili spend week togeth the request the mom a...
2,116803,tt3647998,False,thick british accent low volum while the thick...
3,447785,tt0367882,True,for starter this isn review can bring myself i...
4,384919,tt0435625,False,creepi eeri littl horror film from england abo...


# Splitting of Data

In [177]:
from sklearn.model_selection import train_test_split


In [178]:
def get_train_test(df, text_col, bool_col, test_size=0.25, random_state=1):
    """Split one feature column and one label column into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    text_col : str
        Name of the feature (text) column.
    bool_col : str
        Name of the label column.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to the previously
        hard-coded 0.25.
    random_state : optional
        Forwarded to ``train_test_split``; defaults to the previously
        hard-coded 1 so existing callers get the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_col],
        df[bool_col],
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [179]:
 # Split the preprocessed reviews into train/test features (review_text) and labels (is_spoiler).
 X_train, X_test, y_train, y_test = get_train_test(movie_reviews_df, 'review_text', 'is_spoiler')

'X_train'

18960    this just tell all you movi lover out there th...
11353    wreck ralph anim film which entertain more adu...
7449     this movi was made the earli 90s befor marvel ...
14308    the conceit sound too good true mother and her...
11889    this movi without doubt favorit western all ti...
Name: review_text, dtype: object





'Y_train'

18960    False
11353    False
7449     False
14308     True
11889    False
Name: is_spoiler, dtype: bool

# Vectorization
Vectorization is a technique used to convert textual data into a numerical format.

Using vectorization, a matrix is created in which each column represents a feature and each row represents an individual review.

In [180]:
#Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [181]:
def get_ngram_cv(no_of_grams):
    """Build a word-level ``CountVectorizer`` over 1- to ``no_of_grams``-grams.

    The vectorizer is created with scikit-learn's built-in English stop-word list.
    """
    vectorizer_options = {
        'analyzer': 'word',
        'ngram_range': (1, no_of_grams),
        'stop_words': 'english',
    }
    return CountVectorizer(**vectorizer_options)

In [182]:
# Count unigrams and bigrams, i.e. ngram_range=(1, 2).
cv = get_ngram_cv(2) #2 is pretty good

In [183]:
# Learn the n-gram vocabulary from the training reviews only, then encode the
# test reviews with that same vocabulary.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [184]:
# Show the sparse count matrix summary (shape and number of stored elements).
X_train_cv

<15000x1020401 sparse matrix of type '<class 'numpy.int64'>'
	with 3161932 stored elements in Compressed Sparse Row format>

In [185]:
#with score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
# Multinomial naive Bayes classifier with default settings.
clf = MultinomialNB()

In [186]:
# Train naive Bayes on the n-gram count features.
clf.fit(X_train_cv, y_train)

MultinomialNB()

In [187]:
 # Naive Bayes predictions for the held-out count features.
 y_pred = clf.predict(X_test_cv)

In [190]:
def get_score(y_test, y_pred, average='micro'):
    """Format the F1 score of ``y_pred`` against ``y_test`` as a display string.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    average : str or None, optional
        Averaging mode forwarded to ``f1_score``. Defaults to ``'micro'``,
        the previously hard-coded value. Note that micro-averaged F1 over all
        labels of a single-label problem is numerically the same as accuracy;
        pass ``'binary'`` or ``'macro'`` for a per-class view.

    Returns
    -------
    str
        ``'F-1 score : <value>'`` with the value rounded to 4 decimals.
    """
    score = f1_score(y_test, y_pred, average=average)
    return 'F-1 score : {}'.format(np.round(score, 4))

In [191]:
# Score the naive Bayes predictions (y_pred holds clf's output at this point).
get_score(y_test, y_pred)

'F-1 score : 0.4357'

In [192]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [193]:
def get_tfidf():
    """Create the TF-IDF vectorizer used for the models below.

    Settings: ``max_df=0.75``, ``min_df=0.01``, sublinear term-frequency
    scaling, and IDF weighting enabled.
    """
    vectorizer = TfidfVectorizer(
        max_df=0.75,
        min_df=0.01,
        sublinear_tf=True,
        use_idf=True,
    )
    return vectorizer

In [194]:
def get_vectors(my_tfidf, train_var, test_var):
    """Fit ``my_tfidf`` on the training data and vectorize both splits.

    The vectorizer is fitted on ``train_var`` only; ``test_var`` is encoded
    with the already-fitted vectorizer. Returns ``(train_vectors, test_vectors)``.
    """
    # Tuple elements are evaluated left to right, so fitting happens first.
    return my_tfidf.fit_transform(train_var), my_tfidf.transform(test_var)
    

In [195]:
# Unfitted TF-IDF vectorizer; it is fitted when the vectors are built.
my_tfidf = get_tfidf()

In [196]:
# TF-IDF features for both splits; the vectorizer is fitted on X_train only.
tfidf_train, tfidf_test = get_vectors(my_tfidf, X_train, X_test)

In [197]:
# Sanity check: feature matrices and label vectors must agree on row counts.
print( tfidf_train.shape, tfidf_test.shape, y_train.shape, y_test.shape)

(15000, 1763) (5000, 1763) (15000,) (5000,)


In [None]:
#SVM method

In [198]:
from sklearn import svm
from sklearn.metrics import classification_report

In [199]:
# Linear-kernel SVM on the TF-IDF features. The estimator has to be fitted
# before predict() is called on it; without this fit the prediction cell that
# follows fails on a fresh kernel.
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(tfidf_train, y_train)

In [147]:
# SVM predictions for the held-out TF-IDF features.
prediction_linear = classifier_linear.predict(tfidf_test)

In [148]:
# Peek at the predicted labels.
prediction_linear

array([False, False, False, ..., False, False, False])

In [120]:
# Per-class precision / recall / F1 for the SVM, returned as a nested dict
# because output_dict=True.
report = classification_report(y_test, prediction_linear, output_dict=True)
report

{'False': {'precision': 0.7763908701854494,
  'recall': 0.9537787513691128,
  'f1-score': 0.8559913496510372,
  'support': 4565},
 'True': {'precision': 0.6713395638629284,
  'recall': 0.255786350148368,
  'f1-score': 0.37043403523850454,
  'support': 1685},
 'accuracy': 0.7656,
 'macro avg': {'precision': 0.7238652170241888,
  'recall': 0.6047825507587404,
  'f1-score': 0.6132126924447708,
  'support': 6250},
 'weighted avg': {'precision': 0.7480690380008977,
  'recall': 0.7656,
  'f1-score': 0.7250850976854184,
  'support': 6250}}

# Logistic Regression

In [149]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [150]:
# Logistic-regression model on the TF-IDF features, scored by plain accuracy.
model_1 = LogisticRegression(solver='lbfgs', max_iter=900).fit(tfidf_train, y_train)
pred_1 = model_1.predict(tfidf_test)
cr1 = accuracy_score(y_true=y_test, y_pred=pred_1)
print(cr1*100,"%")

74.32 %


# Passive Aggressive Classifier
Passive: If the prediction is correct, keep the model and do not make any changes. i.e., the data in the example is not enough to cause any changes in the model.

Aggressive: If the prediction is incorrect, make changes to the model. i.e., some change to the model may correct it.

In [124]:
from sklearn.linear_model import PassiveAggressiveClassifier

# random_state pins the estimator's shuffling of the training data so the
# accuracy reported for this model is reproducible from run to run.
model = PassiveAggressiveClassifier(max_iter=50, random_state=1)
model.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [125]:
# Evaluate the passive-aggressive model on the held-out split.
# Note: this rebinds y_pred, which earlier held the naive Bayes predictions.
y_pred = model.predict(tfidf_test)
acc_score = accuracy_score(y_test, y_pred)
print("The Accuracy of the prediction is : ", acc_score*100)

The Accuracy of the prediction is :  75.104
