In [14]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("..\data\Preprocessed_data.csv")

In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df

Unnamed: 0,Ratings,Reviews,Clean_Reviews
0,1.0,*Disclaimer: I only watched this movie as a co...,watched conditional see films not caught dead ...
1,1.0,I am writing this in hopes that this gets put ...,writing hopes gets put previous review anyone ...
2,1.0,"Really, I could write a scathing review of thi...",write scathing review turd going making observ...
3,1.0,If you saw the other previous spoof movies by ...,saw previous spoof movies two horrible know al...
4,1.0,This movie I saw a day early for free and I st...,saw day early free still feel like got ripped ...
...,...,...,...
149995,10.0,GoldenEye (1995) is my number 1 personal favor...,goldeneye number personal favorite james bond ...
149996,10.0,"*** 1/2Starring: Pierce Brosnan, Izabella Scor...",pierce izabella sean famke agent james still s...
149997,10.0,"I've given this film a 10, not just because I ...",given not thoroughly enjoyed believe best bond...
149998,10.0,I absolutely adore this movie. What a comeback...,absolutely adore comeback timothy dalton franc...


In [5]:
df.isnull().sum()

Ratings           0
Reviews           0
Clean_Reviews    10
dtype: int64

In [6]:
df = df.dropna()

In [7]:
df['Label'] = df['Ratings'].apply(lambda x: '1' if x>=7 else ('0' if x<=4 else '2'))
df = df[df.Label<'2']
data = df[["Clean_Reviews", "Label"]]
print(data['Label'].value_counts())

1    59996
0    59995
Name: Label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Label'] = df['Ratings'].apply(lambda x: '1' if x>=7 else ('0' if x<=4 else '2'))


In [8]:
import sys
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from prettytable import PrettyTable
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [9]:
# import nltk
# nltk.download('wordnet')

# Lemmatization

In [10]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wordnetlemma = WordNetLemmatizer()
    def __call__(self, reviews):
        return [self.wordnetlemma.lemmatize(word) for word in word_tokenize(reviews)]

# Vectorization with CountVectorizer and Tfidf with unigram

In [11]:
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
# min_df=10 -> terms having minimum frequency =10
countvect = CountVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(1,1), min_df=10, max_features=500)
tfidfvect = TfidfVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(1,1), min_df=10, max_features=500)
x_train_count = countvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_count = countvect.transform(test["Clean_Reviews"]).toarray()
x_train_tfidf = tfidfvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_tfidf = tfidfvect.transform(test["Clean_Reviews"]).toarray()
y_train = train['Label']
y_test = test['Label']



In [15]:
# np.save(r"..\data\x_train_tfidf", x_train_tfidf)
# np.save(r"..\data\x_test_tfidf", x_test_tfidf)
# np.save(r"..\data\y_train", y_train)
# np.save(r"..\data\y_test", y_test)

## Feature Importance with Logistic Regression and Count Vectorizer with unigram

In [27]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i+=1
print(importantfeature)

+----------------+------------------------+
|    Feature     |         Score          |
+----------------+------------------------+
|      able      |  0.06677606189474043   |
|   absolutely   | 0.0017159258444780854  |
|     across     |  -0.10796843321046863  |
|      act       |  -0.11685786529920128  |
|     acting     |  -0.1948960153780428   |
|     action     |  0.29260137548834886   |
|     actor      |  -0.16420319832071584  |
|    actress     |  -0.09702988754550164  |
|     actual     |  -0.23884194603250072  |
|    actually    | -0.011138108533621661  |
|      add       |  0.043377048564465656  |
|      age       |   0.0804985480229436   |
|     alien      |  -0.05063666971029193  |
|     almost     |  -0.04066159839345951  |
|     along      |  0.12538658840327224   |
|    already     |  -0.18021431449062136  |
|      also      |  0.14849384495297638   |
|    although    |  0.14657535755336962   |
|     always     |  0.21437326633554982   |
|    amazing     |   0.854872573

## Feature Importance with Logistic Regression and Tfidf Vectorizer with unigram

In [29]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i+=1
print(importantfeature)

+----------------+----------------------+
|    Feature     |        Score         |
+----------------+----------------------+
|      able      |  0.4593946702009999  |
|   absolutely   | 0.24801158565462816  |
|     across     | -0.5673890897336797  |
|      act       |  -0.878025659227441  |
|     acting     | -1.4870739472250205  |
|     action     |  2.4636596277781937  |
|     actor      | -1.3747690436037245  |
|    actress     | -0.6189558765502853  |
|     actual     | -1.1801147977470814  |
|    actually    | -0.17042732097745586 |
|      add       |  0.2794301548652466  |
|      age       |  0.5365423197341169  |
|     alien      | -0.12445814314062309 |
|     almost     | -0.35875093758577664 |
|     along      |  0.7673994304741162  |
|    already     | -0.8099485323689475  |
|      also      |  1.4909505290923648  |
|    although    |  0.833190245458195   |
|     always     |  1.6604514640962766  |
|    amazing     |  4.257073526904754   |
|    american    | 0.4603461175464

## Vectorization with CountVectorizer and Tfidf with bigram

In [30]:
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
# min_df=10 -> terms having minimum frequency =10
countvect = CountVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(2,2), min_df=10, max_features=500)
tfidfvect = TfidfVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(2,2), min_df=10, max_features=500)
x_train_count = countvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_count = countvect.transform(test["Clean_Reviews"]).toarray()
x_train_tfidf = tfidfvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_tfidf = tfidfvect.transform(test["Clean_Reviews"]).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Importance with Logistic Regression and Count Vectorizer with bigram

In [31]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i+=1
print(importantfeature)

+-----------------------+-----------------------+
|        Feature        |         Score         |
+-----------------------+-----------------------+
|   absolutely nothing  |  -1.3103618478356924  |
|     academy award     |   0.9267612742162695  |
|        act like       |  -0.48969041107245925 |
|      acting good      |   0.8284649580877653  |
|       acting not      |  -0.40846449998899015 |
|      action movie     |   0.7590789502506831  |
|       action not      |  0.25737257770492067  |
|      action scene     |   0.3420731424402315  |
|    action sequence    |  0.20150399797184593  |
|     actor actress     |  -0.03752097074900597 |
|       actor not       |  -0.4805903743514308  |
|      actually not     |  0.26219404131786933  |
|    actually pretty    |  0.20478436749459206  |
|     actually quite    | -0.049306329815101996 |
|      adam sandler     |   0.2014251057259889  |
|      almost every     |  -0.15652008311236076 |
|       also good       |   1.2023544917988012  |


## Feature Importance with Logistic Regression and Tfidf Vectorizer with bigram

In [32]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i+=1
print(importantfeature)

+-----------------------+------------------------+
|        Feature        |         Score          |
+-----------------------+------------------------+
|   absolutely nothing  |   -2.551507592380524   |
|     academy award     |   1.5791476448509165   |
|        act like       |  -1.1092840942950157   |
|      acting good      |   1.6109767490736806   |
|       acting not      |  -1.1202366632292948   |
|      action movie     |   1.5108498848488103   |
|       action not      |   0.4786051056853975   |
|      action scene     |   0.6941804467767612   |
|    action sequence    |  0.49025577476535515   |
|     actor actress     |  -0.26938677288692403  |
|       actor not       |  -1.0794873154049438   |
|      actually not     |  0.28830514070278135   |
|    actually pretty    |  0.26757027772410424   |
|     actually quite    | -0.011116954017813962  |
|      adam sandler     |   0.8968776744719575   |
|      almost every     |   -0.27528312871486    |
|       also good       |   2.1

In [35]:
pd.options.display.max_colwidth = 1000
df[["Reviews", "Ratings"]][(df["Ratings"]>=9) & (df["Clean_Reviews"].str.contains("bad reviews"))].head(100)

Unnamed: 0,Reviews,Ratings
120239,"No idea why there are so many bad reviews here? I loved it; I thought it was a very advanced thoughtful film. The graphic were #killer. The comparison of video game culture and young girl culture was spot on. This film makes connections that I've never seen on the big screen but, do see in every day life.The casting was spot on, Hello 12 year-old girls are supposed to be a little annoying. I do wish that more directors would take color into more consideration the way this film does. T The highly stylized sets make the murder scenes more believable because everything is so unbelievable. How can you live in 2016 and not ""get""a film about social media and accelerationism. #duh Someone explain this to me.",9.0
120334,"I don't understand the negative reviews for this movie.This deserve a 10 stars.Something is definitely wrong with people when they do not care for a positive message in a movie.This movie was beyond great,had a touch of everything.It shows what can happen when a person's influence can change lives.These young boys had no hope,they could not see a promising future.The streets are what they knew,and of course,some of you can't relate, but if you are a fair person,then you will try to understand it from that viewpoint.I was not raised in a bad neighborhood,nor the streets, and i still totally get this movie.Life is about sacrifices and there were many a lessons in this movie that are based on real life for so many young people out there.This is their reality, and however fate made an entrance in these young boys life.It brought them someone that made them believe in themselves. Whether dance is an art to you, or not.Only the true artistic, and creative minds can get this.The art of da...",9.0
120408,watch this movie! very cool movie with just enough of all the stuff one would think coming from a horror movie entitled War Wolves!i usually look forward to movies w bad reviewsso please keep giving cool movies like this poor reviews!and yea the 3 old timers made a cool movie cooler! Adrianne bar-beau still kicks ares! and yes the movie was funny too! if ya reading this i suggest watching the movie. thank you all the horror movies are real in my mind! yes cool movie very cool. more movies like this should be made more people need to give poor reviews to cool movies i have lots a time on my hands war wolves was a very entertaining movie,9.0
120540,"Honestly if you want to see this movie because it looks funny, you should still see it. The only reason this movie got bad reviews is because it is a horrible movie. However if you want to go just to laugh, you should definitely see it. In my opinion, it is one of the funniest ones so far.No Scary Movie movies are supposed to be good movies, they're supposed to be funny movies. That's the point, they are made to be that way. Critics should know that. It's a 0.5/10 for a good movie but 9/10 for the funny parts. Don't listen to the critics they don't know what they're talking about . Movies I recommend you see before just watching it: -Paranormal Activity 4 -Black Swan -Cabin In The Woods -Planet of the Apes -Evil Dead",9.0
120550,"I was a fan of the first one, but hesitant to watch this entry, as there are sooooooo many bad reviews out there. Well, I was more than pleasantly surprised. First, for those upset with the sex, um, the title pretty much clues you in. This is total camp, with lots of intentional humor, as well as lots of gore, sex, and pretty much everything but the kitchen sink. If you think you've seen everything, you haven't---I can't recall a movie where my jaw was open half the time, not believing what I was seeing---but it was all in fun. David Hasselhoff's performance was a spot-on parody of himself, and he did it with aplomb---laughed out loud several times during his scenes. And actually, I laughed out loud during this movie more than I did during the two ""Hangover"" movies (which I enjoyed)put together--- and the laughs were in the right spots, for all the right reasons. There were also some very well-executed tension and scare scenes that I was very impressed by. The effects to me were ve...",9.0
...,...,...
135402,"I have to disagree with all of the other commenters. I have currently watched the 3 videos about the ""Damsel in Distress"" trope and found them extremely well done. Sarkeesian does not blame anyone: not gamers, not designers,not ""men"",... She also does not portray gamers as evil women-hating misogynists. She points out a problem in a beloved industry not to assign blame or to complain, but because she is aware that most of this is done unintentionally yet, when you sum it all up, has dire consequences in the real world. I feel like the people giving bad reviews haven't watched the videos, at least not fully watched them, or are of bad faith. Sarkeesian literally says in her videos that game designers aren't evil guys twirling their mustaches while part of some conspiracy, they just probably aren't aware of what they're doing or the impact of their choices on society's view of women and violence against them. She even points out how the tropes are harmful for males as well as females...",10.0
135508,"There is deep meaning to this film. I watched it the other day and found it to be thought provoking and very profound. There were some production problems, I guess for its low budget, but all in all, I really liked this movie. I specially loved the music. I can't wait to see what the director comes up with next. You can tell he has a lot of talent and originality. Making reference to the gay holocaust is something that I found to be refreshing and have not seen in any other film since ""Bent"" which I loved. The moving back and forth within time periods was effective. The actors did a descent job. I'd like to see more work from Craig Pinkston. I read all of the bad reviews on this site. I think most here are too focused on the production values of the film rather than the message.",10.0
135619,"I don't understand where all the bad reviews are coming from, this movie is AMAZING. I watched this with my family, and It was very educational and informative. Must watch!",10.0
135831,"I've seen and read some of the bad reviews of this animated short. They are fundamentally flawed as they fail to acknowledge Dorbees artistic merits and groundbreaking art style. It truly is Pollock to Hopper's Toy Story, which at the time some struggled to accommodate widespread recognition. Dorbees esoteric nature is perhaps responsible for its mediocre to shoddy rating. Nonetheless, I find that Jacks's moving and often emotional character arc is nothing short of a modern literary masterpiece that will be worshiped alongside other pinnacles of human achievement. Like Sisyphus I am tormented on a daily basis knowing that I will never produce anything that shall rival or even pay homage to Dorbees: Making Decisions and that all else is meaningless. Like the mythical Chimera, Dorbees must be respected for its hybrid nature. It should be recognised for its musical, artistic, dramatic, and satirical elements which unlike the Chimera are certainly not fictitious. I suspect the digital ...",10.0


## Vectorization with CountVectorizer and Tfidf with trigram

In [36]:
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
# min_df=10 -> terms having minimum frequency =10
countvect = CountVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(3,3), min_df=10, max_features=500)
tfidfvect = TfidfVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(3,3), min_df=10, max_features=500)
x_train_count = countvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_count = countvect.transform(test["Clean_Reviews"]).toarray()
x_train_tfidf = tfidfvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_tfidf = tfidfvect.transform(test["Clean_Reviews"]).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature Importance with Logistic Regression and Count Vectorizer with trigram

In [37]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(countvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i+=1
print(importantfeature)

+-------------------------+------------------------+
|         Feature         |         Score          |
+-------------------------+------------------------+
|      acting can not     |  -0.5371433387151568   |
|      acting not bad     |  -0.5276289517996829   |
|     acting not good     |  -1.5018684296874094   |
|    acting pretty good   |   0.7783945521492577   |
|  acting special effect  |  -0.5585998706389342   |
|     action scene not    |  -0.2991313939146455   |
|    action take place    |  0.030330091490939435  |
|      actor can not      |  -0.8840181616848419   |
|      actor good job     |   0.5202635678023184   |
|     actor look like     |  -0.9153306446794007   |
|     actually not bad    |  -0.3562298285390032   |
|   actually pretty good  |   0.3717588908179357   |
|    almost every scene   |  -0.15740145177628262  |
|       bad bad bad       |   -1.727513845060244   |
|       bad can not       |   -1.300764336805821   |
|       bad guy not       |  0.413189225044712

## Feature Importance with Logistic Regression and Tfidf Vectorizer with trigram

In [38]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]
i=0
importantfeature = PrettyTable(["Feature", "Score"])
for feature, importance in zip(tfidfvect.get_feature_names_out(), lgr.coef_[0]):
    if i<=200:
        importantfeature.add_row([feature, importance])
        i+=1
print(importantfeature)

+-------------------------+------------------------+
|         Feature         |         Score          |
+-------------------------+------------------------+
|      acting can not     |  -0.7975461916037586   |
|      acting not bad     |  -0.6553909095040769   |
|     acting not good     |  -1.6017497028358572   |
|    acting pretty good   |   0.941997014170508    |
|  acting special effect  |  -0.6194332026175022   |
|     action scene not    |  -0.36918617647449053  |
|    action take place    |  0.052640143242040985  |
|      actor can not      |   -1.284934429407125   |
|      actor good job     |   0.4977895373945103   |
|     actor look like     |  -0.9879943050010154   |
|     actually not bad    |  -0.4240866556508076   |
|   actually pretty good  |  0.47490013200532916   |
|    almost every scene   |  -0.1779900653600232   |
|       bad bad bad       |  -2.5460643903028806   |
|       bad can not       |  -1.7913851058353578   |
|       bad guy not       |  0.465496600980752

## Vectorization with CountVectorizer and Tfidf with unigram, bigram, trigram

In [41]:
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
# min_df=10 -> terms having minimum frequency =10
countvect = CountVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(1,3), min_df=10, max_features=5000)
tfidfvect = TfidfVectorizer(analyzer="word", tokenizer=LemmaTokenizer(), ngram_range=(1,3), min_df=10, max_features=5000)
x_train_count = countvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_count = countvect.transform(test["Clean_Reviews"]).toarray()
x_train_tfidf = tfidfvect.fit_transform(train["Clean_Reviews"]).toarray()
x_test_tfidf = tfidfvect.transform(test["Clean_Reviews"]).toarray()
y_train = train['Label']
y_test = test['Label']



## Feature selection with Chi squared

In [42]:
from sklearn.feature_selection import chi2
import numpy as np
N = 5000
Number = 1
featureselection = PrettyTable(["Unigram", "Bigram","Trigram"])
for category in train['Label'].unique():
    features_chi2 = chi2(x_train_tfidf, train['Label'] == category)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidfvect.get_feature_names_out())[indices]
    unigrams = [x for x in feature_names if len(x.split(' ')) == 1]
    bigrams = [x for x in feature_names if len(x.split(' ')) == 2]
    trigrams = [x for x in feature_names if len(x.split(' ')) == 3]
    print("%s. %s :" % (Number,category))
    print("\t# Unigrams :\n\t. %s" %('\n\t. '.join(unigrams[-N:])))
    print("\t# Bigrams :\n\t. %s" %('\n\t. '.join(bigrams[-N:])))
    print("\t# Trigrams :\n\t. %s" %('\n\t. '.join(trigrams[-N:])))
    Number += 1

1. 1 :
	# Unigrams :
	. note
	. box
	. ken
	. accompanied
	. chop
	. offering
	. mountain
	. global
	. going
	. definition
	. investigate
	. mr
	. paris
	. believing
	. category
	. invisible
	. twice
	. took
	. credit
	. plan
	. photography
	. korean
	. nicole
	. enjoyment
	. scifi
	. want
	. newspaper
	. farm
	. survivor
	. remembered
	. jackie
	. hilariously
	. nicolas
	. away
	. round
	. â
	. revolves
	. racial
	. lloyd
	. self
	. eventually
	. egg
	. wise
	. pink
	. illegal
	. judging
	. hook
	. increasingly
	. exposed
	. australian
	. par
	. extended
	. looking
	. invasion
	. wake
	. blow
	. chick
	. marry
	. outside
	. arrogant
	. tag
	. whenever
	. lady
	. speech
	. hearing
	. die
	. board
	. hurt
	. accepted
	. snipe
	. known
	. finger
	. notice
	. test
	. surely
	. weapon
	. legend
	. praise
	. pro
	. close
	. eager
	. voice
	. confusion
	. suggested
	. nerve
	. exaggerated
	. response
	. fame
	. distant
	. admittedly
	. record
	. know
	. existence
	. station
	. saturday
	. fe

2. 0 :
	# Unigrams :
	. note
	. box
	. ken
	. accompanied
	. chop
	. offering
	. mountain
	. global
	. going
	. definition
	. investigate
	. mr
	. paris
	. believing
	. category
	. invisible
	. twice
	. took
	. credit
	. plan
	. photography
	. korean
	. nicole
	. enjoyment
	. scifi
	. want
	. newspaper
	. farm
	. survivor
	. remembered
	. jackie
	. hilariously
	. nicolas
	. away
	. round
	. â
	. revolves
	. racial
	. lloyd
	. self
	. eventually
	. egg
	. wise
	. pink
	. illegal
	. judging
	. hook
	. increasingly
	. exposed
	. australian
	. par
	. extended
	. looking
	. invasion
	. wake
	. blow
	. chick
	. marry
	. outside
	. arrogant
	. tag
	. whenever
	. lady
	. speech
	. hearing
	. die
	. board
	. hurt
	. accepted
	. snipe
	. known
	. finger
	. notice
	. test
	. surely
	. weapon
	. legend
	. praise
	. pro
	. close
	. eager
	. voice
	. confusion
	. suggested
	. nerve
	. exaggerated
	. response
	. fame
	. distant
	. admittedly
	. record
	. know
	. existence
	. station
	. saturday
	. fe