In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv(r"9th July 2024\IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
lemmatizer = WordNetLemmatizer()
word_stop = set(stopwords.words('english'))

In [5]:
cleaned_reviews = []

import re
for i in df['review']:
    i_sent = sent_tokenize(i)
    cleaned_review = []
    for sent in i_sent:
        i_sub = re.sub(r'<.*?>', ' ', sent)
        i_sub = re.sub(r'[^a-zA-Z]', ' ', i_sub)
        i_sub = i_sub.lower()
        i_sub = i_sub.split()
        i_sub = [lemmatizer.lemmatize(word) for word in i_sub if word not in word_stop]
        i_sub = ' '.join(i_sub)
        cleaned_review.append(i_sub)
    cleaned_reviews.append(' '.join(cleaned_review))

cleaned_reviews
        

['one reviewer mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience watching oz 

In [6]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [8]:
len(cleaned_reviews)

50000

In [7]:
df['cleaned_review'] = cleaned_reviews

In [31]:
df.drop('cleaned_review', axis = 1, inplace = True)

In [10]:
df

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job creative original...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary school nu...
49998,I'm going to have to disagree with the previou...,negative,going disagree previous comment side maltin on...


In [9]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer()

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [70]:
models = {'Naive_bayes' : MultinomialNB(),
          'LOgistic_Regression' : LogisticRegression(),
          'Decision_Tree' : DecisionTreeClassifier(),
          'Random_Forest' : RandomForestClassifier(),
          'Xgboost' : XGBClassifier(),
          'Catboost' : CatBoostClassifier(),
          'Lightgbm' : LGBMClassifier()}

In [71]:
x_train, x_test, y_train, y_test = train_test_split(df['cleaned_review'], df['sentiment'], test_size = 0.3, random_state = 42)

In [72]:
y_train_lab = y_train.apply(lambda x : 1 if x == 'positive' else 0)
y_test_lab = y_test.apply(lambda x : 1 if x == 'positive' else 0)

In [73]:
results = {}
for name, model in models.items():
    classifier = Pipeline([('tfidf', tfidf), ('model',model)])
    if name not in ['Xgboost', 'Catboost', 'Lightgbm']:
        print(f'------------ {name} ------------')
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        accuracy = accuracy_score(y_test,y_pred)
        print(classification_report(y_test, y_pred))

    else:
        print(f'------------ {name} ------------')
        classifier.fit(x_train, y_train_lab)
        y_pred = classifier.predict(x_test)
        accuracy = accuracy_score(y_test_lab,y_pred)
        print(classification_report(y_test_lab, y_pred))
        
    results[name] = accuracy
    print()

results

------------ Naive_bayes ------------
              precision    recall  f1-score   support

    negative       0.85      0.88      0.87      7411
    positive       0.88      0.85      0.86      7589

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000


------------ LOgistic_Regression ------------
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      7411
    positive       0.89      0.91      0.90      7589

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


------------ Decision_Tree ------------
              precision    recall  f1-score   support

    negative       0.72      0.73      0.72      7411
    positive       0.73      0.72      0.72      7589

    accuracy                           0.72     15000
  

{'Naive_bayes': 0.8642,
 'LOgistic_Regression': 0.8964,
 'Decision_Tree': 0.7231333333333333,
 'Random_Forest': 0.8546,
 'Xgboost': 0.853,
 'Catboost': 0.8726,
 'Lightgbm': 0.8610666666666666}

In [50]:
x_train = x_train[:2000]
x_train_fit = tfidf.fit_transform(x_train)
y_train_lab = y_train_lab[:2000]
mod = CatBoostClassifier().fit(x_train_fit, y_train_lab)

x_test = x_test[:2000]
x_test = tfidf.transform(x_test)
y_pred = mod.predict(x_test)
y_test_lab = y_test_lab[:2000]

accuracy_score(y_test_lab, y_pred)

Learning rate set to 0.013851
0:	learn: 0.6910367	total: 386ms	remaining: 6m 25s
1:	learn: 0.6877858	total: 710ms	remaining: 5m 54s
2:	learn: 0.6854258	total: 1.04s	remaining: 5m 45s
3:	learn: 0.6827838	total: 1.36s	remaining: 5m 39s
4:	learn: 0.6798295	total: 1.69s	remaining: 5m 36s
5:	learn: 0.6773805	total: 2.01s	remaining: 5m 33s
6:	learn: 0.6753190	total: 2.33s	remaining: 5m 31s
7:	learn: 0.6724794	total: 2.66s	remaining: 5m 29s
8:	learn: 0.6702050	total: 2.98s	remaining: 5m 27s
9:	learn: 0.6687842	total: 3.3s	remaining: 5m 27s
10:	learn: 0.6669383	total: 3.64s	remaining: 5m 26s
11:	learn: 0.6641964	total: 3.97s	remaining: 5m 26s
12:	learn: 0.6611868	total: 4.37s	remaining: 5m 32s
13:	learn: 0.6592287	total: 4.69s	remaining: 5m 30s
14:	learn: 0.6568981	total: 5.01s	remaining: 5m 29s
15:	learn: 0.6553939	total: 5.33s	remaining: 5m 27s
16:	learn: 0.6540084	total: 5.66s	remaining: 5m 27s
17:	learn: 0.6517932	total: 5.98s	remaining: 5m 26s
18:	learn: 0.6495493	total: 6.3s	remaining: 5

0.827

In [45]:
y_train[:5]

38094    negative
40624    positive
49425    negative
35734    positive
41708    negative
Name: sentiment, dtype: object

In [46]:
x_train[:5]

38094    much love train stomach movie premise one coul...
40624    good ppv like wrestlemania xx year later wwe c...
49425    finding right word everybody problem vaudevill...
35734    really suprised movie get higher rating imdb o...
41708    start confessing tend really enjoy action movi...
Name: cleaned_review, dtype: object

In [60]:
y_train[:5]

38094    negative
40624    positive
49425    negative
35734    positive
41708    negative
Name: sentiment, dtype: object

In [53]:
tfidf.fit_transform(y_train).toarray()[:5]

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [59]:
y_train_lab[:5]

38094    0
40624    1
49425    0
35734    1
41708    0
Name: sentiment, dtype: int64

In [69]:
x_train_mod = x_train[:2000]
x_train_fit = tfidf.fit_transform(x_train_mod)
y_train_lab_mod = y_train_lab[:2000]
mod = CatBoostClassifier().fit(x_train_fit, y_train_lab_mod)

x_test_mod = x_test[:2000]
x_test_mod = tfidf.transform(x_test_mod)
y_pred = mod.predict(x_test_mod)
y_test_lab_mod = y_test_lab[:2000]

accuracy_score(y_test_lab_mod, y_pred)

Learning rate set to 0.013851
0:	learn: 0.6910367	total: 391ms	remaining: 6m 30s
1:	learn: 0.6877858	total: 710ms	remaining: 5m 54s
2:	learn: 0.6854258	total: 1.03s	remaining: 5m 42s
3:	learn: 0.6827838	total: 1.35s	remaining: 5m 36s
4:	learn: 0.6798295	total: 1.68s	remaining: 5m 33s
5:	learn: 0.6773805	total: 2s	remaining: 5m 30s
6:	learn: 0.6753190	total: 2.31s	remaining: 5m 28s
7:	learn: 0.6724794	total: 2.64s	remaining: 5m 26s
8:	learn: 0.6702050	total: 2.96s	remaining: 5m 25s
9:	learn: 0.6687842	total: 3.29s	remaining: 5m 25s
10:	learn: 0.6669383	total: 3.68s	remaining: 5m 31s
11:	learn: 0.6641964	total: 4s	remaining: 5m 29s
12:	learn: 0.6611868	total: 4.33s	remaining: 5m 28s
13:	learn: 0.6592287	total: 4.82s	remaining: 5m 39s
14:	learn: 0.6568981	total: 5.34s	remaining: 5m 50s
15:	learn: 0.6553939	total: 5.8s	remaining: 5m 56s
16:	learn: 0.6540084	total: 6.23s	remaining: 6m
17:	learn: 0.6517932	total: 6.65s	remaining: 6m 3s
18:	learn: 0.6495493	total: 7.02s	remaining: 6m 2s
19:	l

0.827

In [58]:
y_train_lab

38094    0
40624    1
49425    0
35734    1
41708    0
        ..
18858    1
33303    1
16811    1
45428    1
37920    1
Name: sentiment, Length: 2000, dtype: int64

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv(r"9th July 2024\IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')
stop_spa = nlp.Defaults.stop_words

In [8]:
stop_nlt = stopwords.words('english')

In [10]:
stop_spa

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [11]:
all_stop_words = ['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 'not',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'don',
 "don't",
 'should',
 "should've",
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 "aren't",
 'couldn',
 "couldn't",
 'didn',
 "didn't",
 'doesn',
 "doesn't",
 'hadn',
 "hadn't",
 'hasn',
 "hasn't",
 'haven',
 "haven't",
 'isn',
 "isn't",
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't",
  "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 "n't",
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'no',
 'nobody',
 'none',
 'noone',
 'nor',
 'not',
 'nothing',
 'now',
 'nowhere',
 'n‘t',
 'n’t',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve']

In [13]:
all_stop_words = set(all_stop_words)
lemmatizer = WordNetLemmatizer()
import re
from bs4 import BeautifulSoup

In [28]:
clean_reviews = []
for i in df['review']:
    i = sent_tokenize(i)
    clean_review = []
    for sent in i:
        sent = re.sub(r'<br\s*/?>', ' ', sent)
        sent = re.sub(r'[^a-zA-Z]+', ' ', sent)
        sent = BeautifulSoup(sent, 'lxml').get_text()
        sent = sent.lower()
        sent = sent.split()
        sent = [lemmatizer.lemmatize(word) for word in sent if not word in all_stop_words]
        sent = ' '.join(sent)
        clean_review.append(sent)
    clean_reviews.append(' '.join(clean_review))

clean_reviews

['reviewer mentioned watching oz episode hooked right exactly happened thing struck oz brutality unflinching scene violence set right word trust faint hearted timid pull punch regard drug sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement far away main appeal fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess episode saw struck nasty surreal ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order away mannered middle class inmate turned prison bitch lack street skill prison experience watching oz comfortable uncomfortable viewing thats touch darker',
 'wonderful little production fil

In [17]:
clean_reviews[4980:]

['deliverance stunning thriller bit exciting good thriller aspire stomach churningly frightening horror movie terrifying classic horror film thought normal red blooded male enjoying adventure weekend mile form civilisation captured sodomised couple violent hillbilly surely worst nightmare world population easy deliverance slip exploitation territory john boorman cleverly avoided temptation route film explores question challenge meaning masculinity film come away wishing heaven step hero shoe performing heroic deed saving day getting girl deliverance come away praying god experience protagonist br br city guy ed jon voight lewis burt reynolds drew ronny cox bobby ned beatty head wilderness spend day canoing soon dammed river guy riding rapid pair ed bobby inadvertently little far ahead pull riverside await pal adjacent woodland fall foul local woodlanders bill mckinney herbert coward tie ed tree strip rape bobby instructing perversely squeal like pig lewis drew arrive unseen lewis fair 

In [29]:
df.drop('clean_reviews', axis = 1, inplace = True)

In [30]:
df['clean_reviews'] = clean_reviews

In [31]:
df.head()

Unnamed: 0,review,sentiment,clean_reviews
0,One of the other reviewers has mentioned that ...,positive,reviewer mentioned watching oz episode hooked ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [26]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [32]:
analyzer = SentimentIntensityAnalyzer()

In [36]:
def sentiment_analyszer(text):
    sentiment = analyzer.polarity_scores(text)
    sentiment_scores = sentiment['compound']
    if sentiment_scores > 0.05:
        sentiment_label = 'positive'
    else:
        sentiment_label = 'negative'

    return sentiment_label

In [35]:
type(df['clean_reviews'][0])

str

In [37]:
df['predicted_sentiment'] = df['clean_reviews'].apply(sentiment_analyszer)

In [38]:
df.head()

Unnamed: 0,review,sentiment,clean_reviews,predicted_sentiment
0,One of the other reviewers has mentioned that ...,positive,reviewer mentioned watching oz episode hooked ...,negative
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,positive
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,positive
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...,positive


In [39]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [40]:
accuracy_score(df['sentiment'], df['predicted_sentiment'])

0.67598

In [41]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
x_train, x_test, y_train, y_test = train_test_split(df['clean_reviews'], df['sentiment'], test_size = 0.25, random_state = 42)

In [46]:
classifier = Pipeline([('tfidf', TfidfVectorizer()),
                       ('model', RandomForestClassifier())])

In [47]:
mod = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
accuracy_score(y_test, y_pred)

0.85368