# `IMBD` Reviews

In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
temp_df = pd.read_csv('IMDB Dataset.csv')

In [5]:
df = temp_df

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

### There are html tags in the data set of we have to remove them.

In [8]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

### Data set is a `balanced data set` so we are good to go.

In [9]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

### There is no null value.

In [10]:
df.duplicated().sum()

418

### There are some dublicate reviews. So drop these dubplicate reviews.

In [11]:
df.drop_duplicates(inplace=True)

In [12]:
df.duplicated().sum()

0

### `Remove tags`

In [13]:
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [14]:
df['review'] = df['review'].apply(remove_tags)

In [15]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Lowecase each word

In [16]:
df['review'] = df['review'].apply(lambda x:x.lower())

### Chat word teartment

In [17]:
chat_words = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [18]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w in chat_words:
            new_text.append(chat_words[w])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [19]:
df['review'] = df['review'].apply(chat_conversion)

### Remove `EMOJI` if there is any emoji

In [20]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [21]:
df['review'] = df['review'].apply(remove_emoji)

### Remove Punctuation

In [22]:
exclude = string.punctuation

In [23]:
def remove_punctuation(text):
    return text.translate(str.maketrans('','',exclude))

In [24]:
df['review'] = df['review'].apply(remove_punctuation)

### Remove Stop words

In [25]:
sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [26]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary schools n...,negative
49998,im going disagree previous comment side maltin...,negative


In [27]:
df["review"][12]

'im big fan bolls work many enjoyed movie postal maybe im one boll apparently bought rights use far cry long ago even game even finsished people enjoyed killing mercs infiltrating secret research labs located tropical island warned far cry something mister boll schemed together along legion schmucks feeling loneley set mister boll invites three countrymen play players go names today learned schweiger udo kier ralf moellerthree names actually made selfs pretty big movie biz tale goes like jack carver played today learned schweiger yes carver german hail bratwurst eating dudes however find tils acting movie pretty badass people complained hes really staying true whole carver agenda saw carver first person perspective dont really know looked like kicking however storyline film beyond demented see evil mad scientist dr krieger played udo kier making geneticallymutatedsoldiers gms called performing topsecret research island reminds spoiler vancouver reason thats right palm trees instead got

### Now Tokenize the reviews

In [28]:
def tokenize_with_nltk(text):
    return nltk.word_tokenize(text)

In [29]:
tokens = df['review'].apply(tokenize_with_nltk)

In [30]:
df['tokens'] = tokens

In [31]:
df

Unnamed: 0,review,sentiment,tokens
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn..."
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically theres family little boy jake thinks...,negative,"[basically, theres, family, little, boy, jake,..."
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,..."
...,...,...,...
49995,thought movie right good job wasnt creative or...,positive,"[thought, movie, right, good, job, wasnt, crea..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,catholic taught parochial elementary schools n...,negative,"[catholic, taught, parochial, elementary, scho..."
49998,im going disagree previous comment side maltin...,negative,"[im, going, disagree, previous, comment, side,..."


In [32]:
df['tokens'][10]

['phil',
 'alien',
 'one',
 'quirky',
 'films',
 'humour',
 'based',
 'around',
 'oddness',
 'everything',
 'rather',
 'actual',
 'punchlinesat',
 'first',
 'odd',
 'pretty',
 'funny',
 'movie',
 'progressed',
 'didnt',
 'find',
 'jokes',
 'oddness',
 'funny',
 'anymoreits',
 'low',
 'budget',
 'film',
 'thats',
 'never',
 'problem',
 'pretty',
 'interesting',
 'characters',
 'eventually',
 'lost',
 'interesti',
 'imagine',
 'film',
 'would',
 'appeal',
 'stoner',
 'currently',
 'partakingfor',
 'something',
 'similar',
 'better',
 'try',
 'brother',
 'another',
 'planet']

### Lemmatization for convenience

In [33]:
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [34]:
df['lemmatize_tokens'] = tokens.apply(lemmatize_tokens)

In [35]:
df

Unnamed: 0,review,sentiment,tokens,lemmatize_tokens
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,basically theres family little boy jake thinks...,negative,"[basically, theres, family, little, boy, jake,...","[basically, there, family, little, boy, jake, ..."
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."
...,...,...,...,...
49995,thought movie right good job wasnt creative or...,positive,"[thought, movie, right, good, job, wasnt, crea...","[thought, movie, right, good, job, wasnt, crea..."
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,catholic taught parochial elementary schools n...,negative,"[catholic, taught, parochial, elementary, scho...","[catholic, taught, parochial, elementary, scho..."
49998,im going disagree previous comment side maltin...,negative,"[im, going, disagree, previous, comment, side,...","[im, going, disagree, previous, comment, side,..."


### Stemming

In [36]:
stemmer = PorterStemmer()

def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

In [37]:
df['Stemming'] = df['tokens'].apply(stem_tokens)

In [38]:
# We will get a list of stemming words for each sentence so convert it into string

In [39]:
df['Stemming'] = df['Stemming'].apply(lambda x: ' '.join(x))

In [40]:
df

Unnamed: 0,review,sentiment,tokens,lemmatize_tokens,Stemming
0,one reviewers mentioned watching 1 oz episode ...,positive,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewer, mentioned, watching, 1, oz, ep...",one review mention watch 1 oz episod youll hoo...
1,wonderful little production filming technique ...,positive,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn...",wonder littl product film techniqu unassum old...
2,thought wonderful way spend time hot summer we...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su...",thought wonder way spend time hot summer weeke...
3,basically theres family little boy jake thinks...,negative,"[basically, theres, family, little, boy, jake,...","[basically, there, family, little, boy, jake, ...",basic there famili littl boy jake think there ...
4,petter matteis love time money visually stunni...,positive,"[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,...",petter mattei love time money visual stun film...
...,...,...,...,...,...
49995,thought movie right good job wasnt creative or...,positive,"[thought, movie, right, good, job, wasnt, crea...","[thought, movie, right, good, job, wasnt, crea...",thought movi right good job wasnt creativ orig...
49996,bad plot bad dialogue bad acting idiotic direc...,negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti...",bad plot bad dialogu bad act idiot direct anno...
49997,catholic taught parochial elementary schools n...,negative,"[catholic, taught, parochial, elementary, scho...","[catholic, taught, parochial, elementary, scho...",cathol taught parochi elementari school nun ta...
49998,im going disagree previous comment side maltin...,negative,"[im, going, disagree, previous, comment, side,...","[im, going, disagree, previous, comment, side,...",im go disagre previou comment side maltin one ...


# Prepare Input and Output For Machine Learning Algorithms

In [41]:
X = df['Stemming']
y = df['sentiment']

In [42]:
X

0        one review mention watch 1 oz episod youll hoo...
1        wonder littl product film techniqu unassum old...
2        thought wonder way spend time hot summer weeke...
3        basic there famili littl boy jake think there ...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job wasnt creativ orig...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    im go disagre previou comment side maltin one ...
49999    one expect star trek movi high art fan expect ...
Name: Stemming, Length: 49582, dtype: object

In [43]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

### Convert the values of Y column or output column in 0 and 1

In [44]:
encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [45]:
y

array([1, 1, 1, ..., 0, 0, 0])

### Split your data into training and testing

In [46]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [47]:
X_train.shape

(39665,)

# Apply Machine Learning Algorithms

### 1. Apply `BOW` to convert a text into `Vector` for our Machine Learning Model

In [46]:
cv = CountVectorizer(max_features=5000)

In [47]:
# Now apply CountVectorizer
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [48]:
X_train_bow.shape

(39665, 5000)

In [49]:
from scipy.stats import loguniform

#### 1. Gaussian Naive Bayes

In [65]:
gnb_bow = GaussianNB()

cv_scores = cross_val_score(gnb_bow, X_train_bow, y_train, cv=5, n_jobs = 6)
print("Cross-validation accuracy scores:", cv_scores)

gnb_bow_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", gnb_bow_accuracy)

gnb_bow.fit(X_train_bow, y_train) 

y_pred_gnb_bow = gnb_bow.predict(X_test_bow)

Cross-validation accuracy scores: [0.73238371 0.7309971  0.72898021 0.72355981 0.71889575]
Mean accuracy of model: 0.7269633177864616


In [66]:
# Calculate precision, recall, and F1 score
precision_gnb_bow = precision_score(y_test, y_pred_gnb_bow, average='weighted')  # or 'macro', 'micro', etc.
recall_gnb_bow = recall_score(y_test, y_pred_gnb_bow, average='weighted')
f1_score_gnb_bow = f1_score(y_test, y_pred_gnb_bow, average='weighted')

# Print the results
print("Precision:", precision_gnb_bow)
print("Recall:", recall_gnb_bow)
print("F1 Score:", f1_score_gnb_bow)

Precision: 0.74353394874409
Recall: 0.729353635171927
F1 Score: 0.7245639335885092


In [69]:
confusion_matrix_gnb_bow = confusion_matrix(y_test, y_pred_gnb_bow)
print("Confusion Matrix:\n", confusion_matrix_gnb_bow)

Confusion Matrix:
 [[4308  725]
 [1959 2925]]


#### 2. Random Forest

In [62]:
rf_bow = RandomForestClassifier()
param_distributions = {
    'n_estimators': [100, 200, 300], 
    'max_depth': [4, 8, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
}

grid_search = RandomizedSearchCV(estimator=rf_bow, param_distributions=param_distributions , n_jobs=6)
grid_search.fit(X_train_bow, y_train)
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 12}


In [71]:
rf_bow = RandomForestClassifier(n_estimators=300, min_samples_split = 5,
                           min_samples_leaf = 2, max_features = 'log2',
                           max_depth = 12, n_jobs = 6)

cv_scores = cross_val_score(rf_bow, X_train_bow, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

rf_bow_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", rf_bow_accuracy)

rf_bow.fit(X_train_bow, y_train) 

y_pred_rf_bow = rf_bow.predict(X_test_bow)

Cross-validation accuracy scores: [0.83839657 0.8346149  0.83373251 0.83524518 0.84772469]
Mean accuracy of model: 0.8379427707046514


In [72]:
precision_rf_bow = precision_score(y_test, y_pred_rf_bow, average='weighted')  # or 'macro', 'micro', etc.
recall_rf_bow = recall_score(y_test, y_pred_rf_bow, average='weighted')
f1_score_rf_bow = f1_score(y_test, y_pred_rf_bow, average='weighted')


print("Precision:", precision_rf_bow)
print("Recall:", recall_rf_bow)
print("F1 Score:", f1_score_rf_bow)

Precision: 0.8400600392915734
Recall: 0.8307956035091257
F1 Score: 0.8298906512696285


In [73]:
confusion_matrix_rf_bow = confusion_matrix(y_test, y_pred_rf_bow)
print("Confusion Matrix:\n", confusion_matrix_rf_bow)

Confusion Matrix:
 [[3793 1240]
 [ 438 4446]]


#### 3. Decision Tree

In [79]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

dt = DecisionTreeClassifier(random_state=42)

grid_search = RandomizedSearchCV(estimator=dt, param_distributions=param_grid, scoring='accuracy', n_jobs=8)

grid_search.fit(X_train_bow, y_train)

best_dt = grid_search.best_estimator_

print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 50, 'criterion': 'gini'}


In [82]:
dt_bow = DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=10, max_depth=50,criterion='gini')

cv_scores = cross_val_score(dt_bow, X_train_bow, y_train, cv=5, n_jobs=8)
print("Cross-validation accuracy scores:", cv_scores)

dt_bow_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", dt_bow_accuracy)

dt_bow.fit(X_train_bow, y_train) 

y_pred_dt_bow = dt_bow.predict(X_test_bow)

Cross-validation accuracy scores: [0.74070339 0.74158578 0.7321316  0.73276188 0.73843439]
Mean accuracy of model: 0.7371234085465777


In [83]:
precision_dt_bow = precision_score(y_test, y_pred_dt_bow, average='weighted')  # or 'macro', 'micro', etc.
recall_dt_bow = recall_score(y_test, y_pred_dt_bow, average='weighted')
f1_score_dt_bow = f1_score(y_test, y_pred_dt_bow, average='weighted')


print("Precision:", precision_dt_bow)
print("Recall:", recall_dt_bow)
print("F1 Score:", f1_score_dt_bow)

Precision: 0.7335908463126383
Recall: 0.7333871130382172
F1 Score: 0.7333987431668987


In [84]:
confusion_matrix_dt_bow = confusion_matrix(y_test, y_pred_dt_bow)
print("Confusion Matrix:\n", confusion_matrix_dt_bow)

Confusion Matrix:
 [[3656 1377]
 [1267 3617]]


#### 4. Logistic Regression

In [85]:
lr = LogisticRegression()

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'solver': ['lbfgs', 'sag', 'liblinear']}

grid_search = RandomizedSearchCV(lr, param_grid, scoring='accuracy', n_jobs = 8)
grid_search.fit(X_train_bow, y_train)

best_model = grid_search.best_estimator_
print("Best hyperparameters:", best_model.get_params())

Best hyperparameters: {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'sag', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [87]:
lr_bow = LogisticRegression(C=0.1,max_iter=100,
                        n_jobs=12,penalty="l2",
                        solver='sag')

cv_scores = cross_val_score(lr_bow, X_train_bow, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

lr_bow_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", lr_bow_accuracy)

lr_bow.fit(X_train_bow, y_train) 
y_pred_lr_bow = lr_bow.predict(X_test_bow)

Cross-validation accuracy scores: [0.88251607 0.88226396 0.87911257 0.87520484 0.88289424]
Mean accuracy of model: 0.8803983360645405


In [88]:
precision_lr_bow = precision_score(y_test, y_pred_lr_bow, average='weighted')  # or 'macro', 'micro', etc.
recall_lr_bow = recall_score(y_test, y_pred_lr_bow, average='weighted')
f1_score_lr_bow = f1_score(y_test, y_pred_lr_bow, average='weighted')


print("Precision:", precision_lr_bow)
print("Recall:", recall_lr_bow)
print("F1 Score:", f1_score_lr_bow)

Precision: 0.8830296552403704
Recall: 0.8825249571442977
F1 Score: 0.8825209197287387


In [89]:
confusion_matrix_lr_bow = confusion_matrix(y_test, y_pred_lr_bow)
print("Confusion Matrix:\n", confusion_matrix_lr_bow)

Confusion Matrix:
 [[4366  667]
 [ 498 4386]]


#### 5. Support Vector Machine

In [56]:
cv = CountVectorizer(max_features=1000)

In [57]:
# Now apply CountVectorizer
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [58]:
svm_bow = SVC(C=1, kernel="linear")

In [59]:
cv_scores = cross_val_score(svm_bow, X_train_bow, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

svm_bow_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", svm_bow_accuracy)

Cross-validation accuracy scores: [0.86285138 0.86020421 0.85717887 0.85125425 0.86234716]
Mean accuracy of model: 0.8587671750913903


In [60]:
svm_bow.fit(X_train_bow, y_train)
y_pred_svm_bow = svm_bow.predict(X_test_bow)

In [61]:
precision_svm_bow = precision_score(y_test, y_pred_svm_bow, average='weighted')  # or 'macro', 'micro', etc.
recall_svm_bow = recall_score(y_test, y_pred_svm_bow, average='weighted')
f1_score_svm_bow = f1_score(y_test, y_pred_svm_bow, average='weighted')


print("Precision:", precision_svm_bow)
print("Recall:", recall_svm_bow)
print("F1 Score:", f1_score_svm_bow)

Precision: 0.8617640845659432
Recall: 0.8611475244529596
F1 Score: 0.8611361982732547


In [62]:
confusion_matrix_svm_bow = confusion_matrix(y_test, y_pred_svm_bow)
print("Confusion Matrix:\n", confusion_matrix_svm_bow)

Confusion Matrix:
 [[4249  784]
 [ 593 4291]]


### `B` Now try with n-grams

In [48]:
cv = CountVectorizer(ngram_range=(1,5),max_features=5000)

X_train_bow_ng = cv.fit_transform(X_train).toarray()
X_test_bow_ng = cv.transform(X_test).toarray()

#### 1. Random Forest

In [51]:
rf_ngrams = RandomForestClassifier(n_estimators=300, min_samples_split = 5,
                           min_samples_leaf = 2, max_features = 'log2',
                           max_depth = 12, n_jobs = 6)

cv_scores = cross_val_score(rf_ngrams, X_train_bow_ng, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

rf_ngrams_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", rf_ngrams_accuracy)

rf_ngrams.fit(X_train_bow_ng, y_train) 
y_pred_rf_ngrams = rf_ngrams.predict(X_test_bow_ng)

Cross-validation accuracy scores: [0.8400353  0.83032901 0.82768184 0.83184167 0.84268247]
Mean accuracy of model: 0.834514055212404


In [52]:
precision_rf_ngrams = precision_score(y_test, y_pred_rf_ngrams, average='weighted')  # or 'macro', 'micro', etc.
recall_rf_ngrams = recall_score(y_test, y_pred_rf_ngrams, average='weighted')
f1_score_rf_ngrams = f1_score(y_test, y_pred_rf_ngrams, average='weighted')


print("Precision:", precision_rf_ngrams)
print("Recall:", recall_rf_ngrams)
print("F1 Score:", f1_score_rf_ngrams)

Precision: 0.8416226854466442
Recall: 0.8289805384692952
F1 Score: 0.8276853812048496


In [53]:
confusion_matrix_rf_ngrams = confusion_matrix(y_test, y_pred_rf_ngrams)
print("Confusion Matrix:\n", confusion_matrix_rf_ngrams)

Confusion Matrix:
 [[3716 1317]
 [ 379 4505]]


#### 2. Decision Tree

In [54]:
dt_ngrams = DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=10, max_depth=50,criterion='gini')

cv_scores = cross_val_score(dt_ngrams, X_train_bow_ng, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

dt_ngrams_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", dt_ngrams_accuracy)

dt_ngrams.fit(X_train_bow_ng, y_train) 
y_pred_dt_ngrams = dt_ngrams.predict(X_test_bow_ng)

Cross-validation accuracy scores: [0.74284634 0.74259423 0.73477877 0.73250977 0.73629144]
Mean accuracy of model: 0.7378041094163621


In [55]:
precision_dt_ngrams = precision_score(y_test, y_pred_dt_ngrams, average='weighted')  # or 'macro', 'micro', etc.
recall_dt_ngrams = recall_score(y_test, y_pred_dt_ngrams, average='weighted')
f1_score_dt_ngrams = f1_score(y_test, y_pred_dt_ngrams, average='weighted')


print("Precision:", precision_dt_ngrams)
print("Recall:", recall_dt_ngrams)
print("F1 Score:", f1_score_dt_ngrams)

Precision: 0.740509473250289
Recall: 0.7404456993042251
F1 Score: 0.7404587645048992


In [56]:
confusion_matrix_dt_ngrams = confusion_matrix(y_test, y_pred_dt_ngrams)
print("Confusion Matrix:\n", confusion_matrix_dt_ngrams)

Confusion Matrix:
 [[3721 1312]
 [1262 3622]]


#### 3. Logistic Regression

In [49]:
lr_ngrams = LogisticRegression(C=0.1,max_iter=100,
                        n_jobs=12,penalty="l2",
                        solver='sag')

cv_scores = cross_val_score(lr_ngrams, X_train_bow_ng, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

lr_ngrams_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", lr_ngrams_accuracy)

lr_ngrams.fit(X_train_bow_ng, y_train) 
y_pred_lr_ngrams = lr_ngrams.predict(X_test_bow_ng)

Cross-validation accuracy scores: [0.88327241 0.88201185 0.87961679 0.87860834 0.88604563]
Mean accuracy of model: 0.8819110046640615


In [50]:
precision_lr_ngrams = precision_score(y_test, y_pred_lr_ngrams, average='weighted')  # or 'macro', 'micro', etc.
recall_lr_ngrams = recall_score(y_test, y_pred_lr_ngrams, average='weighted')
f1_score_lr_ngrams = f1_score(y_test, y_pred_lr_ngrams, average='weighted')


print("Precision:", precision_lr_ngrams)
print("Recall:", recall_lr_ngrams)
print("F1 Score:", f1_score_lr_ngrams)

Precision: 0.8852382837575559
Recall: 0.8847433699707573
F1 Score: 0.8847398471082164


In [51]:
confusion_matrix_lr_ngrams = confusion_matrix(y_test, y_pred_lr_ngrams)
print("Confusion Matrix:\n", confusion_matrix_lr_ngrams)

Confusion Matrix:
 [[4378  655]
 [ 488 4396]]


#### 4. Gaussian Naive Bayes

In [60]:
gnb_ngrams = GaussianNB()

cv_scores = cross_val_score(gnb_ngrams, X_train_bow_ng, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

gnb_ngrams_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", gnb_ngrams_accuracy)

gnb_ngrams.fit(X_train_bow_ng, y_train) 
y_pred_gnb_ngrams = gnb_ngrams.predict(X_test_bow_ng)

Cross-validation accuracy scores: [0.80864742 0.81205093 0.80637842 0.80600025 0.80549603]
Mean accuracy of model: 0.8077146098575569


In [61]:
precision_gnb_ngrams = precision_score(y_test, y_pred_gnb_ngrams, average='weighted')  # or 'macro', 'micro', etc.
recall_gnb_ngrams = recall_score(y_test, y_pred_gnb_ngrams, average='weighted')
f1_score_gnb_ngrams = f1_score(y_test, y_pred_gnb_ngrams, average='weighted')


print("Precision:", precision_gnb_ngrams)
print("Recall:", recall_gnb_ngrams)
print("F1 Score:", f1_score_gnb_ngrams)

Precision: 0.8099499375528331
Recall: 0.8090148230311586
F1 Score: 0.8087585127033047


In [62]:
confusion_matrix_gnb_ngrams = confusion_matrix(y_test, y_pred_gnb_ngrams)
print("Confusion Matrix:\n", confusion_matrix_gnb_ngrams)

Confusion Matrix:
 [[4234  799]
 [1095 3789]]


### `C.` Using TfIdf

In [47]:
tfidf = TfidfVectorizer()

In [48]:
X_train_tfidf = tfidf.fit_transform(X_train[:15000]).toarray()

In [49]:
X_test_tfidf = tfidf.transform(X_test[:15000]).toarray()

#### 1.Random Forest

In [66]:
rf_tfidf = RandomForestClassifier(n_estimators=300, min_samples_split = 5,
                           min_samples_leaf = 2, max_features = 'log2',
                           max_depth = 12, n_jobs = 6)

cv_scores = cross_val_score(rf_tfidf, X_train_tfidf, y_train[:15000], cv=5)
print("Cross-validation accuracy scores:", cv_scores)

rf_tfidf_accuracy = np.mean(cv_scores)
print("Mean accuracy :", rf_tfidf_accuracy)

rf_tfidf.fit(X_train_tfidf, y_train[:15000]) 
y_pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

Cross-validation accuracy scores: [0.77833333 0.778      0.78933333 0.76233333 0.77866667]
Mean accuracy : 0.7773333333333333


In [67]:
precision_rf_tfidf = precision_score(y_test, y_pred_rf_tfidf, average='weighted')  # or 'macro', 'micro', etc.
recall_rf_tfidf = recall_score(y_test, y_pred_rf_tfidf, average='weighted')
f1_score_rf_tfidf = f1_score(y_test, y_pred_rf_tfidf, average='weighted')


print("Precision:", precision_rf_tfidf)
print("Recall:", recall_rf_tfidf)
print("F1 Score:", f1_score_rf_tfidf)

Precision: 0.7913413602578989
Recall: 0.7875365533931633
F1 Score: 0.7870776872336211


In [68]:
confusion_matrix_rf_tfidf = confusion_matrix(y_test[:15000], y_pred_rf_tfidf)
print("Confusion Matrix:\n", confusion_matrix_rf_tfidf)

Confusion Matrix:
 [[3709 1324]
 [ 783 4101]]


#### 2. Logistic Regression

In [52]:
lr_tfidf = LogisticRegression(C=0.1,max_iter=100,
                        n_jobs=12,penalty="l2",
                        solver='sag')

cv_scores = cross_val_score(lr_tfidf, X_train_tfidf, y_train[:15000], cv=5)
print("Cross-validation accuracy scores:", cv_scores)

lr_tfidf_accuracy = np.mean(cv_scores)
print("Mean accuracy:", lr_tfidf_accuracy)

Cross-validation accuracy scores: [0.845      0.83933333 0.85966667 0.83733333 0.83733333]
Mean accuracy: 0.8437333333333333


In [53]:
lr_tfidf.fit(X_train_tfidf, y_train[:15000]) 
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

In [54]:
precision_lr_tfidf = precision_score(y_test, y_pred_lr_tfidf, average='weighted')  # or 'macro', 'micro', etc.
recall_lr_tfidf = recall_score(y_test, y_pred_lr_tfidf, average='weighted')
f1_score_lr_tfidf = f1_score(y_test, y_pred_lr_tfidf, average='weighted')


print("Precision:", precision_lr_tfidf)
print("Recall:", recall_lr_tfidf)
print("F1 Score:", f1_score_lr_tfidf)

Precision: 0.8503632465487019
Recall: 0.8485429061208026
F1 Score: 0.8484412164295516


In [55]:
confusion_matrix_lr_tfidf = confusion_matrix(y_test[:15000], y_pred_lr_tfidf)
print("Confusion Matrix:\n", confusion_matrix_lr_tfidf)

Confusion Matrix:
 [[4111  922]
 [ 580 4304]]


#### 3. Decision Tree

In [72]:
dt_tfidf = DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=10, max_depth=50,criterion='gini')

cv_scores = cross_val_score(dt_tfidf, X_train_tfidf, y_train[:15000], cv=5)
print("Cross-validation accuracy scores:", cv_scores)

dt_tfidf_accuracy = np.mean(cv_scores)
print("Mean accuracy :", dt_tfidf_accuracy)

dt_tfidf.fit(X_train_tfidf, y_train[:15000]) 
y_pred_dt_tfidf = dt_tfidf.predict(X_test_tfidf)

Cross-validation accuracy scores: [0.72233333 0.713      0.73166667 0.705      0.71633333]
Mean accuracy : 0.7176666666666667


In [73]:
precision_dt_tfidf = precision_score(y_test, y_pred_dt_tfidf, average='weighted')  # or 'macro', 'micro', etc.
recall_dt_tfidf = recall_score(y_test, y_pred_dt_tfidf, average='weighted')
f1_score_dt_tfidf = f1_score(y_test, y_pred_dt_tfidf, average='weighted')


print("Precision:", precision_dt_tfidf)
print("Recall:", recall_dt_tfidf)
print("F1 Score:", f1_score_dt_tfidf)

Precision: 0.7186535522728146
Recall: 0.7186649188262579
F1 Score: 0.718657364644479


In [74]:
confusion_matrix_dt_tfidf = confusion_matrix(y_test[:15000], y_pred_dt_tfidf)
print("Confusion Matrix:\n", confusion_matrix_dt_tfidf)

Confusion Matrix:
 [[3646 1387]
 [1403 3481]]


#### 4. Gussian Naive Bayes

In [50]:
gnb_tfidf = GaussianNB()

cv_scores = cross_val_score(gnb_tfidf, X_train_tfidf[:10000], y_train[:10000], cv=5)
print("Cross-validation accuracy scores:", cv_scores)

gnb_tfidf_accuracy = np.mean(cv_scores)
print("Mean accuracy :", gnb_tfidf_accuracy)

Cross-validation accuracy scores: [0.622  0.629  0.6235 0.636  0.6235]
Mean accuracy : 0.6268


In [52]:
gnb_tfidf.fit(X_train_tfidf[:10000], y_train[:10000]) 
y_pred_gnb_tfidf = gnb_tfidf.predict(X_test_tfidf[:10000])

In [53]:
precision_gnb_tfidf = precision_score(y_test, y_pred_gnb_tfidf, average='weighted')  # or 'macro', 'micro', etc.
recall_gnb_tfidf = recall_score(y_test, y_pred_gnb_tfidf, average='weighted')
f1_score_gnb_tfidf = f1_score(y_test, y_pred_gnb_tfidf, average='weighted')


print("Precision:", precision_gnb_tfidf)
print("Recall:", recall_gnb_tfidf)
print("F1 Score:", f1_score_gnb_tfidf)

Precision: 0.6325333556082798
Recall: 0.6320459816476757
F1 Score: 0.6311451845236343


In [54]:
confusion_matrix_gnb_tfidf = confusion_matrix(y_test[:10000], y_pred_gnb_tfidf)
print("Confusion Matrix:\n", confusion_matrix_gnb_tfidf)

Confusion Matrix:
 [[3419 1614]
 [2035 2849]]


# Using `Word2Vec`

In [52]:
import gensim
from gensim.models import Word2Vec,KeyedVectors
from tqdm import tqdm

### Create Your Own Word2Vec Model

In [53]:
model = gensim.models.Word2Vec(
    df['lemmatize_tokens'],
    window=16,
    min_count=2,
    workers=12,
    vector_size = 300,
    epochs=5
)

#### Build your vocab

In [54]:
model.build_vocab(df['lemmatize_tokens'])

#### Train your model

In [55]:
model.train(df['lemmatize_tokens'], total_examples = model.corpus_count, epochs = model.epochs)

(27739299, 29764055)

#### Convert each sentence into vector by taking the mean of all the vectors of the words , which are in the sentence

In [56]:
def document_vector(doc):
    doc = [word for word in doc if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

#### Sample, how the vector of first sentence looks

In [57]:
document_vector(df['lemmatize_tokens'].values[0])

array([-1.78147599e-01, -5.10223024e-02,  3.99951562e-02, -9.18036792e-03,
       -3.79882842e-01,  2.67166793e-01, -5.22094190e-01, -1.31891429e+00,
       -6.68720603e-02,  2.20211238e-01,  1.08216479e-01,  1.08081885e-01,
        1.50446683e-01,  1.17426291e-01,  6.65958762e-01,  4.54275876e-01,
       -3.76631543e-02,  2.26009876e-01,  6.31444827e-02,  6.09580316e-02,
       -1.03368005e-02,  2.53046244e-01, -4.07715499e-01, -3.74304712e-01,
       -2.12207243e-01,  1.74968094e-01,  7.83290416e-02, -1.23858273e-01,
        5.82629396e-03,  8.27625021e-02, -2.24768415e-01, -2.06145868e-02,
       -1.17274940e-01,  3.59415263e-01, -1.49576992e-01, -8.84710774e-02,
        7.03948438e-02, -1.30212102e-02,  3.32835242e-02, -1.54258102e-01,
        1.60328932e-02,  3.05197209e-01,  2.27816522e-01, -4.99388069e-01,
        5.76038808e-02, -1.58826306e-01, -3.06345671e-01, -5.07605970e-01,
        3.10602874e-01, -7.32449174e-01, -5.41511066e-02,  5.49884923e-02,
        1.97481230e-01, -

### Convert each sentence into a vector

In [58]:
X = []
for doc in tqdm(df['lemmatize_tokens'].values):
    X.append(document_vector(doc))

  0%|          | 0/49582 [00:00<?, ?it/s]

100%|██████████| 49582/49582 [13:10<00:00, 62.69it/s] 


#### We will get sparse matrix of each sentence so convert them into numpy array

In [59]:
X = np.array(X)

### Convert the value of column `Y` into zero and one as they are positive and negative 

In [60]:
encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiment'])

### Split our dataset into train and test

In [61]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# Now Apply Machine Learning Algorithm on data using `Word2Vec`

#### 1. Logistic Regression

In [57]:
lr = LogisticRegression()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'solver': ['lbfgs', 'sag', 'liblinear']}

grid_search = RandomizedSearchCV(lr, param_grid, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best hyperparameters:", best_model.get_params())

Best hyperparameters: {'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'sag', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [62]:
lr_w2v = LogisticRegression(C=1,max_iter=100,multi_class="auto", 
                        n_jobs=12,penalty="l2",
                        solver='sag')

cv_scores = cross_val_score(lr_w2v, X_train, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

lr_w2v_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", lr_w2v_accuracy)

lr_w2v.fit(X_train, y_train)
y_pred_lr_w2v = lr_w2v.predict(X_test)

Cross-validation accuracy scores: [0.87898651 0.88087735 0.87747384 0.87936468 0.88352452]
Mean accuracy of model: 0.8800453800579856


In [63]:
precision_lr_w2v = precision_score(y_test, y_pred_lr_w2v, average='weighted')  # or 'macro', 'micro', etc.
recall_lr_w2v = recall_score(y_test, y_pred_lr_w2v, average='weighted')
f1_score_lr_w2v = f1_score(y_test, y_pred_lr_w2v, average='weighted')


print("Precision:", precision_lr_w2v)
print("Recall:", recall_lr_w2v)
print("F1 Score:", f1_score_lr_w2v)

Precision: 0.8769281948862215
Recall: 0.8765755772915196
F1 Score: 0.8765771585809969


In [64]:
confusion_matrix_lr_w2v = confusion_matrix(y_test, y_pred_lr_w2v)
print("Confusion Matrix:\n", confusion_matrix_lr_w2v)

Confusion Matrix:
 [[4351  682]
 [ 542 4342]]


#### 2. Support Vector Machine

In [73]:
svm = SVC()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'kernel': ['linear', 'rbf'],
              'gamma': [0.001, 0.01, 0.1, 1]}

grid_search = RandomizedSearchCV(svm, param_grid, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best hyperparameters:", best_model.get_params())

Best hyperparameters: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.01, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [75]:
svm_w2v = SVC(C=10, decision_function_shape="ovr", kernel="rbf")

cv_scores = cross_val_score(svm_w2v, X_train, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

svm_w2v_accuracy = np.mean(cv_scores)
print("Mean accuracy:", svm_w2v_accuracy)

svm_w2v.fit(X_train, y_train)
y_pred_svm_w2v = svm_w2v.predict(X_test)

Cross-validation accuracy scores: [0.87949073 0.87898651 0.87797807 0.87961679 0.88554141]
Mean accuracy: 0.8803227026345646


In [76]:
precision_svm_w2v = precision_score(y_test, y_pred_svm_w2v, average='weighted')  # or 'macro', 'micro', etc.
recall_svm_w2v = recall_score(y_test, y_pred_svm_w2v, average='weighted')
f1_score_svm_w2v = f1_score(y_test, y_pred_svm_w2v, average='weighted')


print("Precision:", precision_svm_w2v)
print("Recall:", recall_svm_w2v)
print("F1 Score:", f1_score_svm_w2v)

Precision: 0.885235523003334
Recall: 0.8851467177573863
F1 Score: 0.88515309461305


In [77]:
confusion_matrix_svm_w2v = confusion_matrix(y_test, y_pred_svm_w2v)
print("Confusion Matrix:\n", confusion_matrix_svm_w2v)

Confusion Matrix:
 [[4431  602]
 [ 537 4347]]


#### 3. Random Forest

In [62]:
param_distributions = {
    'n_estimators': [100, 200, 300], 
    'max_depth': [4, 8, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
}

grid_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_distributions, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 12}


In [63]:
rf_w2v = RandomForestClassifier(
    criterion='gini', max_depth=12, 
    max_features = "log2", min_samples_leaf=1,
    n_estimators=300,n_jobs=-1,
    min_samples_split= 2
)

cv_scores = cross_val_score(rf_w2v, X_train, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

rf_w2v_accuracy = np.mean(cv_scores)
print("Mean accuracy:", rf_w2v_accuracy)

rf_w2v.fit(X_train, y_train)
y_pred_rf_w2v = rf_w2v.predict(X_test)

Cross-validation accuracy scores: [0.84898525 0.8411698  0.84394302 0.84469936 0.84646414]
Mean accuracy: 0.8450523131224001


In [64]:
precision_rf_w2v = precision_score(y_test, y_pred_rf_w2v, average='weighted')  # or 'macro', 'micro', etc.
recall_rf_w2v = recall_score(y_test, y_pred_rf_w2v, average='weighted')
f1_score_rf_w2v = f1_score(y_test, y_pred_rf_w2v, average='weighted')


print("Precision:", precision_rf_w2v)
print("Recall:", recall_rf_w2v)
print("F1 Score:", f1_score_rf_w2v)

Precision: 0.84453820776827
Recall: 0.8437027326812544
F1 Score: 0.843674105389934


In [65]:
confusion_matrix_rf_w2v = confusion_matrix(y_test, y_pred_rf_w2v)
print("Confusion Matrix:\n", confusion_matrix_rf_w2v)

Confusion Matrix:
 [[4144  889]
 [ 661 4223]]


#### 4. Decision Tree

In [66]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

dt = DecisionTreeClassifier(random_state=42)

grid_search = RandomizedSearchCV(estimator=dt, param_distributions=param_grid, scoring='accuracy', n_jobs=8)

grid_search.fit(X_train, y_train)

best_dt = grid_search.best_estimator_

print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'gini'}


In [67]:
dt_w2v = DecisionTreeClassifier(min_samples_split=5, min_samples_leaf=1,
                                max_depth=10,max_features='sqrt', criterion='gini')

cv_scores = cross_val_score(dt_w2v, X_train, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

dt_w2v_accuracy = np.mean(cv_scores)
print("Mean accuracy:", dt_w2v_accuracy)

dt_w2v.fit(X_train, y_train)
y_pred_dt_w2v = dt_w2v.predict(X_test)

Cross-validation accuracy scores: [0.74763646 0.7475104  0.75343502 0.74209    0.74826673]
Mean accuracy: 0.7477877221732007


In [68]:
precision_dt_w2v = precision_score(y_test, y_pred_dt_w2v, average='weighted')  # or 'macro', 'micro', etc.
recall_dt_w2v = recall_score(y_test, y_pred_dt_w2v, average='weighted')
f1_score_dt_w2v = f1_score(y_test, y_pred_dt_w2v, average='weighted')


print("Precision:", precision_dt_w2v)
print("Recall:", recall_dt_w2v)
print("F1 Score:", f1_score_dt_w2v)

Precision: 0.7651056913974552
Recall: 0.7646465665019663
F1 Score: 0.7646351944309472


In [69]:
confusion_matrix_dt_w2v = confusion_matrix(y_test, y_pred_dt_w2v)
print("Confusion Matrix:\n", confusion_matrix_dt_w2v)

Confusion Matrix:
 [[3778 1255]
 [1079 3805]]


#### 5. Gaussian Naive Bayes

In [70]:
gnb_w2v = GaussianNB()

cv_scores = cross_val_score(gnb_w2v, X_train, y_train, cv=5)
print("Cross-validation accuracy scores:", cv_scores)

gnb_w2v_accuracy = np.mean(cv_scores)
print("Mean accuracy of model:", gnb_w2v_accuracy)

gnb_w2v.fit(X_train, y_train) 
y_pred_gnb_w2v = gnb_w2v.predict(X_test)

Cross-validation accuracy scores: [0.80247069 0.79528552 0.79364679 0.79036934 0.79679818]
Mean accuracy of model: 0.7957141056346905


In [71]:
precision_gnb_w2v = precision_score(y_test, y_pred_gnb_w2v, average='weighted')  # or 'macro', 'micro', etc.
recall_gnb_w2v = recall_score(y_test, y_pred_gnb_w2v, average='weighted')
f1_score_gnb_w2v = f1_score(y_test, y_pred_gnb_w2v, average='weighted')


print("Precision:", precision_gnb_w2v)
print("Recall:", recall_gnb_w2v)
print("F1 Score:", f1_score_gnb_w2v)

Precision: 0.7913264018180213
Recall: 0.7910658465261672
F1 Score: 0.7910724323978559


In [72]:
confusion_matrix_gnb_w2v = confusion_matrix(y_test, y_pred_dt_w2v)
print("Confusion Matrix:\n", confusion_matrix_gnb_w2v)

Confusion Matrix:
 [[3778 1255]
 [1079 3805]]


### We have all the famous algorithm and we got best accuracy with `LOGISITC REGRESSION` by using all diffrent techniques. 

In [65]:
print("Precision lr ngrams:", precision_lr_ngrams)
print("Recall lr ngrams:", recall_lr_ngrams)
print("F1 Score lr ngrams:", f1_score_lr_ngrams)
print("Confusion Matrix lr ngrams :\n", confusion_matrix_lr_ngrams)

print('*'*50)

print("Precision lr w2v:", precision_lr_w2v)
print("Recall lr w2v:", recall_lr_w2v)
print("F1 Score lr w2v:", f1_score_lr_w2v)
print("Confusion Matrix lr w2v :\n", confusion_matrix_lr_w2v)

Precision lr ngrams: 0.8852382837575559
Recall lr ngrams: 0.8847433699707573
F1 Score lr ngrams: 0.8847398471082164
Confusion Matrix lr ngrams :
 [[4378  655]
 [ 488 4396]]
**************************************************
Precision lr w2v: 0.8769281948862215
Recall lr w2v: 0.8765755772915196
F1 Score lr w2v: 0.8765771585809969
Confusion Matrix lr w2v :
 [[4351  682]
 [ 542 4342]]


In [None]:
# We have to choose a model which have low type one error ,
# means how many negative reviews is predicted as positive by our model