In [None]:
import numpy as np
import seaborn as sea
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

  import pandas.util.testing as tm


In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/My Drive/IMDB 50k review dataset/IMDB Dataset.csv') 
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Checking the data
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [None]:
# def html_clean(review):
#   soup = BeautifulSoup(review, "html.parser")
#   return soup.get_text()

df_test = df.copy()

In [None]:
# Cleaning the data for html content
df_test['review_clean'] = df_test['review'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
df_test.head()

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."


In [None]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
stop_words = set(stopwords.words('english'))

In [None]:
df_test['sentiment'] = df_test['sentiment'].apply(lambda x:1 if x=='positive' else 0)
df_test.head()

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"Petter Mattei's ""Love in the Time of Money"" is..."


In [None]:
# Checking out a sample data after tokenizing and filtering the stop words
df_test['review_clean'][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [None]:
tokened = tokenizer.tokenize(df_test['review_clean'][2])
tokened_nostop = [word for word in tokened if word not in stop_words]
tokened_nostop

['I',
 'thought',
 'wonderful',
 'way',
 'spend',
 'time',
 'hot',
 'summer',
 'weekend',
 'sitting',
 'air',
 'conditioned',
 'theater',
 'watching',
 'light',
 'hearted',
 'comedy',
 'The',
 'plot',
 'simplistic',
 'dialogue',
 'witty',
 'characters',
 'likable',
 'even',
 'well',
 'bread',
 'suspected',
 'serial',
 'killer',
 'While',
 'may',
 'disappointed',
 'realize',
 'Match',
 'Point',
 '2',
 'Risk',
 'Addiction',
 'I',
 'thought',
 'proof',
 'Woody',
 'Allen',
 'still',
 'fully',
 'control',
 'style',
 'many',
 'us',
 'grown',
 'love',
 'This',
 'I',
 'laughed',
 'one',
 'Woody',
 'comedies',
 'years',
 'dare',
 'I',
 'say',
 'decade',
 'While',
 'I',
 'never',
 'impressed',
 'Scarlet',
 'Johanson',
 'managed',
 'tone',
 'sexy',
 'image',
 'jumped',
 'right',
 'average',
 'spirited',
 'young',
 'woman',
 'This',
 'may',
 'crown',
 'jewel',
 'career',
 'wittier',
 'Devil',
 'Wears',
 'Prada',
 'interesting',
 'Superman',
 'great',
 'comedy',
 'go',
 'see',
 'friends']

In [None]:
# Creating TF-IDF vectorizer with tokenizing and stop words
tfidf = TfidfVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words, ngram_range=(1,2), max_df=0.5, min_df=20)

In [None]:
review_vec = tfidf.fit_transform(df_test['review_clean'])
review_vec.shape

(50000, 38904)

In [None]:
# Splitting the dataset
X_train, X_test, Y_train, Y_test = train_test_split(review_vec, df_test['sentiment'], train_size=0.7, random_state=1)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((35000, 38904), (15000, 38904), (35000,), (15000,))

In [None]:
# Using Logistic Regression for Classification
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
model.score(X_train, Y_train)

0.9426571428571429

In [None]:
model.score(X_test, Y_test)

0.9036666666666666

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
Y_pred = model.predict(X_test)
confusion_matrix(Y_test, Y_pred)

array([[6690,  831],
       [ 614, 6865]])

In [None]:
len(list(tfidf.get_feature_names()))

38904

In [None]:
word_coef = [(word, coeff) for word, coeff in zip(tfidf.get_feature_names(), model.coef_[0])]

In [None]:
# analyzing the positive and negative words
positive_words = sorted(word_coef, key = lambda x:x[1], reverse=True)[0:5]
negative_words = sorted(word_coef, key = lambda x:x[1], reverse=False)[0:5]

In [None]:
positive_words

[('great', 7.806891933401895),
 ('excellent', 6.216241519883558),
 ('perfect', 5.189217355166635),
 ('best', 5.1114370185902605),
 ('wonderful', 4.966290892142393)]

In [None]:
negative_words

[('worst', -9.246828745513984),
 ('bad', -8.700073883047217),
 ('awful', -7.131218192604851),
 ('boring', -6.774302789186869),
 ('waste', -6.420298600655201)]

In [None]:
# Using RandomForest for Classification
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()

In [None]:
model_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
model_rf.score(X_test, Y_test)

0.8700666666666667

In [None]:
# Using Gradient Boosting Trees for Classification
from sklearn.ensemble import GradientBoostingClassifier
model_gb = GradientBoostingClassifier()

In [None]:
model_gb.fit(X_train, Y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
model_gb.score(X_test, Y_test)

0.8151333333333334