In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv('data/imdb_dataset.csv')

In [3]:
df = temp_df.iloc[:10000].copy() # for now we are just taking 10,000 review out of 50,000

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
df['sentiment'].value_counts() # to check whether the class is imbalance or not

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [7]:
df.isnull().sum() #to check whether any values are missing in any row

review       0
sentiment    0
dtype: int64

In [8]:
df.duplicated().sum() # checking duplicate rows(we have 17)

np.int64(17)

In [9]:
df.drop_duplicates(inplace=True) # drop duplicate review

In [10]:
df.duplicated().sum()

np.int64(0)

In [11]:
# for now we will apply some basic preprocessing
# Remove Tags
# Lowercase
# remove stopwords

In [12]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [13]:
df['review'] = df['review'].apply(remove_tags)

In [14]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [15]:
# convert to lowercase
df['review'] = df['review'].apply(lambda x:x.lower())

In [16]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

df['review'] = df['review'].apply(lambda x:[item for item in x.split() if item not in stop_words]).apply(lambda x:" ".join(x))

In [17]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertaining movie wwii german spy (julie...",positive
9996,"give break. anyone say ""good hockey movie""? kn...",negative
9997,movie bad movie. watching endless series bad h...,negative
9998,"movie probably made entertain middle school, e...",negative


In [18]:
X = df.iloc[:,0:1] # first (:) means select all rows & 0:1 string slicing concept so here it will select only first column
y = df['sentiment']

In [19]:
#basically here X is input features, and y is output labels
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
9995,"fun, entertaining movie wwii german spy (julie..."
9996,"give break. anyone say ""good hockey movie""? kn..."
9997,movie bad movie. watching endless series bad h...
9998,"movie probably made entertain middle school, e..."


In [20]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [21]:
# since y is in positive or negative format we will use LabelEncoder to convert them into 0 and 1
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [22]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)

In [24]:
X_train.shape

(7986, 1)

In [25]:
# Applying BOW(Bag of Words) for feature extraction
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
cv = CountVectorizer()

In [31]:
X_train_bow = cv.fit_transform(X_train['review']).toarray() #its like “Bitch, learn my vocabulary from the training set and turn every damn review into a vector of word counts.” 
X_test_bow = cv.transform(X_test['review']).toarray()

fit_transform() does TWO THINGS:
* .fit() → learns the vocabulary: scans all reviews in X_train['review'], builds a list of every unique word (called “tokens”), and assigns each a column index.
* .transform() → turns each review into a sparse row vector where each column = count of that word in the review.
* .toarray() → converts the sparse matrix into a dense numpy array

In [32]:
X_train_bow.shape # we will see approx 48282 vocab or features

(7986, 48282)

In [33]:
# Applying GaussianNB ML algo
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)

In [34]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test,y_pred)

0.6324486730095142

In [35]:
confusion_matrix(y_test, y_pred)

array([[717, 235],
       [499, 546]])

In [38]:
# Now let's apply another ML algorithm RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)

y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred) #Compares your true labels (y_test) with your predicted labels (y_pred).

0.8482724086129194

In [39]:
# Now we can tweak the parameter to see whether we can imporve the accuracy
# like, in this we have taken about 48282 words right, now let's consider 3000 most frequently words instead of 48,000
cv = CountVectorizer(max_features = 3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8372558838257386

In [40]:
# let's tweak with ngram
cv = CountVectorizer(ngram_range=(1, 3), max_features=5000) # most frequent 5000 ngram

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8397596394591887

In [45]:
# Earlier we have used BOW(Bag of words) for feature extraction,now let's try out with Tf-Idf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [46]:
tfidf = TfidfVectorizer()

In [47]:
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review'])

In [49]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)

0.8467701552328493