In [1]:
import numpy as np
import pandas as pd

In [2]:
temp_df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df = temp_df.iloc[:10000]

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.shape

(10000, 2)

In [6]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [7]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [8]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

##### Checking for duplicate rows.

In [10]:
df.duplicated().sum()

17

In [11]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [12]:
df.duplicated().sum()

0

In [15]:
import re
def remove_tags(text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', text)
    return cleaned_text

In [16]:
df['review'] = df['review'].apply(remove_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_tags)


In [17]:
df['review'].head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

##### Now, for converting each text to lowercase we can use lambda func.

In [19]:
df['review']=df['review'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review']=df['review'].apply(lambda x:x.lower())


##### Now ,we will remove all the stopwords from the text.

In [22]:
from nltk.corpus import stopwords
sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))


In [23]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertaining movie wwii german spy (julie...",positive
9996,"give break. anyone say ""good hockey movie""? kn...",negative
9997,movie bad movie. watching endless series bad h...,negative
9998,"movie probably made entertain middle school, e...",negative


##### Now ,we need to seperate X and y column.

In [24]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [25]:
X.shape

(9983, 1)

In [26]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

##### For target column ,we can use LabelEncoder to encode the classes in the numerical format.

In [27]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
y=le.fit_transform(y)

In [28]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [30]:
X_train.shape

(7986, 1)

##### Now ,we will apply bag of words to the textual column i.e X_train and X_test.

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
cv=CountVectorizer()

##### We should also convert the ouput to array as bow returns sparse matrix.

In [33]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [34]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

##### Now ,we will apply our machine learning algorithm on the data.

In [35]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [36]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.6324486730095142

In [37]:
confusion_matrix(y_test,y_pred)

array([[717, 235],
       [499, 546]], dtype=int64)

##### Trying another algorithm

In [38]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8467701552328493

##### While using bag of words , we can decrease the no of selected vocabulory i.e no of features and only use frequent among them.Sometimes due to this ,our performance improves.

In [39]:
cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8372558838257386

##### We can also use n grams technique.

In [40]:
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8402603905858789

##### We have also included max_features as arg coz n grams generate more feature and hence more memory is req and hence memory error occurs.

### Using Tf-Idf

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
tfidf = TfidfVectorizer()

In [43]:
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review'])

In [44]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.8462694041061593

### Using Word2Vec

In [45]:
import gensim

In [47]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [48]:
story=[]
for doc in df['review']:
    raw_sent=sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))
        


In [49]:
len(story)

105773

In [50]:
model=gensim.models.Word2Vec(
    
    window=10,
    min_count=2
)

In [51]:
model.build_vocab(story)

In [53]:
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(5876040, 6212140)

In [55]:
len(model.wv.index_to_key)

31845

##### Now ,we need to convert all the reviews to numbers.

In [56]:
def document_vector(doc):
    #removing out of vocab words
    doc=[ word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc],axis=0)
    

In [57]:
document_vector(df['review'].values[0])

array([-0.17948239,  0.44458705,  0.14456077,  0.2538304 , -0.12525219,
       -0.6085318 ,  0.19500798,  0.88237804, -0.36743638, -0.23244384,
       -0.28208646, -0.47144246,  0.09009554,  0.11770967,  0.17215343,
       -0.11248589,  0.02947626, -0.29770383, -0.11028783, -0.5849714 ,
        0.05263977,  0.2450425 ,  0.13738514, -0.26321757, -0.27594823,
       -0.03365216, -0.3539401 ,  0.01071819, -0.32527605,  0.00594545,
        0.35209534,  0.00352   ,  0.24629755, -0.3007083 , -0.1222117 ,
        0.42381635,  0.06142759, -0.34871873, -0.2476845 , -0.7568688 ,
        0.12046193, -0.26521745,  0.05686786, -0.0987196 ,  0.50216395,
       -0.13690524, -0.31267136, -0.02800409,  0.12488244,  0.38433424,
        0.08971673, -0.39799163, -0.4449893 , -0.1075063 , -0.11766227,
        0.25974536,  0.24361   ,  0.05069469, -0.33330923,  0.11777373,
        0.06972257,  0.08053061,  0.04099515, -0.09475517, -0.45754382,
        0.2891718 ,  0.06785806,  0.11183015, -0.34287256,  0.30

In [58]:
from tqdm import tqdm

In [59]:
X=[]
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))
    

100%|██████████| 9983/9983 [04:46<00:00, 34.90it/s]


##### Now, converting all X into numpy array.

In [60]:
X=np.array(X)

In [61]:
X.shape

(9983, 100)

In [64]:
X[0]

array([-0.17948239,  0.44458705,  0.14456077,  0.2538304 , -0.12525219,
       -0.6085318 ,  0.19500798,  0.88237804, -0.36743638, -0.23244384,
       -0.28208646, -0.47144246,  0.09009554,  0.11770967,  0.17215343,
       -0.11248589,  0.02947626, -0.29770383, -0.11028783, -0.5849714 ,
        0.05263977,  0.2450425 ,  0.13738514, -0.26321757, -0.27594823,
       -0.03365216, -0.3539401 ,  0.01071819, -0.32527605,  0.00594545,
        0.35209534,  0.00352   ,  0.24629755, -0.3007083 , -0.1222117 ,
        0.42381635,  0.06142759, -0.34871873, -0.2476845 , -0.7568688 ,
        0.12046193, -0.26521745,  0.05686786, -0.0987196 ,  0.50216395,
       -0.13690524, -0.31267136, -0.02800409,  0.12488244,  0.38433424,
        0.08971673, -0.39799163, -0.4449893 , -0.1075063 , -0.11766227,
        0.25974536,  0.24361   ,  0.05069469, -0.33330923,  0.11777373,
        0.06972257,  0.08053061,  0.04099515, -0.09475517, -0.45754382,
        0.2891718 ,  0.06785806,  0.11183015, -0.34287256,  0.30

##### Now , converting the target column values into numerical.

In [62]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
y=le.fit_transform(df['sentiment'])


In [63]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [65]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [66]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
gnb = GaussianNB()

gnb.fit(X_train,y_train)
y_pred=gnb.predict(X_test)
accuracy_score(y_test,y_pred)

0.7245868803204807

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()

rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.7776664997496244