## Text Classification ( BOW & n_Grams ) 

In [1]:
import numpy as np
import pandas as pd
import os 
import nltk

In [2]:
temp_df=pd.read_csv(r"C:\Users\admin\Desktop\IMDB Dataset.csv")

In [3]:
df=temp_df.iloc[:10000] # Only 10K records
df.sample(4)

Unnamed: 0,review,sentiment
3289,"When many people say it's the ""worst movie I'v...",negative
1880,This movie is trash-poor. It has horrible tast...,negative
1378,"""Coconut Fred's Fruit Salad Island!"" is a hila...",positive
2948,"First, I would like to admit that Chokher Bali...",negative


In [4]:
df.shape

(10000, 2)

In [5]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
df['sentiment'].value_counts()

positive    5028
negative    4972
Name: sentiment, dtype: int64

#### Checking for missing values

In [7]:
df.isnull().sum()  #No missing values 

review       0
sentiment    0
dtype: int64

#### Checking for duplicate reviews

In [8]:
df.duplicated().sum() #17 duplicates

17

In [9]:
df.drop_duplicates(inplace=True) #Dropping our duplicates

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True) #Dropping our duplicates


In [10]:
df.duplicated().sum()

0

### Preprocessing

In [11]:
import re

def remove(rabindra):
    cleaned_text = re.sub(re.compile('<.*?>'), ' ', rabindra)
    return re.sub(' +', ' ', cleaned_text).strip()

In [12]:
df['review']=df['review'].apply(remove)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review']=df['review'].apply(remove)


In [13]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


  ### Stopword Removal

In [14]:
from nltk.corpus import stopwords
list1=stopwords.words('english')
df['review']=df['review'].apply(lambda x :[item for item in x.split() if item not in list1]).apply(lambda x:" ".join(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review']=df['review'].apply(lambda x :[item for item in x.split() if item not in list1]).apply(lambda x:" ".join(x))


### LowerCasing

In [15]:
df['review']=df['review'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review']=df['review'].apply(lambda x:x.lower())


In [16]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought wonderful way spend time hot summer ...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertaining movie wwii german spy (julie...",positive
9996,"give break. how anyone say ""good hockey movie""...",negative
9997,this movie bad movie. but watching endless ser...,negative
9998,this movie probably made entertain middle scho...,negative


In [17]:
x=df.iloc[:,0:1]
y=df['sentiment']

In [18]:
x


Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,a wonderful little production. the filming tec...
2,i thought wonderful way spend time hot summer ...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
9995,"fun, entertaining movie wwii german spy (julie..."
9996,"give break. how anyone say ""good hockey movie""..."
9997,this movie bad movie. but watching endless ser...
9998,this movie probably made entertain middle scho...


In [19]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

#### Since ML model cannot understand positive,negative encode it 

In [21]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(y)
y

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)


In [24]:
x_train.shape #Baaki 2000 reviews is been given in testing data

(7986, 1)

### Applying BOW

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [35]:
x_train_bow=cv.fit_transform(x_train['review']).toarray() #fit alag alg data par nhi karna hai 
x_test_bow=cv.transform(x_test['review']).toarray()       #Just transform on already fit data

In [36]:
print(x_train_bow.shape,x_test_bow.shape)

(7986, 47874) (1997, 47874)


### Applying NaiveBayes 

In [37]:
from sklearn.naive_bayes import GaussianNB  #ie.Gaussian Naive Bayes
gnb=GaussianNB()

In [38]:
gnb.fit(x_train_bow,y_train)

GaussianNB()

#### Calculating Accuracy of our Model

In [39]:
y_pred=gnb.predict(x_test_bow)

In [40]:
from sklearn.metrics import accuracy_score,confusion_matrix  #62%Accuracy
accuracy_score(y_test,y_pred)

0.628442663995994

In [41]:
confusion_matrix(y_test,y_pred)

array([[697, 288],
       [454, 558]], dtype=int64)

## Applying Randoem Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
rf=RandomForestClassifier() #Random Forest is Giving better Accuracy 
rf.fit(x_train_bow,y_train)
y_pred=rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.8327491236855283

In [48]:
cv=CountVectorizer(max_features=4000)
x_train_bow=cv.fit_transform(x_train['review']).toarray() 
x_test_bow=cv.transform(x_test['review']).toarray()
rf=RandomForestClassifier()
rf.fit(x_train_bow,y_train)
y_pred=rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.8367551326990486

#### Note: Can Apply Hyperparameter Tuning for better Accuracy

## Applying n_grams

In [51]:
cv=CountVectorizer(ngram_range=(6,6),max_features=7000) #Because as we increase ngram our dimension gets inc thus memory err
x_train_bow=cv.fit_transform(x_train['review']).toarray() 
x_test_bow=cv.transform(x_test['review']).toarray()
rf=RandomForestClassifier()
rf.fit(x_train_bow,y_train)
y_pred=rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.5127691537305958

#### Note: Decreased Our Model Performance

## Applying TF-IDF

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
tfidf=TfidfVectorizer()
x_train_tfidf=tfidf.fit_transform(x_train['review']).toarray()
x_test_tfidf=tfidf.transform(x_test['review'])
rf=RandomForestClassifier()
rf.fit(x_train_tfidf,y_train)

RandomForestClassifier()

In [58]:
y_pred=rf.predict(x_test_tfidf)  #A little better accuracy for tf-idf
accuracy_score(y_pred,y_test)

0.8527791687531296