In [1]:
import pandas as pd
import numpy as np
import nltk
import re

### Importing the dataset into the pandas dataframe

In [2]:
df = pd.read_csv('IMDB Dataset.csv', nrows = 10000)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [3]:
df.shape

(10000, 2)

In [4]:
df['sentiment'].value_counts()

positive    5028
negative    4972
Name: sentiment, dtype: int64

### Splitting the dataset into features and target

In [5]:
X = df['review']

In [6]:
Y = df['sentiment']

### Text Preprocessing

In [7]:
sentence = "she is@ a good %girl."
x = re.sub("[^A-Za-z]"," ",sentence)
print(x)

she is  a good  girl 


In [8]:
def sentence_cleaning(sentence):
    x = re.sub("[^A-Za-z]"," ",sentence)
    return x

In [9]:
df['review'] = df['review'].apply(lambda sentence: sentence_cleaning(sentence))

In [10]:
df['review']

0       One of the other reviewers has mentioned that ...
1       A wonderful little production   br    br   The...
2       I thought this was a wonderful way to spend ti...
3       Basically there s a family where a little boy ...
4       Petter Mattei s  Love in the Time of Money  is...
                              ...                        
9995    Fun  entertaining movie about WWII German spy ...
9996    Give me a break  How can anyone say that this ...
9997    This movie is a bad movie  But after watching ...
9998    This is a movie that was probably made to ente...
9999    Smashing film about film making  Shows the int...
Name: review, Length: 10000, dtype: object

### Tokenisation - Converting the entire dataset into individual words or in the form of a list

In [11]:
def Tokenisation(sentence):
    new_sentence = sentence.split(" ")
    return new_sentence

In [12]:
Tokenisation("palash is my name")

['palash', 'is', 'my', 'name']

In [13]:
df['token_sentence'] = df['review'].apply(lambda cleaned_sentence : Tokenisation(cleaned_sentence))

In [14]:
df

Unnamed: 0,review,sentiment,token_sentence
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production br br The...,positive,"[A, wonderful, little, production, , , br, , ,..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there s a family where a little boy ...,negative,"[Basically, there, s, a, family, where, a, lit..."
4,Petter Mattei s Love in the Time of Money is...,positive,"[Petter, Mattei, s, , Love, in, the, Time, of,..."
...,...,...,...
9995,Fun entertaining movie about WWII German spy ...,positive,"[Fun, , entertaining, movie, about, WWII, Germ..."
9996,Give me a break How can anyone say that this ...,negative,"[Give, me, a, break, , How, can, anyone, say, ..."
9997,This movie is a bad movie But after watching ...,negative,"[This, movie, is, a, bad, movie, , But, after,..."
9998,This is a movie that was probably made to ente...,negative,"[This, is, a, movie, that, was, probably, made..."


### Now we will perform removal of unnecessary words

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\palas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

In [17]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
def remove_stopwords(sentence):
    without_stopwords = []
    for i in sentence:
        if i not in stop_words:
            without_stopwords.append(i)
    return without_stopwords

In [19]:
df['token_sentence'] = df['token_sentence'].apply(lambda token_sentence: remove_stopwords(token_sentence))

In [20]:
df

Unnamed: 0,review,sentiment,token_sentence
0,One of the other reviewers has mentioned that ...,positive,"[One, reviewers, mentioned, watching, , , Oz, ..."
1,A wonderful little production br br The...,positive,"[A, wonderful, little, production, , , br, , ,..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, wonderful, way, spend, time, hot,..."
3,Basically there s a family where a little boy ...,negative,"[Basically, family, little, boy, , Jake, , thi..."
4,Petter Mattei s Love in the Time of Money is...,positive,"[Petter, Mattei, , Love, Time, Money, , visual..."
...,...,...,...
9995,Fun entertaining movie about WWII German spy ...,positive,"[Fun, , entertaining, movie, WWII, German, spy..."
9996,Give me a break How can anyone say that this ...,negative,"[Give, break, , How, anyone, say, , good, hock..."
9997,This movie is a bad movie But after watching ...,negative,"[This, movie, bad, movie, , But, watching, end..."
9998,This is a movie that was probably made to ente...,negative,"[This, movie, probably, made, entertain, middl..."


In [21]:
# converting all capital words into lower form 
df['token_sentence'] = df['token_sentence'].apply(lambda sentence : [i.lower() for i in sentence])

### Text Stemming
* Stemming - In this process we reduce the words to their root form. The word may or may not have meaning

In [22]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [23]:
def stemming(sentence):
    stemmed_sentence = []
    for i in sentence:
        root_word = ps.stem(i)
        stemmed_sentence.append(root_word)
    return stemmed_sentence

In [24]:
df.head()

Unnamed: 0,review,sentiment,token_sentence
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, , , oz, ..."
1,A wonderful little production br br The...,positive,"[a, wonderful, little, production, , , br, , ,..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, wonderful, way, spend, time, hot,..."
3,Basically there s a family where a little boy ...,negative,"[basically, family, little, boy, , jake, , thi..."
4,Petter Mattei s Love in the Time of Money is...,positive,"[petter, mattei, , love, time, money, , visual..."


In [25]:
df['stemmed_sentence'] = df['token_sentence'].apply(lambda stemmed_sentence: stemming(stemmed_sentence))

* Lemmatization - this is the process in which you convert the words to their root form without deleting their actual meaning

In [26]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
wnl = WordNetLemmatizer()

In [28]:
def lemmatized_sentence(sentence):
    lemmatized = []
    for i in sentence:
        lemm = wnl.lemmatize(i, pos = 'v')
        lemmatized.append(lemm)
    return lemmatized

In [29]:
df

Unnamed: 0,review,sentiment,token_sentence,stemmed_sentence
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, , , oz, ...","[one, review, mention, watch, , , oz, episod, ..."
1,A wonderful little production br br The...,positive,"[a, wonderful, little, production, , , br, , ,...","[a, wonder, littl, product, , , br, , , , br, ..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, wonderful, way, spend, time, hot,...","[i, thought, wonder, way, spend, time, hot, su..."
3,Basically there s a family where a little boy ...,negative,"[basically, family, little, boy, , jake, , thi...","[basic, famili, littl, boy, , jake, , think, z..."
4,Petter Mattei s Love in the Time of Money is...,positive,"[petter, mattei, , love, time, money, , visual...","[petter, mattei, , love, time, money, , visual..."
...,...,...,...,...
9995,Fun entertaining movie about WWII German spy ...,positive,"[fun, , entertaining, movie, wwii, german, spy...","[fun, , entertain, movi, wwii, german, spi, , ..."
9996,Give me a break How can anyone say that this ...,negative,"[give, break, , how, anyone, say, , good, hock...","[give, break, , how, anyon, say, , good, hocke..."
9997,This movie is a bad movie But after watching ...,negative,"[this, movie, bad, movie, , but, watching, end...","[thi, movi, bad, movi, , but, watch, endless, ..."
9998,This is a movie that was probably made to ente...,negative,"[this, movie, probably, made, entertain, middl...","[thi, movi, probabl, made, entertain, middl, s..."


In [30]:
df['lemmatized_sentence'] = df['stemmed_sentence'].apply(lambda sentence: lemmatized_sentence(sentence))

In [31]:
df.drop('token_sentence', inplace = True, axis = 1)

In [32]:
df

Unnamed: 0,review,sentiment,stemmed_sentence,lemmatized_sentence
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, , , oz, episod, ...","[one, review, mention, watch, , , oz, episod, ..."
1,A wonderful little production br br The...,positive,"[a, wonder, littl, product, , , br, , , , br, ...","[a, wonder, littl, product, , , br, , , , br, ..."
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, wonder, way, spend, time, hot, su...","[i, think, wonder, way, spend, time, hot, summ..."
3,Basically there s a family where a little boy ...,negative,"[basic, famili, littl, boy, , jake, , think, z...","[basic, famili, littl, boy, , jake, , think, z..."
4,Petter Mattei s Love in the Time of Money is...,positive,"[petter, mattei, , love, time, money, , visual...","[petter, mattei, , love, time, money, , visual..."
...,...,...,...,...
9995,Fun entertaining movie about WWII German spy ...,positive,"[fun, , entertain, movi, wwii, german, spi, , ...","[fun, , entertain, movi, wwii, german, spi, , ..."
9996,Give me a break How can anyone say that this ...,negative,"[give, break, , how, anyon, say, , good, hocke...","[give, break, , how, anyon, say, , good, hocke..."
9997,This movie is a bad movie But after watching ...,negative,"[thi, movi, bad, movi, , but, watch, endless, ...","[thi, movi, bad, movi, , but, watch, endless, ..."
9998,This is a movie that was probably made to ente...,negative,"[thi, movi, probabl, made, entertain, middl, s...","[thi, movi, probabl, make, entertain, middl, s..."


### Text Vector Generation

* Bag Of Words

In [33]:
from sklearn.feature_extraction.text import CountVectorizer # aka bag of words

In [34]:
cv = CountVectorizer()

In [35]:
# review_features = cv.fit_transform(df['stemmed_sentence'])
# review_features.get_shape()

#### since i got an error so i need to first transform the listt into a string format 

In [36]:
df['cleaned_text'] = df['stemmed_sentence'].apply(lambda sentence: (" ").join(sentence))

In [37]:
df

Unnamed: 0,review,sentiment,stemmed_sentence,lemmatized_sentence,cleaned_text
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, , , oz, episod, ...","[one, review, mention, watch, , , oz, episod, ...",one review mention watch oz episod hook the...
1,A wonderful little production br br The...,positive,"[a, wonder, littl, product, , , br, , , , br, ...","[a, wonder, littl, product, , , br, , , , br, ...",a wonder littl product br br the film t...
2,I thought this was a wonderful way to spend ti...,positive,"[i, thought, wonder, way, spend, time, hot, su...","[i, think, wonder, way, spend, time, hot, summ...",i thought wonder way spend time hot summer wee...
3,Basically there s a family where a little boy ...,negative,"[basic, famili, littl, boy, , jake, , think, z...","[basic, famili, littl, boy, , jake, , think, z...",basic famili littl boy jake think zombi clos...
4,Petter Mattei s Love in the Time of Money is...,positive,"[petter, mattei, , love, time, money, , visual...","[petter, mattei, , love, time, money, , visual...",petter mattei love time money visual stun fi...
...,...,...,...,...,...
9995,Fun entertaining movie about WWII German spy ...,positive,"[fun, , entertain, movi, wwii, german, spi, , ...","[fun, , entertain, movi, wwii, german, spi, , ...",fun entertain movi wwii german spi juli andr...
9996,Give me a break How can anyone say that this ...,negative,"[give, break, , how, anyon, say, , good, hocke...","[give, break, , how, anyon, say, , good, hocke...",give break how anyon say good hockey movi ...
9997,This movie is a bad movie But after watching ...,negative,"[thi, movi, bad, movi, , but, watch, endless, ...","[thi, movi, bad, movi, , but, watch, endless, ...",thi movi bad movi but watch endless seri bad ...
9998,This is a movie that was probably made to ente...,negative,"[thi, movi, probabl, made, entertain, middl, s...","[thi, movi, probabl, make, entertain, middl, s...",thi movi probabl made entertain middl school ...


In [38]:
review_features = cv.fit_transform(df['cleaned_text'])
review_features.shape

(10000, 35007)

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

#### i for got to ordinal encode the target column my bad

In [40]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [41]:
feature_data = review_features
target_data = le.fit_transform(df['sentiment'])

In [42]:
target_data

array([1, 1, 1, ..., 0, 0, 1])

In [43]:
x_train, x_test, y_train, y_test = train_test_split(feature_data, target_data, random_state = 0, shuffle = False, test_size = 0.1)

In [44]:
print('size of feature data for training', x_train.shape)
print('size of feature data for testing', x_test.shape)
print('size of target data for training', y_train.shape)
print('size of target data for testing', y_test.shape)

size of feature data for training (9000, 35007)
size of feature data for testing (1000, 35007)
size of target data for training (9000,)
size of target data for testing (1000,)


In [45]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

RandomForestClassifier()

In [46]:
y_pred = rfc.predict(x_test)

In [47]:
from sklearn.metrics import roc_auc_score

In [48]:
y_test

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,

In [49]:
print(roc_auc_score(y_test,y_pred))

0.847998207648699


In [50]:
x_train.shape

(9000, 35007)

* TFIDF 

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf_review_vectorizer = TfidfVectorizer()

In [55]:
tfidf_review_features = Tfidf_review_vectorizer.fit_transform(df['cleaned_text'])

In [57]:
tfidf_review_features.shape

(10000, 35007)

In [58]:
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(tfidf_review_features, target_data, test_size = 0.1, random_state = 1, shuffle = False) 

In [60]:
rfc.fit(x_train_1, y_train_1)

RandomForestClassifier()

In [61]:
y_pred_1 = rfc.predict(x_test_1)

In [62]:
print(roc_auc_score(y_test_1,y_pred_1))

0.8508447655740525
