## Installing and Importing Packages

In [1]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.




In [2]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [79]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)

import numpy as np
import re
import string 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Reading the Data

In [4]:
imdb_prelim = pd.read_excel("IMDB_dataset.xlsx")
imdb_prelim.head()

Unnamed: 0,review,sentiment
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative


In [5]:
imdb_prelim['review'].loc[23]

"It had all the clichÃ©s of movies of this type and no substance. The plot went nowhere and at the end of the movie I felt like a sucker for watching it. The production was good; however, the script and acting were B-movie quality. The casting was poor because there were good actors mixed in with crumby actors. The good actors didn't hold their own nor did they lift up the others. <br /><br />This movie is not worthy of more words, but I will say more to meet the minimum requirement of ten lines. James Wood and Cuba Gooding, Jr. play caricatures of themselves in other movies. <br /><br />If you are looking for mindless entertainment, I still wouldn't recommend this movie."

In [6]:
imdb_prelim.describe()

Unnamed: 0,review,sentiment
count,25000,25000
unique,24898,2
top,"When i got this movie free from my job, along with three other similar movies.. I watched then w...",positive
freq,3,12500


In [7]:
imdb_prelim.sentiment.value_counts()

positive    12500
negative    12500
Name: sentiment, dtype: int64

#### We can see that the dataset is balanced.

## Data Preprocessing

### Converting 'reviews' to Lowercase

In [8]:
imdb_prelim['review'] = imdb_prelim['review'].str.lower()
imdb_prelim.head()

Unnamed: 0,review,sentiment
0,"i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive
1,"probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive
2,i sure would like to see a resurrection of a up dated seahunt series with the tech they have tod...,positive
3,"this show was an amazing, fresh & innovative idea in the 70's when it first aired. the first 7 o...",negative
4,encouraged by the positive comments about this film on here i was looking forward to watching th...,negative


### Removing HTML tags

In [9]:
def remove_html_tags (text):
    rmv = re.compile('<.*?>')
    return rmv.sub(r'', text)

In [10]:
imdb_prelim['review'] = imdb_prelim['review'].apply(remove_html_tags)
imdb_prelim['review'].loc[:24]

0     i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...
1     probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...
2     i sure would like to see a resurrection of a up dated seahunt series with the tech they have tod...
3     this show was an amazing, fresh & innovative idea in the 70's when it first aired. the first 7 o...
4     encouraged by the positive comments about this film on here i was looking forward to watching th...
5     phil the alien is one of those quirky films where the humour is based around the oddness of ever...
6     i saw this movie when i was about 12 when it came out. i recall the scariest scene was the big b...
7     so im not a big fan of boll's work but then again not many are. i enjoyed his movie postal (mayb...
8     this a fantastic movie of three prisoners who become famous. one of the actors is george clooney...
9     this movie made it into one of my top 10

### Removing URLs

In [11]:
def remove_url(text):
    re_url = re.compile('https?://\S+|www\.\S+')
    return re_url.sub('', text)

In [12]:
imdb_prelim['review'] = imdb_prelim['review'].apply(remove_url)
imdb_prelim['review'].loc[2336]

"a message movie, but a rather good one. outstanding cast, top to bottom. interesting in that bette davis's plot line is essentially back story! the extremely negative reviews (name throwing at the screenplay/playwright, associating this somehow with extremely negative comments about 'angles in america', etc. etc.) object to the movie being too preachy about germany in wwii. gosh, that is just a bit too sophisticated an understanding of morality for me.theatrical and movie-making, and acting styles vary over time and of course 70 years later this particular movie would not be made in this way. yes casablanca is a better movie (i guess), but although made in the same year and both having nazis in them, casablanca is primarily a love story. the love story in this movie takes second seat to the spy plot--more of a thriller. both have a rather large number of somewhat cheesy accents and wonderful character actors. the children are a bit tedious and could have been edited"

## Removing Punctuations

In [13]:
def remove_punct(text):
    re_punct = "".join([char for char in text if char not in string.punctuation])
    return re_punct

In [14]:
imdb_prelim['review'] = imdb_prelim['review'].apply(remove_punct)
imdb_prelim.head()

Unnamed: 0,review,sentiment
0,i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...,positive
1,probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...,positive
2,i sure would like to see a resurrection of a up dated seahunt series with the tech they have tod...,positive
3,this show was an amazing fresh innovative idea in the 70s when it first aired the first 7 or 8 ...,negative
4,encouraged by the positive comments about this film on here i was looking forward to watching th...,negative


## Tokenization 

In [15]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

In [16]:
imdb_prelim['review_token'] = imdb_prelim['review'].apply(tokenize)
imdb_prelim.head()

Unnamed: 0,review,sentiment,review_token
0,i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...,positive,"[i, thought, this, was, a, wonderful, way, to, spend, time, on, a, too, hot, summer, weekend, si..."
1,probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...,positive,"[probably, my, alltime, favorite, movie, a, story, of, selflessness, sacrifice, and, dedication,..."
2,i sure would like to see a resurrection of a up dated seahunt series with the tech they have tod...,positive,"[i, sure, would, like, to, see, a, resurrection, of, a, up, dated, seahunt, series, with, the, t..."
3,this show was an amazing fresh innovative idea in the 70s when it first aired the first 7 or 8 ...,negative,"[this, show, was, an, amazing, fresh, innovative, idea, in, the, 70s, when, it, first, aired, th..."
4,encouraged by the positive comments about this film on here i was looking forward to watching th...,negative,"[encouraged, by, the, positive, comments, about, this, film, on, here, i, was, looking, forward,..."


## Removing Stopwords

In [17]:
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

In [18]:
def remove_stopwords(text):
    re_stp = [word for word in text if word not in stopwords_english]
    return re_stp

In [19]:
imdb_prelim['review_stopwords'] = imdb_prelim['review_token'].apply(remove_stopwords)
imdb_prelim.head()

Unnamed: 0,review,sentiment,review_token,review_stopwords
0,i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...,positive,"[i, thought, this, was, a, wonderful, way, to, spend, time, on, a, too, hot, summer, weekend, si...","[thought, wonderful, way, spend, time, hot, summer, weekend, sitting, air, conditioned, theater,..."
1,probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...,positive,"[probably, my, alltime, favorite, movie, a, story, of, selflessness, sacrifice, and, dedication,...","[probably, alltime, favorite, movie, story, selflessness, sacrifice, dedication, noble, cause, p..."
2,i sure would like to see a resurrection of a up dated seahunt series with the tech they have tod...,positive,"[i, sure, would, like, to, see, a, resurrection, of, a, up, dated, seahunt, series, with, the, t...","[sure, would, like, see, resurrection, dated, seahunt, series, tech, today, would, bring, back, ..."
3,this show was an amazing fresh innovative idea in the 70s when it first aired the first 7 or 8 ...,negative,"[this, show, was, an, amazing, fresh, innovative, idea, in, the, 70s, when, it, first, aired, th...","[show, amazing, fresh, innovative, idea, 70s, first, aired, first, 7, 8, years, brilliant, thing..."
4,encouraged by the positive comments about this film on here i was looking forward to watching th...,negative,"[encouraged, by, the, positive, comments, about, this, film, on, here, i, was, looking, forward,...","[encouraged, positive, comments, film, looking, forward, watching, film, bad, mistake, ive, seen..."


## Stemming 

In [20]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def perform_stemming(text):
    new_text = [ps.stem(word) for word in text]
    return ' '.join(new_text)

In [22]:
imdb_prelim['review_stemmed'] = imdb_prelim['review_stopwords'].apply(perform_stemming)
imdb_prelim.head()

Unnamed: 0,review,sentiment,review_token,review_stopwords,review_stemmed
0,i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...,positive,"[i, thought, this, was, a, wonderful, way, to, spend, time, on, a, too, hot, summer, weekend, si...","[thought, wonderful, way, spend, time, hot, summer, weekend, sitting, air, conditioned, theater,...",thought wonder way spend time hot summer weekend sit air condit theater watch lightheart comedi ...
1,probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...,positive,"[probably, my, alltime, favorite, movie, a, story, of, selflessness, sacrifice, and, dedication,...","[probably, alltime, favorite, movie, story, selflessness, sacrifice, dedication, noble, cause, p...",probabl alltim favorit movi stori selfless sacrific dedic nobl caus preachi bore never get old d...
2,i sure would like to see a resurrection of a up dated seahunt series with the tech they have tod...,positive,"[i, sure, would, like, to, see, a, resurrection, of, a, up, dated, seahunt, series, with, the, t...","[sure, would, like, see, resurrection, dated, seahunt, series, tech, today, would, bring, back, ...",sure would like see resurrect date seahunt seri tech today would bring back kid excit mei grew b...
3,this show was an amazing fresh innovative idea in the 70s when it first aired the first 7 or 8 ...,negative,"[this, show, was, an, amazing, fresh, innovative, idea, in, the, 70s, when, it, first, aired, th...","[show, amazing, fresh, innovative, idea, 70s, first, aired, first, 7, 8, years, brilliant, thing...",show amaz fresh innov idea 70 first air first 7 8 year brilliant thing drop 1990 show realli fun...
4,encouraged by the positive comments about this film on here i was looking forward to watching th...,negative,"[encouraged, by, the, positive, comments, about, this, film, on, here, i, was, looking, forward,...","[encouraged, positive, comments, film, looking, forward, watching, film, bad, mistake, ive, seen...",encourag posit comment film look forward watch film bad mistak ive seen 950 film truli one worst...


## TF-IDF Vectorization

### Read and Clean the dataset

In [63]:
imdb = pd.read_excel("IMDB_dataset.xlsx")
imdb.head()

Unnamed: 0,review,sentiment
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative


### Create feature for text message length and % of text that is punctuation

In [65]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)

imdb['body_len'] = imdb['review'].apply(lambda x: len(x) - x.count(" "))
imdb['punct%'] = imdb['review'].apply(lambda x: count_punct(x))

imdb.head()

Unnamed: 0,review,sentiment,body_len,punct%
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,761,0.053
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,538,0.052
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,577,0.021
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,761,0.043
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,552,0.056


In [66]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Apply TfidfVectorizer

In [81]:
tfidf_vect_sample = TfidfVectorizer(analyzer=clean_text, max_features=9500)
X_tfidf_sample = tfidf_vect_sample.fit_transform(imdb['review'])
print(X_tfidf_sample.shape)
print(tfidf_vect_sample.get_feature_names_out())

(25000, 9500)
['' '0' '007' ... 'zucker' 'â' 'ã']


In [82]:
X_features = pd.concat([imdb['body_len'], imdb['punct%'], pd.DataFrame(X_tfidf_sample.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,9490,9491,9492,9493,9494,9495,9496,9497,9498,9499
0,761,0.053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,538,0.052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,577,0.021,0.0,0.0,0.0,0.0,0.0,0.099531,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,761,0.043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,552,0.056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Vectorizers output sparse matrices

In [78]:
X_tfidf_df = pd.DataFrame(X_tfidf_sample.toarray())
X_tfidf_df.columns = tfidf_vect_sample.get_feature_names_out()
X_tfidf_df

Unnamed: 0,Unnamed: 1,0,007,010,1,10,100,1000,10000,101,...,zizek,zoey,zombi,zone,zoo,zoom,zorro,zucco,â,ã
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.101944,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
