In [182]:
import pandas as pd
import os
import string 

In [183]:
for dirname, _, filenames in os.walk('../../movie_dataset/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        break 

../../movie_dataset/credits.csv
../../movie_dataset/aclImdb\imdb.vocab
../../movie_dataset/aclImdb\test\labeledBow.feat
../../movie_dataset/aclImdb\test\neg\0_2.txt
../../movie_dataset/aclImdb\test\pos\0_10.txt
../../movie_dataset/aclImdb\train\labeledBow.feat
../../movie_dataset/aclImdb\train\neg\0_3.txt
../../movie_dataset/aclImdb\train\pos\0_9.txt
../../movie_dataset/aclImdb\train\unsup\0_0.txt


In [184]:
# taking only one set of reviews for text preprocessing
reviews = []
data_path = "../../movie_dataset/aclImdb/train/unsup/"
for dirname, _, filenames in os.walk(data_path):
    for filename in filenames[:1000]:
        with open(os.path.join(data_path, filename), "r", encoding="utf8") as f:
            reviews.append(f.read())
print(len(reviews)) 

1000


In [185]:
reviews[0]

'I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<br /><br />One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?'

In [186]:
df = pd.DataFrame(reviews) 
df = df.rename(columns={0:'reviews'})
df.head()

Unnamed: 0,reviews
0,"I admit, the great majority of films released ..."
1,"Take a low budget, inexperienced actors doubli..."
2,"Everybody has seen 'Back To The Future,' right..."
3,Doris Day was an icon of beauty in singing and...
4,"After a series of silly, fun-loving movies, 19..."


In [187]:
df.info

<bound method DataFrame.info of                                                reviews
0    I admit, the great majority of films released ...
1    Take a low budget, inexperienced actors doubli...
2    Everybody has seen 'Back To The Future,' right...
3    Doris Day was an icon of beauty in singing and...
4    After a series of silly, fun-loving movies, 19...
..                                                 ...
995  This has to be one of the best shows ever made...
996  As I post this we are only 2 episodes in, but ...
997  I'm basically recapping what other people have...
998  This was a fun movie. It's more comedy than ac...
999  I'm another one who is waiting for the day thi...

[1000 rows x 1 columns]>

## Lowercasing the reviews in the dataframe 

In [188]:
df["lowercase_reviews"] = df["reviews"].str.lower() 
df.head()

Unnamed: 0,reviews,lowercase_reviews
0,"I admit, the great majority of films released ...","i admit, the great majority of films released ..."
1,"Take a low budget, inexperienced actors doubli...","take a low budget, inexperienced actors doubli..."
2,"Everybody has seen 'Back To The Future,' right...","everybody has seen 'back to the future,' right..."
3,Doris Day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...
4,"After a series of silly, fun-loving movies, 19...","after a series of silly, fun-loving movies, 19..."


In [189]:
df["reviews"][0], df["lowercase_reviews"][0]

('I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<br /><br />One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?',
 'i admit, the great majority of films released before say 1933 are just not for me. of the dozen or so "major" silents i have viewed, one i loved (the crowd), and two were very good (the last command and

## Removal of punctuation 

In [190]:
punc_to_remove = string.punctuation
print(punc_to_remove)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [191]:
df["punc_removed_reviews"] = df["lowercase_reviews"].apply(lambda text: text.translate(str.maketrans('', '', punc_to_remove)))
df.head()

Unnamed: 0,reviews,lowercase_reviews,punc_removed_reviews
0,"I admit, the great majority of films released ...","i admit, the great majority of films released ...",i admit the great majority of films released b...
1,"Take a low budget, inexperienced actors doubli...","take a low budget, inexperienced actors doubli...",take a low budget inexperienced actors doublin...
2,"Everybody has seen 'Back To The Future,' right...","everybody has seen 'back to the future,' right...",everybody has seen back to the future right wh...
3,Doris Day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...
4,"After a series of silly, fun-loving movies, 19...","after a series of silly, fun-loving movies, 19...",after a series of silly funloving movies 1955 ...


In [192]:
df["lowercase_reviews"][0], df["punc_removed_reviews"][0]

('i admit, the great majority of films released before say 1933 are just not for me. of the dozen or so "major" silents i have viewed, one i loved (the crowd), and two were very good (the last command and city lights, that latter chaplin circa 1931).<br /><br />so i was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. i did like the lead actors, but thought little of the film.<br /><br />one intriguing sequence. early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. in the background, perhaps three dozen men pass by, all naked, white and black (wwi ?), and for most, their butts, part or full backside, are shown. was this an early variation of beefcake courtesy of howard hughes?',
 'i admit the great majority of films released before say 1933 are just not for me of the dozen or so major silents i have viewed one i loved the crowd and two were very good the last command and city lig

In [193]:
df = df.rename(columns = {"punc_removed_reviews" : "clean_reviews"})
print(df.head())
print(df.columns)

                                             reviews  \
0  I admit, the great majority of films released ...   
1  Take a low budget, inexperienced actors doubli...   
2  Everybody has seen 'Back To The Future,' right...   
3  Doris Day was an icon of beauty in singing and...   
4  After a series of silly, fun-loving movies, 19...   

                                   lowercase_reviews  \
0  i admit, the great majority of films released ...   
1  take a low budget, inexperienced actors doubli...   
2  everybody has seen 'back to the future,' right...   
3  doris day was an icon of beauty in singing and...   
4  after a series of silly, fun-loving movies, 19...   

                                       clean_reviews  
0  i admit the great majority of films released b...  
1  take a low budget inexperienced actors doublin...  
2  everybody has seen back to the future right wh...  
3  doris day was an icon of beauty in singing and...  
4  after a series of silly funloving movies 1955 ..

## Removal of stopwords 

In [194]:
from nltk.corpus import stopwords 

In [195]:
STOPWORDS = stopwords.words('english')
print(len(STOPWORDS), STOPWORDS[:5])

179 ['i', 'me', 'my', 'myself', 'we']


In [196]:
# .split() splits words separated by space 
df["reviews_sw_removed"] = df["clean_reviews"].apply(lambda text: " ".join([word for word in str(text).split() if word not in STOPWORDS]))
print(df.head()) 

                                             reviews  \
0  I admit, the great majority of films released ...   
1  Take a low budget, inexperienced actors doubli...   
2  Everybody has seen 'Back To The Future,' right...   
3  Doris Day was an icon of beauty in singing and...   
4  After a series of silly, fun-loving movies, 19...   

                                   lowercase_reviews  \
0  i admit, the great majority of films released ...   
1  take a low budget, inexperienced actors doubli...   
2  everybody has seen 'back to the future,' right...   
3  doris day was an icon of beauty in singing and...   
4  after a series of silly, fun-loving movies, 19...   

                                       clean_reviews  \
0  i admit the great majority of films released b...   
1  take a low budget inexperienced actors doublin...   
2  everybody has seen back to the future right wh...   
3  doris day was an icon of beauty in singing and...   
4  after a series of silly funloving movies 19

In [197]:
df["clean_reviews"][0],df["reviews_sw_removed"][0] 

('i admit the great majority of films released before say 1933 are just not for me of the dozen or so major silents i have viewed one i loved the crowd and two were very good the last command and city lights that latter chaplin circa 1931br br so i was apprehensive about this one and humor is often difficult to appreciate uh enjoy decades later i did like the lead actors but thought little of the filmbr br one intriguing sequence early on the guys are supposed to get deloused and for about three minutes fully dressed do some schtick in the background perhaps three dozen men pass by all naked white and black wwi  and for most their butts part or full backside are shown was this an early variation of beefcake courtesy of howard hughes',
 'admit great majority films released say 1933 dozen major silents viewed one loved crowd two good last command city lights latter chaplin circa 1931br br apprehensive one humor often difficult appreciate uh enjoy decades later like lead actors thought li

## Removal of frequent words 

In [198]:
from collections import Counter 
import itertools 
cnt = Counter() 

for text in df["reviews_sw_removed"].values:
    for word in text.split():
        cnt[word] +=1 

freq_words = set([w for w,count in cnt.most_common(10)])

# slicing in set 
set(itertools.islice(freq_words, 3))  

{'br', 'like', 'really'}

In [199]:
df["reviews_wo_freq"] = df["reviews_sw_removed"].apply(lambda text: " ".join([word for word in str(text).split() if word not in freq_words]))
print(df.head()) 

                                             reviews  \
0  I admit, the great majority of films released ...   
1  Take a low budget, inexperienced actors doubli...   
2  Everybody has seen 'Back To The Future,' right...   
3  Doris Day was an icon of beauty in singing and...   
4  After a series of silly, fun-loving movies, 19...   

                                   lowercase_reviews  \
0  i admit, the great majority of films released ...   
1  take a low budget, inexperienced actors doubli...   
2  everybody has seen 'back to the future,' right...   
3  doris day was an icon of beauty in singing and...   
4  after a series of silly, fun-loving movies, 19...   

                                       clean_reviews  \
0  i admit the great majority of films released b...   
1  take a low budget inexperienced actors doublin...   
2  everybody has seen back to the future right wh...   
3  doris day was an icon of beauty in singing and...   
4  after a series of silly funloving movies 19

In [200]:
# set(df["reviews_sw_removed"][0].split()), df["reviews_wo_freq"][0]

In [201]:
set(df["reviews_sw_removed"][1].split()) - set(df["reviews_wo_freq"][1].split()) 

{'film', 'good', 'great', 'movie', 'one', 'really', 'time'}

## Stemming 

- PorterStemmer for english language
- can also use Snowball Stemmer for some european languages 

In [202]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer() 
df["stemmed_reviews"] = df["reviews_wo_freq"].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split()]))
df.head() 


Unnamed: 0,reviews,lowercase_reviews,clean_reviews,reviews_sw_removed,reviews_wo_freq,stemmed_reviews
0,"I admit, the great majority of films released ...","i admit, the great majority of films released ...",i admit the great majority of films released b...,admit great majority films released say 1933 d...,admit majority films released say 1933 dozen m...,admit major film releas say 1933 dozen major s...
1,"Take a low budget, inexperienced actors doubli...","take a low budget, inexperienced actors doubli...",take a low budget inexperienced actors doublin...,take low budget inexperienced actors doubling ...,take low budget inexperienced actors doubling ...,take low budget inexperienc actor doubl produc...
2,"Everybody has seen 'Back To The Future,' right...","everybody has seen 'back to the future,' right...",everybody has seen back to the future right wh...,everybody seen back future right whether like ...,everybody seen back future right whether youve...,everybodi seen back futur right whether youv s...
3,Doris Day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day icon beauty singing acting warm voic...,doris day icon beauty singing acting warm voic...,dori day icon beauti sing act warm voic geniu ...
4,"After a series of silly, fun-loving movies, 19...","after a series of silly, fun-loving movies, 19...",after a series of silly funloving movies 1955 ...,series silly funloving movies 1955 big year do...,series silly funloving movies 1955 big year do...,seri silli funlov movi 1955 big year dori day ...


In [203]:
for word1, word2 in zip(df["reviews_wo_freq"][1].split(), df["stemmed_reviews"][1].split()):
    if word1 != word2:
        print(f"{word1}:{word2}")

inexperienced:inexperienc
actors:actor
doubling:doubl
production:product
limited:limit
chasers:chaser
gives:give
absolutely:absolut
represents:repres
natured:natur
friends:friend
neighbors:neighbor
coming:come
together:togeth
collaborate:collabor
interesting:interest
involved:involv
probably:probabl
terrible:terribl
poorly:poorli
delivered:deliv
hamfisted:hamfist
editing:edit
chasers:chaser
ambition:ambit
imagine:imagin
necessary:necessari
every:everi
takes:take
suggesting:suggest
people:peopl
chasers:chaser
forgive:forgiv
lady:ladi
grocery:groceri
delivery:deliveri
every:everi
wrenched:wrench
jaws:jaw
problematic:problemat


## Lemmatization 

- requires words to be categorizedby a part of speech as well as by inflected form?
    - inflected language: that changes form or ending of some words in which they are used in sentences changes 
- more accurate than stemming
- slower than stemming, so stemming is performed first
    - requires more computational overhead than stemming 

In [204]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer() 
df["lemmatized_reviews"] = df["stemmed_reviews"].apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))
df.head() 

Unnamed: 0,reviews,lowercase_reviews,clean_reviews,reviews_sw_removed,reviews_wo_freq,stemmed_reviews,lemmatized_reviews
0,"I admit, the great majority of films released ...","i admit, the great majority of films released ...",i admit the great majority of films released b...,admit great majority films released say 1933 d...,admit majority films released say 1933 dozen m...,admit major film releas say 1933 dozen major s...,admit major film releas say 1933 dozen major s...
1,"Take a low budget, inexperienced actors doubli...","take a low budget, inexperienced actors doubli...",take a low budget inexperienced actors doublin...,take low budget inexperienced actors doubling ...,take low budget inexperienced actors doubling ...,take low budget inexperienc actor doubl produc...,take low budget inexperienc actor doubl produc...
2,"Everybody has seen 'Back To The Future,' right...","everybody has seen 'back to the future,' right...",everybody has seen back to the future right wh...,everybody seen back future right whether like ...,everybody seen back future right whether youve...,everybodi seen back futur right whether youv s...,everybodi seen back futur right whether youv s...
3,Doris Day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day icon beauty singing acting warm voic...,doris day icon beauty singing acting warm voic...,dori day icon beauti sing act warm voic geniu ...,dori day icon beauti sing act warm voic geniu ...
4,"After a series of silly, fun-loving movies, 19...","after a series of silly, fun-loving movies, 19...",after a series of silly funloving movies 1955 ...,series silly funloving movies 1955 big year do...,series silly funloving movies 1955 big year do...,seri silli funlov movi 1955 big year dori day ...,seri silli funlov movi 1955 big year dori day ...


In [205]:
word_index_to_check = 2
for word1, word2 in zip(df["lemmatized_reviews"][word_index_to_check].split(), df["stemmed_reviews"][word_index_to_check].split()):
    if word1 != word2:
        print(f"{word1}:{word2}")

henchman:henchmen
le:less
plea:pleas


## n-grams 

- contiguous sequences of n words extracted from text
- Unigram, Bigram or Trigram
- to capture contextual information and relationships between words in given text 

In [206]:
def n_grams(text, n):
    word = text.split(' ')
    output = [] 
    for i in range(len(word) - 1):
        output.append(word[i:i+n])
    return output 

In [207]:
demo_txt = "i admit the great majority of films released before say 1933 are just not for me of the dozen or so major silents i have viewed one i loved the crowd"
n_grams(demo_txt, 3)

[['i', 'admit', 'the'],
 ['admit', 'the', 'great'],
 ['the', 'great', 'majority'],
 ['great', 'majority', 'of'],
 ['majority', 'of', 'films'],
 ['of', 'films', 'released'],
 ['films', 'released', 'before'],
 ['released', 'before', 'say'],
 ['before', 'say', '1933'],
 ['say', '1933', 'are'],
 ['1933', 'are', 'just'],
 ['are', 'just', 'not'],
 ['just', 'not', 'for'],
 ['not', 'for', 'me'],
 ['for', 'me', 'of'],
 ['me', 'of', 'the'],
 ['of', 'the', 'dozen'],
 ['the', 'dozen', 'or'],
 ['dozen', 'or', 'so'],
 ['or', 'so', 'major'],
 ['so', 'major', 'silents'],
 ['major', 'silents', 'i'],
 ['silents', 'i', 'have'],
 ['i', 'have', 'viewed'],
 ['have', 'viewed', 'one'],
 ['viewed', 'one', 'i'],
 ['one', 'i', 'loved'],
 ['i', 'loved', 'the'],
 ['loved', 'the', 'crowd'],
 ['the', 'crowd']]

In [210]:
# choosing trigram for the reviews
n = 3
df["n_gram_reviews"] = df["lemmatized_reviews"].apply(lambda text:n_grams(text, n))
df.head() 

Unnamed: 0,reviews,lowercase_reviews,clean_reviews,reviews_sw_removed,reviews_wo_freq,stemmed_reviews,lemmatized_reviews,n_gram_reviews
0,"I admit, the great majority of films released ...","i admit, the great majority of films released ...",i admit the great majority of films released b...,admit great majority films released say 1933 d...,admit majority films released say 1933 dozen m...,admit major film releas say 1933 dozen major s...,admit major film releas say 1933 dozen major s...,"[[admit, major, film], [major, film, releas], ..."
1,"Take a low budget, inexperienced actors doubli...","take a low budget, inexperienced actors doubli...",take a low budget inexperienced actors doublin...,take low budget inexperienced actors doubling ...,take low budget inexperienced actors doubling ...,take low budget inexperienc actor doubl produc...,take low budget inexperienc actor doubl produc...,"[[take, low, budget], [low, budget, inexperien..."
2,"Everybody has seen 'Back To The Future,' right...","everybody has seen 'back to the future,' right...",everybody has seen back to the future right wh...,everybody seen back future right whether like ...,everybody seen back future right whether youve...,everybodi seen back futur right whether youv s...,everybodi seen back futur right whether youv s...,"[[everybodi, seen, back], [seen, back, futur],..."
3,Doris Day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day was an icon of beauty in singing and...,doris day icon beauty singing acting warm voic...,doris day icon beauty singing acting warm voic...,dori day icon beauti sing act warm voic geniu ...,dori day icon beauti sing act warm voic geniu ...,"[[dori, day, icon], [day, icon, beauti], [icon..."
4,"After a series of silly, fun-loving movies, 19...","after a series of silly, fun-loving movies, 19...",after a series of silly funloving movies 1955 ...,series silly funloving movies 1955 big year do...,series silly funloving movies 1955 big year do...,seri silli funlov movi 1955 big year dori day ...,seri silli funlov movi 1955 big year dori day ...,"[[seri, silli, funlov], [silli, funlov, movi],..."


In [211]:
df.to_csv('1000_lemmatized.csv', index=False) 