# Data Collection

In [4]:
import pandas as pd

In [5]:
# # Download the IMDB dataset directly from a public link
# !wget "https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv" -O "IMDB Dataset.csv"

# # Now verify it is there
# import os
# print(os.listdir())  # You should see 'IMDB Dataset.csv' now

In [6]:
df = pd.read_csv("./data/IMDB Dataset.csv")

In [7]:
df.shape

(50000, 2)

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df = df.head(100)

In [10]:
df.shape

(100, 2)

# Data Preprocessing

## LowerCase


In [11]:
df["review"][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [12]:
df["review"] = df["review"].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## Remove HTML Tags


In [13]:
# re is Regular expression
import re


def remove_html_tags(text):
    pattern = re.compile("<.*?>")
    return pattern.sub("", text)

In [14]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [15]:
df["review"] = df["review"].apply(remove_html_tags)

In [16]:
df["review"][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

## Remove URL

In [17]:
def remove_url(text):
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub("", text)

In [18]:
text1 = "Check out my youtube https://www.youtube.com/dswithbappy dswithbappy"
text2 = "Check out my linkedin https://www.linkedin.com/in/boktiarahmed73/"
text3 = "Google search here www.google.com"
text4 = "For data click https://www.kaggle.com/"

In [19]:
remove_url(text1)

'Check out my youtube  dswithbappy'

## Punctuation Handling

In [20]:
import string, time

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
def remove_punc_slow(text):
    for char in exclude:
        text = text.replace(char, "")
    return text

In [23]:
text = "string. With. Punctuation?"
start = time.time()
ans = remove_punc_slow(text)
end = time.time()
print(ans)
print("Time Taken: ", (end - start) * 50000)

string With Punctuation
Time Taken:  1.5497207641601562


In [24]:
def remove_punc_fast(text):
    return text.translate(str.maketrans("", "", exclude))

In [25]:
text = "string. With. Punctuation?"
start = time.time()
ans = remove_punc_slow(text)
end = time.time()
print(ans)
print("Time Taken: ", (end - start) * 50000)

string With Punctuation
Time Taken:  1.3113021850585938


In [26]:
remove_punc_fast(df["review"][1])

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

## Chat Conversion Short $\rightarrow$ Long Form

In [27]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

In [28]:
def chat_convert(text):
    new_text = []
    for word in text.split():
        if word.upper() in chat_words:
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [31]:
chat_convert("Do this work asap")

'Do this work As Soon As Possible'

## Incorrect Text Handling

In [32]:
from textblob import TextBlob

In [33]:
incorrect_text = "ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner."
textBlb = TextBlob(incorrect_text)
textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

## Stopwords [Filler Words]

In [34]:
from nltk.corpus import stopwords
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prabhjeet1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [35]:
stopwords.words("english")

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [36]:
len(stopwords.words("english"))

198

In [37]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in stopwords.words("english"):
            new_text.append(word)
    return " ".join(new_text)

In [38]:
remove_stopwords(
    "probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. it just never gets old, despite my having seen it some 15 or more times"
)

'probably all-time favorite movie, story selflessness, sacrifice dedication noble cause, preachy boring. never gets old, despite seen 15 times'

In [39]:
df["review"].apply(remove_stopwords)

0     one reviewers mentioned watching 1 oz episode ...
1     wonderful little production. filming technique...
2     thought wonderful way spend time hot summer we...
3     basically there's family little boy (jake) thi...
4     petter mattei's "love time money" visually stu...
                            ...                        
95    daniel day-lewis versatile actor alive. englis...
96    guess would originally going least two parts, ...
97    well, like watch bad horror b-movies, cause th...
98    worst movie ever seen, well as, worst probably...
99    mario fan long remember, fond memories playing...
Name: review, Length: 100, dtype: object

In [40]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive


## Remove Emoji

In [41]:
import re


def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        "\U0001f600-\U0001f64f"  # emoticons
        "\U0001f300-\U0001f5ff"  # symbols & pictographs
        "\U0001f680-\U0001f6ff"  # transport & map symbols
        "\U0001f1e0-\U0001f1ff"  # flags (iOS)
        "\U00002702-\U000027b0"
        "\U000024c2-\U0001f251"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", text)

In [42]:
remove_emoji("Loved the movie üòç")

'Loved the movie '

## Replace Emoji with its text

In [43]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m608.4/608.4 kB[0m [31m5.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [44]:
import emoji

emoji.demojize("Loved the movie üòç")

'Loved the movie :smiling_face_with_heart-eyes:'

# Tokenization

## Word Tokenize

In [45]:
sent1 = "I am going to delhi"
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

## Sentence Tokenize

In [46]:
sent2 = (
    "I am going to delhi. I will stay there for 3 days. Let's hope the trip to be great"
)
sent2.split(".")

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

## Tokenize using NLTK

In [47]:
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prabhjeet1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [48]:
sent1 = "I am going to visit delhi!"
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [51]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

## Tokenize using spaCy
- More Optimized and Better Way For Production

In [54]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [55]:
sent5 = "I have a Ph.D in A.I"
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = "A 5km ride cost $10.50"

In [56]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)

In [57]:
for token in doc1:
    print(token)

I
have
a
Ph
.
D
in
A.I


## Stemmerization


In [58]:
from nltk.stem.porter import PorterStemmer

In [59]:
ps = PorterStemmer()


def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [60]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

## Lemmatization

In [61]:
import nltk
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prabhjeet1/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/prabhjeet1/nltk_data...


True

In [62]:
wordnet_lemmatizer = WordNetLemmatizer()
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos="v")))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
