Dataset used: IMDB Dataset from Kaggle

In [None]:
import pandas as pd

In [None]:
!unzip /content/archive.zip

Archive:  /content/archive.zip
  inflating: IMDB Dataset.csv        


In [None]:
data_path = '/content/IMDB Dataset.csv'

In [None]:
df1 = pd.read_csv(data_path)
df1.shape

(50000, 2)

In [None]:
df1.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Lower Casing

In [None]:
df1['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [None]:
df1['review'] = df1['review'].str.lower()

In [None]:
df1['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

### Remove HTML Tags

In [None]:
import re

def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)

In [None]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><title> Test </title></body></html>"

In [None]:
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Test '

In [None]:
df1['review'] = df1['review'].apply(remove_html_tags)

In [None]:
df1['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

### Remove URL

In [None]:
def remove_url(text):
    pattern = re.compile(r'https?://[^\s]+')
    return pattern.sub('', text)

In [None]:
text1 = 'Check out my linkedin https://www.linkedin.com/in/pranavbalajirs/'

In [None]:
remove_url(text1)

'Check out my linkedin '

### Punctuation Handling

In [None]:
import string, time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def remove_punc(text):
  for char in exclude:
    text = text.replace(char,'')
  return text

In [None]:
text = 'string. With. Punctuation?'

In [None]:
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1*50000)

string With Punctuation
24.783611297607422


In [None]:
def remove_punc1(text):
  return text.translate(str.maketrans('', '', exclude))

In [None]:
start = time.time()
print(remove_punc1(text))
time2 = time.time() - start
print(time2*50000)

string With Punctuation
25.49886703491211


In [None]:
time1/time2

0.9719495091164095

In [None]:
remove_punc1(df1['review'][5])

'probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble cause but its not preachy or boring it just never gets old despite my having seen it some 15 or more times in the last 25 years paul lukas performance brings tears to my eyes and bette davis in one of her very few truly sympathetic roles is a delight the kids are as grandma says more like dressedup midgets than children but that only makes them more fun to watch and the mothers slow awakening to whats happening in the world and under her own roof is believable and startling if i had a dozen thumbs theyd all be up for this movie'

### Chat Conversion Handle

In [None]:
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible',
    'ATK':'At The Keyboard',
    'ATM':'At The Moment'
}

{
    'FYI':'For Your Information',
    'BTW':'By The Way',
    'OMG':'Oh My God',
    'IMO':'In My Opinion',
    'LOL':'Laughing Out Loud',
    'TTYL':'Talk To You Later',
    'GTG':'Get To Go',
}

{'FYI': 'For Your Information',
 'BTW': 'By The Way',
 'OMG': 'Oh My God',
 'IMO': 'In My Opinion',
 'LOL': 'Laughing Out Loud',
 'TTYL': 'Talk To You Later',
 'GTG': 'Get To Go'}

In [None]:
def chat_conversion(text):
  new_text = []
  for w in text.split():
    if w.upper() in chat_words:
      new_text.append(chat_words[w.upper()])
    else:
      new_text.append(w)
  return " ".join(new_text)

In [None]:
chat_conversion('Do this work ASAP')

'Do this work As Soon As Possible'

### Incorrect Text Handling

In [None]:
from textblob import TextBlob

In [None]:
incorrect_text = 'ceertain conditions duriing several ggeneratioons aree modffiefied in the sunn'
textBlb = TextBlob(incorrect_text)
textBlb.correct().string

'certain conditions during several generations are modffiefied in the sun'

### Stop Words

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
len(stopwords.words('english'))

198

In [None]:
def remove_stopwords(text):
  new_text = []

  for word in text.split():
    if word in stopwords.words('english'):
      new_text.append('')
    else:
      new_text.append(word)

  x = new_text[:]
  new_text.clear()
  return " ".join(x)

In [None]:
remove_stopwords('probably my all time favourite movie, a story of selflesness, sacrifice and dedication to a nob')

'probably   time favourite movie,  story  selflesness, sacrifice  dedication   nob'

In [None]:
df1['review'].apply(remove_stopwords)

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz e...
1,wonderful little production. filming techniq...
2,thought wonderful way spend time hot s...
3,basically there's family little boy (jake) ...
4,"petter mattei's ""love time money"" visuall..."
...,...
49995,thought movie right good job. creative...
49996,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,catholic taught parochial elementary schoo...
49998,going disagree previous comment side m...


### Remove Emoji

In [None]:
import re

def remove_emoji(text):
  emoji_pattern = re.compile("["
                          u"\U0001F600 - \U0001F64F"
                          u"\U0001F300 - \U0001f5ff"
                          u"\U0001F680 - \U0001F6FF"
                          u"\U0001F1E0 - \U0001F1FF"
                          u"\U00002702 - \U000027B0"
                          u"\U000024C2 - \U0001F251"
                          "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r'', text)

In [None]:
remove_emoji("Loved the movie. It was 😗")

'Lovedthemovie.Itwas😗'

In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/590.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import emoji
print(emoji.demojize('I love 🍉'))

I love :watermelon:


### Tokenization

### 1. Using the split function

In [None]:
sent = 'I am going to chennai'
sent.split()

['I', 'am', 'going', 'to', 'chennai']

In [None]:
sent1 = 'I am going to chennai. I will stay there for three days. Let\'s hope the trip to be great'
sent1.split('.')

['I am going to chennai',
 ' I will stay there for three days',
 " Let's hope the trip to be great"]

#### 2. Using Regular expression

In [None]:
# Word level
import re
sent2 = "I am going to chennai"
tokens = re.findall("[\w']+", sent2)
tokens

['I', 'am', 'going', 'to', 'chennai']

In [None]:
# Sentence level
text = """Lorem Ipsum is simply dummy text of the printing typesetting insdustry?
Lorem Ipsum has been the insdustry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scarmbled it to make a type of specimen book."""

sentences = re.compile('[.|?] ').split(text)
sentences

["Lorem Ipsum is simply dummy text of the printing typesetting insdustry?\nLorem Ipsum has been the insdustry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scarmbled it to make a type of specimen book."]

#### 3. NLTK

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# sent1 = 'I am going to chennai'
# word_tokenize(sent1)

In [None]:
# text = """Lorem Ipsum is simply dummy text of the printing typesetting insdustry?
# Lorem Ipsum has been the insdustry's standard dummy text ever since the 1500s,
# when an unknown printer took a galley of type and scarmbled it to make a type of specimen book."""

# sent_tokenize(text)

### Stemming


In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
ps = PorterStemmer()
def stem_word(text):
  return " ".join([ps.stem(word) for word in text.split()])

In [None]:
sample = "walk walks walking walked"
stem_word(sample)

'walk walk walk walk'

### Lemmatization

In [None]:
# import nltk
# from nltk.stem import WordNetLemmatizer
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# wordnet_lemmatizer = WordNetLemmatizer()

# sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the beach."
# punctuations = "?:!.,;"

# sentence_words = nltk.word_tokenize(sentence)
# for word in sentence_words:
#   if word in punctuations:
#     sentence_words.remove(word)

# sentence_words
# print("{0:20}{1:20}".format("Word","Lemma"))
# for word in sentence_words:
#   print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))