# Practical 2
## Apply preprocessing steps to the selected dataset.

In [5]:
import re
import string as strs

import nltk
import numpy as np
import pandas as pd
import nlp_lib #https://github.com/Sh1vam/nlp_lib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# The star import stays at its original position relative to the explicit nltk
# imports so that whichever names it shadows (or is shadowed by) do not change.
from nlp_lib import *
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

In [2]:
# Load the raw reviews from a CSV file located in the working directory.
df=pd.read_csv("IMDBDS.csv")

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,review,sentiment
0,https://github.com/Sh1vam One of the other rev...,positive
1,A wonderful little production. <br />https://g...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Lower Casing

In [4]:
# Lower-case every review; the 'review' column is overwritten in place.
df['review']=df['review'].str.lower()

In [5]:
# Preview the lower-cased reviews.
df.head()

Unnamed: 0,review,sentiment
0,https://github.com/sh1vam one of the other rev...,positive
1,a wonderful little production. <br />https://g...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Removing HTML Tags

In [6]:
def remove_html_tags(text):
    """Strip HTML-style tags from a review.

    Every span running from ``<`` to the next ``>`` (inclusive) is deleted.
    Values that are not strings are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r"<[^>]*>", '', text)
# Strip HTML tags from every review (overwrites 'review' in place), then preview.
df['review']=df['review'].apply(remove_html_tags)
df.head()

Unnamed: 0,review,sentiment
0,https://github.com/sh1vam one of the other rev...,positive
1,a wonderful little production. https://github....,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Removing URLs

In [7]:
def remove_url(text):
    """Delete http/https/ftp/file URLs from a review.

    A URL is the scheme, ``://`` and the following run of non-whitespace
    characters. One space directly after the URL is removed with it, so
    ``"a http://x b"`` becomes ``"a b"``. URLs at the very end of the text or
    followed by other whitespace (e.g. a newline) are removed as well.

    Args:
        text: The review text. Non-string values are returned unchanged.

    Returns:
        The text without URLs, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # The trailing space is optional: requiring it would leave a URL in place
    # whenever it is the last thing in the text.
    url_pattern = re.compile(r"(?:https?|ftp|file)://\S+ ?")
    return url_pattern.sub("", text)
# Remove URLs from every review (overwrites 'review' in place), then preview.
df['review']=df['review'].apply(remove_url)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. filming techniq...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Removing Punctuation

In [8]:
def remove_punctuations(text):
    """Delete ASCII punctuation from a review and trim surrounding whitespace.

    Args:
        text: The review text.

    Returns:
        The cleaned string; ``np.nan`` for missing values; any other
        non-string value (for example a number) is returned unchanged instead
        of raising, matching how the HTML and URL cleaners treat such cells.
    """
    if isinstance(text, str):
        # Translation table that maps every character in string.punctuation to "deleted".
        return text.translate(str.maketrans('', '', strs.punctuation)).strip()
    if pd.isnull(text):
        return np.nan
    return text
# Drop punctuation from every review (overwrites 'review' in place), then preview.
df['review']=df['review'].apply(remove_punctuations)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production filming techniqu...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### Removing Special Characters 

In [9]:
def removes_specials(text):
    """Keep only ASCII letters, digits and whitespace, then trim both ends.

    Args:
        text: The review text.

    Returns:
        The filtered string, or ``np.nan`` when ``text`` is missing.
    """
    if pd.isnull(text):
        return np.nan
    # One pass with a negated character class gives the same result as first
    # collecting the unwanted characters and then deleting them via a
    # translation table.
    return re.sub(r'[^A-Za-z0-9\s]', "", text).strip()
# Drop special characters from every review (overwrites 'review' in place), then preview.
df['review']=df['review'].apply(removes_specials)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production filming techniqu...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### Removing Non-Printable Characters

In [10]:
def removes_non_printables(text):
    """Drop every character that is not in ``string.printable`` and trim the ends.

    Args:
        text: The review text.

    Returns:
        The filtered string, or ``np.nan`` when ``text`` is missing.
    """
    if pd.isnull(text):
        return np.nan
    printable = set(strs.printable)
    # Filter character by character. Building a "characters to delete" string
    # and calling .strip() on it silently drops leading/trailing Unicode
    # whitespace (e.g. U+00A0) from that set, so those characters would
    # survive in the output.
    return ''.join(ch for ch in text if ch in printable).strip()
# Drop non-printable characters from every review (overwrites 'review' in place), then preview.
df['review']=df['review'].apply(removes_non_printables)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production filming techniqu...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### Defining the English Stop-Word Set

In [11]:
# English stop words as a set, so the membership tests in remove_stopwords are hash lookups.
stop_words = set(stopwords.words('english'))

### Removing Stop Words

In [12]:
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces. Missing values yield ``np.nan``.
    """
    if pd.isnull(text):
        return np.nan
    kept = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept)
# Store the stop-word-free text in a new 'stopwords_removed' column, then preview.
df['stopwords_removed']=df['review'].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...


### Text Tokenization

In [13]:
def tokenized_text(text):
    """Split a review into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: The review text.

    Returns:
        The token list produced by ``word_tokenize``, or ``np.nan`` when
        ``text`` is missing.
    """
    if pd.isnull(text):
        return np.nan
    # Use the name imported at the top of the notebook; the bare `nltk`
    # module is not imported there.
    return word_tokenize(text)
# Tokenize the stop-word-free text into a new 'tokenized' column, then preview.
df['tokenized']=df['stopwords_removed'].apply(tokenized_text)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,..."
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,..."


### Stemming with PorterStemmer

In [14]:
# Single PorterStemmer instance, read as a global by porter_stemming.
ps = PorterStemmer()
def porter_stemming(tokenised_text):
    """Stem each token of a list with the shared ``ps`` stemmer.

    Anything that is not a ``list`` (for example a missing value) yields
    ``np.nan``.
    """
    if not isinstance(tokenised_text, list):
        return np.nan
    stems = []
    for token in tokenised_text:
        stems.append(ps.stem(token))
    return stems
# Store the Porter stems in a new 'porter_stemmed' column, then preview.
df['porter_stemmed']=df['tokenized'].apply(porter_stemming)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized,porter_stemmed
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y..."
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think..."
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st..."


### Applying Other Stemming Functions

In [15]:
# lancaster_stemming is not defined in this notebook; presumably it comes from
# the `from nlp_lib import *` star import — TODO confirm.
df['lancaster_stemmed']=df['tokenized'].apply(lancaster_stemming)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y...","[on, review, ment, watch, 1, oz, episod, youl,..."
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass...","[wond, littl, produc, film, techn, unassum, ol..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[thought, wond, way, spend, tim, hot, sum, wee..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think...","[bas, ther, famy, littl, boy, jak, think, ther..."
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st...","[pet, mat, lov, tim, money, vis, stun, film, w..."


In [16]:
# snowball_stemming is not defined in this notebook; presumably it comes from
# the `from nlp_lib import *` star import — TODO confirm.
df['snowball_stemming']=df['tokenized'].apply(snowball_stemming)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemming
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y...","[on, review, ment, watch, 1, oz, episod, youl,...","[one, review, mention, watch, 1, oz, episod, y..."
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass...","[wond, littl, produc, film, techn, unassum, ol...","[wonder, littl, product, film, techniqu, unass..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[thought, wond, way, spend, tim, hot, sum, wee...","[thought, wonder, way, spend, time, hot, summe..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think...","[bas, ther, famy, littl, boy, jak, think, ther...","[basic, there, famili, littl, boy, jake, think..."
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st...","[pet, mat, lov, tim, money, vis, stun, film, w...","[petter, mattei, love, time, money, visual, st..."


In [17]:
# regexp_stemming is not defined in this notebook; presumably it comes from
# the `from nlp_lib import *` star import — TODO confirm.
df['regexp_stemmed']=df['tokenized'].apply(regexp_stemming)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemming,regexp_stemmed
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y...","[on, review, ment, watch, 1, oz, episod, youl,...","[one, review, mention, watch, 1, oz, episod, y...","[one, reviewer, mentioned, watch, 1, oz, episo..."
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass...","[wond, littl, produc, film, techn, unassum, ol...","[wonder, littl, product, film, techniqu, unass...","[wonderful, littl, production, film, techniqu,..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[thought, wond, way, spend, tim, hot, sum, wee...","[thought, wonder, way, spend, time, hot, summe...","[thought, wonderful, way, spend, tim, hot, sum..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think...","[bas, ther, famy, littl, boy, jak, think, ther...","[basic, there, famili, littl, boy, jake, think...","[basically, there, family, littl, boy, jak, th..."
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st...","[pet, mat, lov, tim, money, vis, stun, film, w...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, lov, tim, money, visually, st..."


### Applying Lemmatization using WordNetLemmatizer

In [18]:
# Single WordNetLemmatizer instance, read as a global by wordnet_lemmatizing.
lemmatizer= WordNetLemmatizer()
def wordnet_lemmatizing(tokenised_text):
    """Lemmatize each token of a list with the shared ``lemmatizer``.

    ``lemmatize`` is called with the token only (no part-of-speech argument).
    Anything that is not a ``list`` yields ``np.nan``.
    """
    if not isinstance(tokenised_text, list):
        return np.nan
    lemmas = []
    for token in tokenised_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
# Store the lemmas in a new 'lemmatized' column, then preview.
df['lemmatized']=df['tokenized'].apply(wordnet_lemmatizing)
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemming,regexp_stemmed,lemmatized
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y...","[on, review, ment, watch, 1, oz, episod, youl,...","[one, review, mention, watch, 1, oz, episod, y...","[one, reviewer, mentioned, watch, 1, oz, episo...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass...","[wond, littl, produc, film, techn, unassum, ol...","[wonder, littl, product, film, techniqu, unass...","[wonderful, littl, production, film, techniqu,...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[thought, wond, way, spend, tim, hot, sum, wee...","[thought, wonder, way, spend, time, hot, summe...","[thought, wonderful, way, spend, tim, hot, sum...","[thought, wonderful, way, spend, time, hot, su..."
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think...","[bas, ther, famy, littl, boy, jak, think, ther...","[basic, there, famili, littl, boy, jake, think...","[basically, there, family, littl, boy, jak, th...","[basically, there, family, little, boy, jake, ..."
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st...","[pet, mat, lov, tim, money, vis, stun, film, w...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, lov, tim, money, visually, st...","[petter, matteis, love, time, money, visually,..."


In [None]:
# 'lemmatized' holds token lists, while removes_specials works on a single
# string (it runs pd.isnull and a regex on its argument). Join each list into
# one space-separated string before cleaning; non-list cells (missing values)
# are passed through unchanged.
df['procss']=df['lemmatized'].apply(
    lambda tokens: removes_specials(' '.join(tokens)) if isinstance(tokens, list) else tokens
)

In [None]:
# Persist all columns to reviews.csv, without the index.
df.to_csv("reviews.csv",index=False)

In [2]:
# Reload the saved results. NOTE(review): the list-valued columns appear to come
# back as their string form (the preview shows quoted items such as "['one', ...")
# — confirm before applying list-based functions to them.
df=pd.read_csv("reviews.csv")

In [3]:
# Preview the reloaded frame, including the 'procss' column.
df.head()

Unnamed: 0,review,sentiment,stopwords_removed,tokenized,porter_stemmed,lancaster_stemmed,snowball_stemming,regexp_stemmed,lemmatized,procss
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...,"['one', 'reviewers', 'mentioned', 'watching', ...","['one', 'review', 'mention', 'watch', '1', 'oz...","['on', 'review', 'ment', 'watch', '1', 'oz', '...","['one', 'review', 'mention', 'watch', '1', 'oz...","['one', 'reviewer', 'mentioned', 'watch', '1',...","['one', 'reviewer', 'mentioned', 'watching', '...",one reviewer mentioned watching 1 oz episode y...
1,a wonderful little production filming techniqu...,positive,wonderful little production filming technique ...,"['wonderful', 'little', 'production', 'filming...","['wonder', 'littl', 'product', 'film', 'techni...","['wond', 'littl', 'produc', 'film', 'techn', '...","['wonder', 'littl', 'product', 'film', 'techni...","['wonderful', 'littl', 'production', 'film', '...","['wonderful', 'little', 'production', 'filming...",wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"['thought', 'wonderful', 'way', 'spend', 'time...","['thought', 'wonder', 'way', 'spend', 'time', ...","['thought', 'wond', 'way', 'spend', 'tim', 'ho...","['thought', 'wonder', 'way', 'spend', 'time', ...","['thought', 'wonderful', 'way', 'spend', 'tim'...","['thought', 'wonderful', 'way', 'spend', 'time...",thought wonderful way spend time hot summer we...
3,basically theres a family where a little boy j...,negative,basically theres family little boy jake thinks...,"['basically', 'theres', 'family', 'little', 'b...","['basic', 'there', 'famili', 'littl', 'boy', '...","['bas', 'ther', 'famy', 'littl', 'boy', 'jak',...","['basic', 'there', 'famili', 'littl', 'boy', '...","['basically', 'there', 'family', 'littl', 'boy...","['basically', 'there', 'family', 'little', 'bo...",basically there family little boy jake think t...
4,petter matteis love in the time of money is a ...,positive,petter matteis love time money visually stunni...,"['petter', 'matteis', 'love', 'time', 'money',...","['petter', 'mattei', 'love', 'time', 'money', ...","['pet', 'mat', 'lov', 'tim', 'money', 'vis', '...","['petter', 'mattei', 'love', 'time', 'money', ...","['petter', 'mattei', 'lov', 'tim', 'money', 'v...","['petter', 'matteis', 'love', 'time', 'money',...",petter matteis love time money visually stunni...


In [6]:
# Lookup from the first letter of a POS tag (N, V, J, R) to the matching WordNet POS constant.
wordnet_map={"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}
def get_wordnet_pos(tag):
    """Translate a POS tag into the corresponding WordNet POS constant.

    Only the first letter of the tag matters: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb. Every other tag (including an empty string)
    falls back to noun.

    Args:
        tag: A POS tag string such as ``'JJ'`` or ``'VBD'``.

    Returns:
        The WordNet POS constant, or ``np.nan`` when ``tag`` is missing.
    """
    if pd.isnull(tag):
        return np.nan
    # Reuse the module-level wordnet_map instead of repeating the same mapping
    # as an if/elif chain. tag[:1] (not tag[0]) keeps an empty tag from raising.
    return wordnet_map.get(tag[:1], wordnet.NOUN)

In [15]:
# WordNetLemmatizer instance, read as a global by word_pos_tag.
wnl = WordNetLemmatizer()
def word_pos_tag(text):
    """Tokenize, drop stop words, lemmatize and POS-tag a piece of text.

    Args:
        text: The text to tag.

    Returns:
        The result of ``pos_tag`` on the lemmatized tokens (a list of
        ``(token, tag)`` pairs), or ``np.nan`` when ``text`` is missing.
    """
    if pd.isnull(text):
        return np.nan
    # Fetch the stop-word list once per call and keep it as a set; calling
    # stopwords.words() inside the comprehension would re-read it for every token.
    english_stopwords = set(stopwords.words('english'))
    # re.split yields empty strings when the text starts or ends with a
    # non-word character; skip them so they are not sent to the tagger.
    tokens = [
        token
        for token in re.split(r'\W+', text)
        if token and token not in english_stopwords
    ]
    lemmas = [wnl.lemmatize(token) for token in tokens]
    # pos_tag is imported directly at the top of the notebook.
    return pos_tag(lemmas)

In [17]:
# Download the English averaged-perceptron tagger data (presumably what pos_tag needs — TODO confirm).
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\PCD\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [25]:
# Tag the first processed review and display the (token, tag) pairs.
a=word_pos_tag(df["procss"][0])
a

[('one', 'CD'),
 ('reviewer', 'NN'),
 ('mentioned', 'VBD'),
 ('watching', 'VBG'),
 ('1', 'CD'),
 ('oz', 'JJ'),
 ('episode', 'NN'),
 ('youll', 'NN'),
 ('hooked', 'VBD'),
 ('right', 'RB'),
 ('exactly', 'RB'),
 ('happened', 'VBD'),
 ('methe', 'NNS'),
 ('first', 'JJ'),
 ('thing', 'NN'),
 ('struck', 'VBD'),
 ('oz', 'JJ'),
 ('brutality', 'NN'),
 ('unflinching', 'VBG'),
 ('scene', 'NN'),
 ('violence', 'NN'),
 ('set', 'VBN'),
 ('right', 'RB'),
 ('word', 'NN'),
 ('go', 'VB'),
 ('trust', 'NN'),
 ('show', 'NN'),
 ('faint', 'NN'),
 ('hearted', 'VBD'),
 ('timid', 'JJ'),
 ('show', 'NN'),
 ('pull', 'JJ'),
 ('punch', 'JJ'),
 ('regard', 'JJ'),
 ('drug', 'NN'),
 ('sex', 'NN'),
 ('violence', 'NN'),
 ('hardcore', 'NN'),
 ('classic', 'JJ'),
 ('use', 'NN'),
 ('wordit', 'NN'),
 ('called', 'VBN'),
 ('oz', 'NN'),
 ('nickname', 'NN'),
 ('given', 'VBN'),
 ('oswald', 'JJ'),
 ('maximum', 'JJ'),
 ('security', 'NN'),
 ('state', 'NN'),
 ('penitentary', 'JJ'),
 ('focus', 'NN'),
 ('mainly', 'RB'),
 ('emerald', 'VBZ'),


In [27]:
# Spot-check the tag mapping: 'JJ' starts with 'J', so the adjective constant is returned.
b=get_wordnet_pos('JJ')
b

'a'

In [38]:
# Tag the first processed review directly, splitting on single spaces only
# (no stop-word removal or lemmatization).
pos_tag(df["procss"][0].split(" "))

[('one', 'CD'),
 ('reviewer', 'NN'),
 ('mentioned', 'VBD'),
 ('watching', 'VBG'),
 ('1', 'CD'),
 ('oz', 'JJ'),
 ('episode', 'NN'),
 ('youll', 'NN'),
 ('hooked', 'VBD'),
 ('right', 'RB'),
 ('exactly', 'RB'),
 ('happened', 'VBD'),
 ('methe', 'NNS'),
 ('first', 'JJ'),
 ('thing', 'NN'),
 ('struck', 'VBD'),
 ('oz', 'JJ'),
 ('brutality', 'NN'),
 ('unflinching', 'VBG'),
 ('scene', 'NN'),
 ('violence', 'NN'),
 ('set', 'VBN'),
 ('right', 'RB'),
 ('word', 'NN'),
 ('go', 'VB'),
 ('trust', 'NN'),
 ('show', 'NN'),
 ('faint', 'NN'),
 ('hearted', 'VBD'),
 ('timid', 'JJ'),
 ('show', 'NN'),
 ('pull', 'JJ'),
 ('punch', 'JJ'),
 ('regard', 'JJ'),
 ('drug', 'NN'),
 ('sex', 'NN'),
 ('violence', 'NN'),
 ('hardcore', 'NN'),
 ('classic', 'JJ'),
 ('use', 'NN'),
 ('wordit', 'NN'),
 ('called', 'VBN'),
 ('oz', 'NN'),
 ('nickname', 'NN'),
 ('given', 'VBN'),
 ('oswald', 'JJ'),
 ('maximum', 'JJ'),
 ('security', 'NN'),
 ('state', 'NN'),
 ('penitentary', 'JJ'),
 ('focus', 'NN'),
 ('mainly', 'RB'),
 ('emerald', 'VBZ'),


In [39]:
# Look up the description of the 'CD' tag.
# NOTE(review): `spacy` is not among the imports at the top of the notebook — confirm where it comes from.
spacy.explain('CD')

'cardinal number'

In [40]:
# Look up the description of the 'VBD' tag.
spacy.explain('VBD')

'verb, past tense'

In [56]:
def displacy_render(text):
    """Parse ``text`` with ``nlp`` and draw its dependency tree inline.

    ``nlp`` and ``displacy`` are not defined in this notebook's visible cells;
    presumably a loaded spaCy pipeline and ``spacy.displacy`` — TODO confirm.
    Returns ``None``.
    """
    render_options = {"distance": 110,"compact":True}
    parsed = nlp(text)
    displacy.render(parsed, style="dep", jupyter=True, options=render_options)

#df['procss'].apply(displacy_render)
displacy_render(df['procss'][0])

In [54]:

# Rendering settings passed to displacy.render by the function below.
options={'distance':110,"compact":True,"color":"black","bg":"white","font":"time"}
def displacy_render(text):
    """Parse ``text`` with ``nlp`` and draw its dependency tree using ``options``.

    This re-uses the name of the earlier ``displacy_render`` cell, so whichever
    definition ran last is the one in effect. Returns ``None``.
    """
    parsed = nlp(text)
    displacy.render(parsed, style="dep", jupyter=True, options=options)
displacy_render(df['review'][0])