In [2]:
import pandas as pd

### LowerCasing

In [3]:
df = pd.read_csv("dataset.csv")

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.shape

(50000, 2)

In [6]:
df["review"][5]

'Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. It just never gets old, despite my having seen it some 15 or more times in the last 25 years. Paul Lukas\' performance brings tears to my eyes, and Bette Davis, in one of her very few truly sympathetic roles, is a delight. The kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. And the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. If I had a dozen thumbs, they\'d all be "up" for this movie.'

In [7]:
df["review"][5].lower()

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [8]:
#appying lower case conversion to entire dataset
df["review"].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [9]:
df["review"] = df["review"].str.lower()

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### removing html tags

In [11]:
import re
def removeHTMLtags(text, replacement=''):
    """Remove HTML-like tags from ``text``.

    A tag is the shortest ``<...>`` run that does not cross a line break.

    Args:
        text: String to clean.
        replacement: String inserted literally in place of every tag.
            The default ``''`` deletes the tag outright, which glues the
            neighbouring words together when the tag was their only
            separator (``"me.<br /><br />the"`` -> ``"me.the"``).  Pass
            ``' '`` to keep such words apart.

    Returns:
        ``text`` with every tag replaced by ``replacement``.
    """
    # A callable is used so that backslashes in `replacement` are inserted
    # verbatim instead of being parsed as group references by re.sub.
    return re.sub('<.*?>', lambda _match: replacement, text)

In [12]:
html_string = """
<p>Hello, <b>this is a bold text</b> and <i>this is italic</i>.</p>
<a href="https://example.com">Click here</a> to visit our website.
<ul>
    <li>Item 1</li>
    <li>Item 2</li>
    <li>Item 3</li>
</ul>
"""
print(html_string)


<p>Hello, <b>this is a bold text</b> and <i>this is italic</i>.</p>
<a href="https://example.com">Click here</a> to visit our website.
<ul>
    <li>Item 1</li>
    <li>Item 2</li>
    <li>Item 3</li>
</ul>



In [13]:
removeHTMLtags(html_string)

'\nHello, this is a bold text and this is italic.\nClick here to visit our website.\n\n    Item 1\n    Item 2\n    Item 3\n\n'

In [14]:
# removing html tags from the dataset
df["review"].apply(removeHTMLtags)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [15]:
df["review"] = df["review"].apply(removeHTMLtags)

### removing punctuation

We remove punctuation because, if it is left in, tokenization can either treat each punctuation mark as a separate token or treat a word with attached punctuation (e.g. `good!`) as a different token from the bare word (`good`). Both effects increase the number of tokens and add unnecessary complexity.

In [16]:
import string, time

In [17]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
puntuations = string.punctuation

In [19]:
#function to remove puntuations
def removePuntutation(text):
    """Return ``text`` with every character listed in ``puntuations`` removed.

    Works by calling ``str.replace`` once per punctuation character, so the
    whole string is rescanned for each character in ``puntuations``.
    """
    cleaned = text
    for symbol in puntuations:
        cleaned = cleaned.replace(symbol, '')
    return cleaned

In [20]:
text = "hellow.! How are you.? I am good..!!"

In [21]:
# time.perf_counter() is the high-resolution clock intended for measuring
# short intervals; time.time() is a coarse wall clock and can report an
# elapsed time of exactly 0.0 for a call this fast, which would make the
# later `start1/start2` ratio meaningless or raise ZeroDivisionError.
start = time.perf_counter()
updateText = removePuntutation(text)
start1 = time.perf_counter() - start  # elapsed seconds for the loop-based version

In [22]:
print(f"Original text : \n -->{text}")
print(f"Updated text : \n -->{updateText}")
print(f"Time taken : --> {start1}")

Original text : 
 -->hellow.! How are you.? I am good..!!
Updated text : 
 -->hellow How are you I am good
Time taken : --> 3.4809112548828125e-05


### !!! The loop-based method above is slow at removing punctuation

In [23]:
#much optimized way to remove punctuations
def remove_puntuation(text):
    """Return ``text`` with every character listed in ``puntuations`` removed.

    Builds a translation table that maps each punctuation character to
    "delete" and applies it with a single ``str.translate`` call.
    """
    deletion_table = str.maketrans('', '', puntuations)
    return text.translate(deletion_table)


In [24]:
# time.perf_counter() is the high-resolution clock intended for measuring
# short intervals; time.time() is a coarse wall clock and can report an
# elapsed time of exactly 0.0 for a call this fast, which would make the
# later `start1/start2` ratio raise ZeroDivisionError.
start = time.perf_counter()
updateText = remove_puntuation(text)
start2 = time.perf_counter() - start  # elapsed seconds for the translate-based version

In [25]:
print(f"Original text : \n -->{text}")
print(f"Updated text : \n -->{updateText}")
print(f"Time taken : --> {start2 }")

Original text : 
 -->hellow.! How are you.? I am good..!!
Updated text : 
 -->hellow How are you I am good
Time taken : --> 2.384185791015625e-05


In [26]:
start1/start2

1.46

### Chat word treatment

working with the language used in social media

In [27]:
# Chat abbreviation -> expansion. Keys are upper-case; lookups upper-case the
# word first, so matching is case-insensitive.
chatword = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later"
}

def convert_to_dict(text):
    """Expand the chat abbreviations found in ``text``.

    The text is split on whitespace and each word is looked up in
    ``chatword`` case-insensitively.  A word that does not match as-is is
    retried with its leading/trailing punctuation peeled off, so ``"afk!"``
    becomes ``"Away From Keyboard!"`` instead of being skipped.  Words that
    still do not match are kept unchanged.

    Args:
        text: String to process.

    Returns:
        The processed words joined by single spaces.
    """
    new_text = []
    for w in text.split():
        key = w.upper()
        if key in chatword:
            new_text.append(chatword[key])
            continue

        # Retry without surrounding punctuation ("afk!", "(ASAP)", "bbl,").
        core = w.strip(string.punctuation)
        if core and core.upper() in chatword:
            # Number of punctuation characters stripped from the left side.
            lead = len(w) - len(w.lstrip(string.punctuation))
            new_text.append(w[:lead] + chatword[core.upper()] + w[lead + len(core):])
        else:
            new_text.append(w)  # Keep original word if not found
    return " ".join(new_text)


In [28]:
# Example usage
text = "I will be AFK ATM but BBL"
converted = convert_to_dict(text)

In [29]:
print(f"Original text : \n -->{text}")
print(f"Updated text : \n -->{converted}")

Original text : 
 -->I will be AFK ATM but BBL
Updated text : 
 -->I will be Away From Keyboard At The Moment but Be Back Later


### spelling correction

Spelling errors add complexity: a misspelled word is tokenized as a different token from its correctly spelled form, which causes errors downstream.

In [30]:
from textblob import TextBlob

In [31]:
original = "certain condtions durring several ggeneration are moddified in the samee maner"
textblb = TextBlob(original)
updated = textblb.correct().string

In [32]:
print(f"Original word: \n --> {original}")
print(f"Updated word: \n --> {updated}")

Original word: 
 --> certain condtions durring several ggeneration are moddified in the samee maner
Updated word: 
 --> certain conditions during several generation are modified in the same manner


### removing stop words

In [33]:
from nltk.corpus import stopwords

In [34]:
list_of_stopword = stopwords.words("english")
print(list_of_stopword)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [35]:
def remove_stopwords(text):
    """Drop stop words from ``text``.

    The text is split on whitespace; a word is discarded when its lower-cased
    form appears in ``list_of_stopword``.  Words are compared exactly as
    split, so a stop word with punctuation attached (e.g. ``"who.?"``) is
    kept.

    Args:
        text: String to filter.

    Returns:
        The surviving words joined by single spaces.
    """
    # Build a set once per call: membership tests are then constant-time
    # instead of a linear scan of the list for every single word.
    stopword_set = set(list_of_stopword)
    kept = [word for word in text.split() if word.lower() not in stopword_set]
    return " ".join(kept)

In [36]:
original = "hi i am vivek choudhry, my favourite sports is football, you are who.?"
updated = remove_stopwords(original)

In [37]:
updated

'hi vivek choudhry, favourite sports football, who.?'

In [38]:
df["review"].apply(remove_stopwords)

0        one reviewers mentioned watching 1 oz episode ...
1        wonderful little production. filming technique...
2        thought wonderful way spend time hot summer we...
3        basically there's family little boy (jake) thi...
4        petter mattei's "love time money" visually stu...
                               ...                        
49995    thought movie right good job. creative origina...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    catholic taught parochial elementary schools n...
49998    i'm going disagree previous comment side malti...
49999    one expects star trek movies high art, fans ex...
Name: review, Length: 50000, dtype: object

### tokenization

i.e. breaking the data into small tokens -> these could be words or sentences

### 1) using python split function

In [39]:
#word tokenization using split functin 
word = "I am going to new delhi"
updated_word = word.split(" ")

In [40]:
print(f"Original word : \ng --> {word}")
print(f"Updated word : \n --> {updated_word}")

Original word : 
g --> I am going to new delhi
Updated word : 
 --> ['I', 'am', 'going', 'to', 'new', 'delhi']


In [41]:
word = "I am going to gujrat. My flight will land at 4pm"
updated_word = word.split(" ")

In [42]:
print(f"Original word : \ng --> {word}")
print(f"Updated word : \n --> {updated_word}")

Original word : 
g --> I am going to gujrat. My flight will land at 4pm
Updated word : 
 --> ['I', 'am', 'going', 'to', 'gujrat.', 'My', 'flight', 'will', 'land', 'at', '4pm']


In [43]:
#problem with the split function is that, in some cases, it keeps the exclamation mark attached to the word
word = "I am going to gujrat!. My flight will land at 4pm!!"
updated_word = word.split(" ")

In [44]:
print(f"Original word : \ng --> {word}")
print(f"Updated word : \n --> {updated_word}")

Original word : 
g --> I am going to gujrat!. My flight will land at 4pm!!
Updated word : 
 --> ['I', 'am', 'going', 'to', 'gujrat!.', 'My', 'flight', 'will', 'land', 'at', '4pm!!']


### 2) using regular expression

In [45]:
import re


word = "I am going to gujrat!. My flight will land at 4pm!!"
# Raw string literal: in a normal string "\w" is an invalid escape sequence,
# which Python reports with a DeprecationWarning/SyntaxWarning.
# The pattern keeps runs of word characters and apostrophes, dropping other
# punctuation.
updated_word = re.findall(r"[\w']+", word)

print(f"Original word : \ng --> {word}")
print(f"Updated word : \n --> {updated_word}")


Original word : 
g --> I am going to gujrat!. My flight will land at 4pm!!
Updated word : 
 --> ['I', 'am', 'going', 'to', 'gujrat', 'My', 'flight', 'will', 'land', 'at', '4pm']


### 3) using nltk

In [46]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [47]:
word = "I am going to visit Delhi!"
updated_word = word_tokenize(word)

In [48]:
print(f"Original word : \ng --> {word}")
print(f"Updated word : \n --> {updated_word}")

Original word : 
g --> I am going to visit Delhi!
Updated word : 
 --> ['I', 'am', 'going', 'to', 'visit', 'Delhi', '!']


In [49]:
word = df["review"][0][:300]
updated_word = sent_tokenize(word)

In [50]:
print(f"Original word : \ng --> {word}")
print(f"Updated word : \n --> {updated_word}")

Original word : 
g --> one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go. trust me, this 
Updated word : 
 --> ["one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked.", 'they are right, as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence, which set in right from the word go.', 'trust me, this']


In [52]:
word1 = 'I have a Ph.D in A.I'
word2 = "We're here to help! mail us at vivek084@gmail.com"
word3 = "A 5km ride will cost Rs.500"


In [54]:
word_tokenize(word1)

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']

In [55]:
word_tokenize(word3)

['A', '5km', 'ride', 'will', 'cost', 'Rs.500']

In [56]:
word_tokenize(word2)

['We',
 "'re",
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'at',
 'vivek084',
 '@',
 'gmail.com']

### 4) using spacy

In [None]:
# Load spaCy's small English pipeline into `lp`.
# NOTE: the model is a separate install from the spacy package itself; when it
# is missing, spacy.load raises OSError [E050] (see the saved output below).
# It is normally installed with `python -m spacy download en_core_web_sm`
# -- TODO confirm for this environment.
import spacy 
lp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.