### Removing HTML Tags

In [1]:
import pandas as pd
import numpy as np

In [2]:
sample_text = "<h1>Welcome to My Website</h1> <p>This is a <strong>sample</strong> HTML page created by Microsoft Copilot.</p>"

In [3]:
import re
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub("",data)

In [4]:
striphtml(sample_text)

'Welcome to My Website This is a sample HTML page created by Microsoft Copilot.'

### Removing Emojis

In [5]:
text_emojis = "Take care and keep smiling! üòäüòÄüòÅ, Feeling sad? It‚Äôs okay, everyone has those days. üåßÔ∏èüò¢ Remember, after the rain comes the rainbow. üåà Keep your chin up! You‚Äôve got this! üí™‚ú®, Hope that adds a bit of spark to your day! üåûüåªüéâ"

In [6]:
text_emojis.encode('utf-8')

b'Take care and keep smiling! \xf0\x9f\x98\x8a\xf0\x9f\x98\x80\xf0\x9f\x98\x81, Feeling sad? It\xe2\x80\x99s okay, everyone has those days. \xf0\x9f\x8c\xa7\xef\xb8\x8f\xf0\x9f\x98\xa2 Remember, after the rain comes the rainbow. \xf0\x9f\x8c\x88 Keep your chin up! You\xe2\x80\x99ve got this! \xf0\x9f\x92\xaa\xe2\x9c\xa8, Hope that adds a bit of spark to your day! \xf0\x9f\x8c\x9e\xf0\x9f\x8c\xbb\xf0\x9f\x8e\x89'

### Spelling Correction

In [7]:
from textblob import TextBlob

In [8]:
text_incorrect = "The quicck brwn foox junpss ovver the lazi dogi."
corrected_text = TextBlob(text_incorrect).correct()
corrected_text

TextBlob("The quick brown foot jumps over the lazy dog.")

### Tokenization

In [9]:
dummy_text = ""
print(dummy_text)

from nltk.tokenize import sent_tokenize, word_tokenize

sents = sent_tokenize(dummy_text)
sents

for sent in sents:
    print(word_tokenize(sent))
    




### Lowercase

In [10]:
df = pd.read_csv("datasets\IMDB Dataset.csv")
df.head()

  df = pd.read_csv("datasets\IMDB Dataset.csv")


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
df['review'][1].lower()

'a wonderful little production. <br /><br />the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. <br /><br />the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well d

In [12]:
df['review'] = df['review'].str.lower()

In [13]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


#### Removing HTML Tags

In [14]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [15]:
text = '<html> <body> A wonderful little production. <br /><br />the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />the actors are extremely well chosen- michael sheen not only "has got all the polari"'

In [16]:
df['review'] = df['review'].apply(remove_html_tags)

In [17]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


### Removing URLS from Text

In [18]:
def remove_urls(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [19]:
url1 = 'Check out my Linkedin Profile https://www.kaggle.com/shakilahmad'
url2 = 'Check out Shakil Khan  Linkedin Profile http://www.kaggle.com/shakilahmad'
url3 = 'Check Google Linkedin Profile www.kaggle.com/shakilahmad'
url4 = 'Check this out my Linkedin Profile https://www.kaggle.com/shakilahmad Also google here www.google.com'


In [20]:
remove_urls(url4)

'Check this out my Linkedin Profile  Also google here '

### Removing Punctuation

In [21]:
# Punctuation Are: @#$%^&*() ETC
import string, time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [22]:
exclude = string.punctuation

def remove_punctuation(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

In [23]:
text2 = 'this a text. and contain, some puctuation !@?'

In [28]:
start = time.time()
print(remove_punctuation(text2))
time1 = time.time() - start
print(time1 * 50000)

this a text and contain some puctuation 
27.477741241455078


In [25]:
# Above method is taking more time if we have large amount of data 
# Another way of removing punctuaiton is:

def remove_punctuation_two(text):
    return text.translate(str.maketrans('','',exclude))

In [33]:
start = time.time()
print(remove_punctuation_two(text2))
time2 = time.time() - start
print(time2 * 50000)

this a text and contain some puctuation 
38.8026237487793


### Chat word Treatment

In [34]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don‚Äôt care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can‚Äôt stop laughing"
}


In [35]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [41]:
chat_conversion('ABC')

'ABC'

### Spelling Correction

In [47]:
from textblob import TextBlob

text_incorrect = "The quicck brwn foxxx junpss ovver the lazi dogi."
corrected_text = TextBlob(text_incorrect)
corrected_text.correct().string

'The quick brown fox jumps over the lazy dog.'

### Removing stop words
we can remove stop words from sentence for better accuracy and machine learning process, but we will need for sentiment analysis and POS tagging , parts of speech tagging

In [48]:
from nltk.corpus import stopwords

In [50]:
en_stopwords = stopwords.words('english')

In [55]:
def remove_stopwords(sentence):
    new_text = []
    for word in sentence.split():
        if word in en_stopwords:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [56]:
remove_stopwords('i will be having good time with frineds to visit a place for enjoyment')

'    good time  frineds  visit  place  enjoyment'