<a href="https://colab.research.google.com/github/Prakhar021-hub/Deep-Learning-Notebooks/blob/main/text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [31]:
# Toy dataset for the preprocessing demos: four HTML-formatted product reviews
# plus one free-text rating string per review.
# NOTE(review): the rating strings contain misspellings ("Verrry", "poorr",
# "fabb") -- presumably intentional noisy input; confirm before correcting them.
data = {
    "reviews" : [ "<p><b>Amazing quality!</b> The product exceeded my expectations. <i>Very durable</i>, easy to use, and worth the price. I highly recommend it to everyone.</p>",
                 "<p><b>Disappointed purchase.</b> The build feels cheap, and delivery was delayed. <u>Not satisfied</u> with customer service. Could have been a much better experience overall.</p>",
                  "<p><b>Good value for money.</b> Packaging was neat, product works fine. <i>Not premium</i> quality, but acceptable for the price. Would buy again if needed.</p>",
                  "<p><b>Excellent service!</b> Delivery was lightning fast, packaging was secure, and the product works flawlessly. <i>Highly recommend</i> this seller for a smooth shopping experience.</p>"],
    "rating" : ["excellent", "Verrry good", "poorr", "fabb"]
}

In [32]:
# Wrap the raw dict in a DataFrame with "reviews" and "rating" columns.
df = pd.DataFrame(data)

In [33]:
df

Unnamed: 0,reviews,rating
0,<p><b>Amazing quality!</b> The product exceede...,excellent
1,<p><b>Disappointed purchase.</b> The build fee...,Verrry good
2,<p><b>Good value for money.</b> Packaging was ...,poorr
3,<p><b>Excellent service!</b> Delivery was ligh...,fabb


In [34]:
## Lowercasing
# Preview on the first review only; the result is displayed, not stored.

df["reviews"][0].lower()

'<p><b>amazing quality!</b> the product exceeded my expectations. <i>very durable</i>, easy to use, and worth the price. i highly recommend it to everyone.</p>'

In [35]:
# Similarly for the whole dataset: lowercase every review with the vectorised
# .str accessor and store the result back in the "reviews" column.
df["reviews"] = df["reviews"].str.lower()

In [36]:
df

Unnamed: 0,reviews,rating
0,<p><b>amazing quality!</b> the product exceede...,excellent
1,<p><b>disappointed purchase.</b> the build fee...,Verrry good
2,<p><b>good value for money.</b> packaging was ...,poorr
3,<p><b>excellent service!</b> delivery was ligh...,fabb


In [37]:
# First review (lowercased by this point), kept as a sample input for the HTML tag remover.
tex = df["reviews"][0]

In [38]:
import re

def remove_html_tags(text):
    """Return ``text`` with every HTML/XML-style tag removed.

    A tag is anything from ``<`` up to the next ``>``. The character class
    ``[^>]`` also matches newlines, so a tag whose attributes wrap onto a
    second line is removed as well (a ``.``-based pattern stops at the line
    break and would leave such a tag in the output).

    Parameters
    ----------
    text : str
        Raw text that may contain markup.

    Returns
    -------
    str
        The input with all tags stripped; text between tags is kept as-is.
    """
    return re.sub(r'<[^>]*>', '', text)

In [39]:
remove_html_tags(tex)

'amazing quality! the product exceeded my expectations. very durable, easy to use, and worth the price. i highly recommend it to everyone.'

In [40]:
## for our example
# Strip the HTML tags from every review and store the cleaned text back in the column.

df["reviews"] = df["reviews"].apply(remove_html_tags)

In [41]:
df

Unnamed: 0,reviews,rating
0,amazing quality! the product exceeded my expec...,excellent
1,"disappointed purchase. the build feels cheap, ...",Verrry good
2,"good value for money. packaging was neat, prod...",poorr
3,excellent service! delivery was lightning fast...,fabb


In [42]:
## removing the urls

# Sample sentences with different URL shapes: a "www." prefix, a bare domain
# with a query string, http://, https:// and ftp://.
text1 = "I often use www.google.com when I need to quickly search for something."
text2 = "Here is a useful tutorial link: google.com/search?q=python that helped me understand Python basics."
text3 = "You can find more information about this project at http://example.org/about if you’re interested."
text4 = "My company website is hosted at https://sub.example.com/page/123, and it contains all our product details."
text5 = "The old software version is still available for download at ftp://downloads.example.net/file.txt, but I don’t recommend using it anymore."

import re

def remove_urls(text):
    """Remove URLs from ``text`` and strip leading/trailing whitespace.

    A URL is recognised when it starts with ``http://``, ``https://``,
    ``ftp://`` or ``www.`` (case-insensitive) and runs up to the next
    whitespace. Sentence punctuation glued to the end of a URL
    (``. , ; : ! ?``) is not treated as part of it, so
    ``"see http://a.org/x, then"`` keeps its comma.

    Bare domains without a scheme or ``www.`` prefix (``google.com/...``)
    are left alone: they cannot be told apart reliably from ordinary text.

    Whitespace around a removed URL is left untouched, so a URL in the middle
    of a sentence leaves two consecutive spaces behind.

    Parameters
    ----------
    text : str
        Text that may contain URLs.

    Returns
    -------
    str
        The text without the matched URLs, stripped at both ends.
    """
    # Prefix, then any run of non-space characters that ends on a character
    # which is not sentence punctuation (backtracking releases a trailing
    # "," or "." back to the surrounding text).
    url_pattern = r'(?:(?:https?|ftp)://|www\.)\S*[^\s.,;:!?]'
    return re.sub(url_pattern, '', text, flags=re.IGNORECASE).strip()

In [43]:
remove_urls(text1)


'I often use  when I need to quickly search for something.'

In [44]:
## Removing the punctuations

In [45]:
import string, time

In [46]:
# Characters to strip: string.punctuation (the ASCII punctuation characters).
exclude = string.punctuation

In [47]:
def remove_punctuation(text, chars=string.punctuation):
    """Remove punctuation from ``text`` one character at a time (slow version).

    Runs one full ``str.replace`` pass over the text for every character in
    ``chars``, so it takes noticeable time on a large dataset; a single
    ``str.translate`` call does the same job in one pass.

    Parameters
    ----------
    text : str
        Text to clean.
    chars : str, optional
        Characters to delete. Defaults to ``string.punctuation``, so the
        function does not rely on a global defined in another cell.

    Returns
    -------
    str
        ``text`` with every character in ``chars`` removed.
    """
    for char in chars:
        text = text.replace(char, '')
    return text

In [48]:
# Sample sentence containing "!", ",", "." and "?" for the punctuation demos.
sent = "hello! my name is Prakhar, i am a student. what about you?"

In [49]:
remove_punctuation(sent)

'hello my name is Prakhar i am a student what about you'

In [50]:
def remove_punctuation1(text, chars=string.punctuation):
    """Remove punctuation from ``text`` in a single pass (fast version).

    Builds a translation table that maps every character in ``chars`` to
    "delete" and applies it with ``str.translate``.

    Parameters
    ----------
    text : str
        Text to clean.
    chars : str, optional
        Characters to delete. Defaults to ``string.punctuation``, so the
        function does not rely on a global defined in another cell.

    Returns
    -------
    str
        ``text`` with every character in ``chars`` removed.
    """
    return text.translate(str.maketrans('', '', chars))


In [51]:
remove_punctuation1(sent)

'hello my name is Prakhar i am a student what about you'

In [52]:
## spelling correction

from textblob import TextBlob

# Sample sentence with misspelled words.
incorrect_sentence = "hellop myy namme is prateek niceee tooo meeet u"
textblb = TextBlob(incorrect_sentence)
# The recorded output of this cell shows the correction is imperfect:
# "myy" became "may", "tooo" became "took" and the name "prateek" became "pratzen".
textblb.correct().string


'hello may name is pratzen nice took meet u'

In [53]:
## removing stop words

import nltk
# Fetch the "stopwords" corpus; the recorded output shows it is stored under /root/nltk_data.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [54]:
# Sample sentence for the stop-word removal demo.
sentence = "Bright sunlight filtered gently through the tall green trees as children laughed loudly, chasing butterflies across the meadow with endless joy"

from nltk.corpus import stopwords

def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words dropped.

    The text is split on whitespace, every token whose lowercase form is a
    stop word is discarded, and the remaining tokens are re-joined with single
    spaces. Matching is case-insensitive, so a capitalised "The" at the start
    of a sentence is removed just like "the". Punctuation attached to a token
    (e.g. "the,") is not stripped, so such tokens are kept.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the English list from
        ``nltk.corpus.stopwords`` is used (the corpus must already be
        downloaded).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the list once per call rather than once per word.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; lowercase both sides so the
    # comparison ignores case.
    stop_set = {word.lower() for word in stop_words}
    return " ".join(word for word in text.split() if word.lower() not in stop_set)



In [55]:
remove_stopwords(sentence)

'Bright sunlight filtered gently tall green trees children laughed loudly, chasing butterflies across meadow endless joy'

In [56]:
## Handling emojis: you can either remove them or convert them to text

# removing them
import re

# Code-point ranges treated as emoji. Every range is listed explicitly: one
# broad range such as U+24C2..U+1F251 would also swallow CJK ideographs, kana
# and Hangul, which all sit between those two code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong/cards, enclosed characters, flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Only the symbol blocks listed in ``_EMOJI_PATTERN`` are deleted; letters
    of any script (Latin, CJK, kana, Hangul, ...) are left untouched. The
    list covers the main emoji blocks and is not exhaustive. Whitespace
    around a removed emoji is kept as-is.

    Parameters
    ----------
    text : str
        Text that may contain emoji.

    Returns
    -------
    str
        The text without the matched emoji characters.
    """
    return _EMOJI_PATTERN.sub('', text)


In [57]:
remove_emoji("loved this movie it was 😊")

'loved this movie it was '

In [59]:
## replacing the emogi

!pip install emoji
import emoji
emoji.demojize("Python is fun 😄 🚀 🔥")

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


'Python is fun :grinning_face_with_smiling_eyes: :rocket: :fire:'