In [1]:
import re
def remove_html_tags(text):
    """
    This function removes HTML tags from a given text.
    """
    if isinstance(text, str):  # Check if the input is a string
        clean = re.compile('<.*?>')  # This regex matches any HTML tags
        cleaned_text = re.sub(clean, '', text)
        normalized_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return normalized_text  # Replace HTML tags with an empty string

def remove_url(text):
    # Step 1: Use regex to remove URLs
    text_without_url = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Step 2: Remove extra spaces by splitting and joining
    cleaned_text = ' '.join(text_without_url.split())
    
    return cleaned_text

import string

# Define a translation table that maps punctuation to None
translator = str.maketrans('', '', string.punctuation)

chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible',
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

from textblob import TextBlob
def correct_spelling_column(text):
    blob = TextBlob(text)
    corrected_text = blob.correct()
    return str(corrected_text)

In [2]:
text3 = "String, with Punctuation?{}[]+-*&,:;"
text = "<!DOCTYPE html><html><body><h1>My First Heading</h1><p>My first paragraph. </p></body></html> paragraph."
text2 = 'Check out this website: https://www.example.com and also visit http://example.org.'

In [3]:
text3 = text3.translate(translator)  # Remove punctuation
print(text3)
cleaned_text = remove_url(text2)   
print(cleaned_text)
nt = remove_html_tags(text)
print(nt)

String with Punctuation
Check out this website: and also visit
My First HeadingMy first paragraph. paragraph.


In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("D:\WorkSpace\GenAI\GenAI\datasets\IMDB Dataset.csv")

In [6]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
df['review'] = df['review'].str.lower()

In [8]:
df['review'][1]

'a wonderful little production. <br /><br />the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. <br /><br />the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well d

In [9]:
df['review'] = df['review'].apply(remove_html_tags)

In [10]:
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [11]:
df['review'] = df['review'].apply(remove_url)

In [12]:
df['review'] = df['review'].str.translate(translator)

In [13]:
df['review'][1]

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [14]:
chat_conversion('Do this Work fi')

'Do this Work fi'

In [15]:
df['review'] = df['review'].apply(chat_conversion)

In [None]:
df_corrected = correct_spelling(df, 'reviews')

In [135]:
df['review'][1]

'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'

In [139]:


incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

textBlb = TextBlob(incorrect_text)

textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

In [None]:
import pandas as pd
from textblob import TextBlob
from joblib import Parallel, delayed
from collections import defaultdict

# Cache to store previously corrected words
correction_cache = defaultdict(str)

def correct_spelling(text):
    # Check if the correction is already in cache
    if text in correction_cache:
        return correction_cache[text]
    # Perform correction if not cached
    blob = TextBlob(text)
    corrected_text = str(blob.correct())
    correction_cache[text] = corrected_text  # Add to cache
    return corrected_text

# Apply the function using parallel processing
df['review'] = Parallel(n_jobs=-1)(delayed(correct_spelling)(text) for text in df['review'])


In [16]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
len(stopwords.words('english'))

179

In [None]:
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [18]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [19]:
remove_emoji("Loved the movie. It was 😘😘")

'Loved the movie. It was '

In [None]:
import emoji
print(emoji.demojize('Python is 🔥'))

In [None]:
print(emoji.demojize('Loved the movie. It was 😘'))

In [None]:
# word tokenization
sent1 = 'I am going to delhi'
sent1.split()

In [None]:
# sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

In [None]:
# Problems with split function
sent3 = 'I am going to delhi!'
sent3.split()

In [None]:
sent4 = 'Where do think I should go? I have 3 day holiday'
sent4.split('.')

In [None]:
import re
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w']+", sent3)
tokens

In [None]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentences = re.compile('[.!?] ').split(text)
sentences

In [20]:
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
sent1 = 'I am going to visit delhi!'
word_tokenize(sent1)

In [None]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""

sent_tokenize(text)

In [None]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost $10.50'

word_tokenize(sent5)

In [None]:
word_tokenize(sent6)

In [22]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [None]:
doc4 = nlp(sent1)
doc4

In [None]:
for token in doc4:
    print(token)