# Text Data PreProcessing Task

In [103]:
import numpy as np
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Sonu
[nltk_data]     Vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sonu
[nltk_data]     Vikas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Sonu
[nltk_data]     Vikas\AppData\Roaming\nltk_data...


True

In [104]:
doc_0='''A tree-toad loved a she-toad
Who lived up in a tree.
He was a two-toed tree-toad,
But a three-toed toad was she.
The two-toed tree-toad tried to win
The three-toed she-toad's heart,
For the two-toed tree-toad loved the ground
That the three-toed tree-toad trod.
But the two-toed tree-toad tried in vain;
He couldn't please her whim.
From her tree-toad bower,
With her three-toed power,
The she-toad vetoed him.'''

In [105]:
doc_1='''Bobby Bibby bought a bat
Bobby Bibby bought a ball
With his bat Bob banged the ball
Banged it bump against the wall
But so boldy Bobby banged it
That he burst his rubber ball
Boo! cried Bobby, Bad luck, ball!
Bad luck, Bobby, Bad Luck ball.
Now to drown his many troubles
Bobby Bibby’s blowing bubbles.
Black Bart was a smart marksman.'''

In [106]:
doc_2='''All I want is a proper cup of coffee,
Made in a proper copper coffee pot
I may be off my dot
But I want a cup of coffee
From a proper coffee pot.

Tin coffee pots and iron coffee pots
They're no use to me -
If I can't have a proper cup of coffee
In a proper copper coffee pot
I'll have a cup of tea.'''

In [107]:
doc_3='''Peter Piper picked a peck of pickled peppers.
A peck of pickled peppers Peter Piper picked.
If Peter Piper picked a peck of pickled peppers,
Where’s the peck of pickled peppers Peter Piper picked?'''

In [108]:
doc_4='''Yellow butter, purple jelly, red jam, black bread.
Spread it thick, say it quick!
Yellow butter, purple jelly, red jam, black bread.
Spread it thicker, say it quicker!
Yellow butter, purple jelly, red jam, black bread.
Don’t eat with your mouth full!'''

In [109]:
df_Org = pd.DataFrame({'docs' : [doc_0 , doc_1 , doc_2 , doc_3 , doc_4]})

In [110]:
df_Org

Unnamed: 0,docs
0,A tree-toad loved a she-toad\nWho lived up in ...
1,Bobby Bibby bought a bat\nBobby Bibby bought a...
2,"All I want is a proper cup of coffee,\nMade in..."
3,Peter Piper picked a peck of pickled peppers.\...
4,"Yellow butter, purple jelly, red jam, black br..."


In [111]:
df = df_Org.copy()

In [112]:
df

Unnamed: 0,docs
0,A tree-toad loved a she-toad\nWho lived up in ...
1,Bobby Bibby bought a bat\nBobby Bibby bought a...
2,"All I want is a proper cup of coffee,\nMade in..."
3,Peter Piper picked a peck of pickled peppers.\...
4,"Yellow butter, purple jelly, red jam, black br..."


# Data Cleaning

In [113]:
def text_cleaning(data , stem = False):
    
    # Removal of html tags
    html_tag = re.sub(r'<.*?>' , '' , data)
    
    # Removal of Special Characters
    spcl_chr = re.sub(r'[^A-z ]','',html_tag)
    
    # handling of newline char(\n)
    newline_tag = re.sub(r'\n',' ',spcl_chr)
    
    # Convertion of the uniform case --> preferble (lower)
    Sentence = newline_tag.lower()
    
    # spliting the data
    tokens = Sentence.split()
    
    # Removing stopwords
    clean_tokens = [ele for ele in tokens if ele not in stopwords.words('english')]
    
    # stemming or lemmatize
    if stem:
        doc_list = [SnowballStemmer('english').stem(i) for i in clean_tokens]
    else:
        doc_list = [WordNetLemmatizer().lemmatize(i) for i in clean_tokens]
    
    return ' '.join(doc_list)

In [115]:
df['Text_cleaning_lemmatizer'] = df['docs'].apply(text_cleaning, stem=False)

In [116]:
df

Unnamed: 0,docs,Text_cleaning_lemmatizer
0,A tree-toad loved a she-toad\nWho lived up in ...,treetoad loved shetoadwho lived treehe twotoed...
1,Bobby Bibby bought a bat\nBobby Bibby bought a...,bobby bibby bought batbobby bibby bought ballw...
2,"All I want is a proper cup of coffee,\nMade in...",want proper cup coffeemade proper copper coffe...
3,Peter Piper picked a peck of pickled peppers.\...,peter piper picked peck pickled peppersa peck ...
4,"Yellow butter, purple jelly, red jam, black br...",yellow butter purple jelly red jam black bread...


In [117]:
df['Text_cleaning_stemmer'] = df['docs'].apply(text_cleaning,stem = True)

In [118]:
df

Unnamed: 0,docs,Text_cleaning_lemmatizer,Text_cleaning_stemmer
0,A tree-toad loved a she-toad\nWho lived up in ...,treetoad loved shetoadwho lived treehe twotoed...,treetoad love shetoadwho live treeh twoto tree...
1,Bobby Bibby bought a bat\nBobby Bibby bought a...,bobby bibby bought batbobby bibby bought ballw...,bobbi bibbi bought batbobbi bibbi bought ballw...
2,"All I want is a proper cup of coffee,\nMade in...",want proper cup coffeemade proper copper coffe...,want proper cup coffeemad proper copper coffe ...
3,Peter Piper picked a peck of pickled peppers.\...,peter piper picked peck pickled peppersa peck ...,peter piper pick peck pickl peppersa peck pick...
4,"Yellow butter, purple jelly, red jam, black br...",yellow butter purple jelly red jam black bread...,yellow butter purpl jelli red jam black breads...


In [119]:
import nltk
from nltk.stem import SnowballStemmer
nltk.download('punkt')

# Sample text
text = "Peter Piper picked a peck of pickled peppers.A peck of pickled peppers Peter Piper picked.If Peter Piper picked a peck of pickled peppers,Where’s the peck of pickled peppers Peter Piper picked?"

# Initialize the Snowball Stemmer for English
stemmer = SnowballStemmer('english')

# Tokenize the text into words
words = nltk.word_tokenize(text)

# Stem each word and join them back into a string
stemmed_words = [stemmer.stem(word) for word in words]
stemmed_text = ' '.join(stemmed_words)

print(stemmed_text)

[nltk_data] Downloading package punkt to C:\Users\Sonu
[nltk_data]     Vikas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


peter piper pick a peck of pickl peppers.a peck of pickl pepper peter piper picked.if peter piper pick a peck of pickl pepper , where ’ s the peck of pickl pepper peter piper pick ?


In [120]:
import nltk
from nltk.stem import WordNetLemmatizer

# Sample text
text = "Peter Piper picked a peck of pickled peppers.A peck of pickled peppers Peter Piper picked.If Peter Piper picked a peck of pickled peppers,Where’s the peck of pickled peppers Peter Piper picked?"

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Tokenize the text into words
words = nltk.word_tokenize(text)

# Lemmatize each word and join them back into a string
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
lemmatized_text = ' '.join(lemmatized_words)

print(lemmatized_text)

Peter Piper picked a peck of pickled peppers.A peck of pickled pepper Peter Piper picked.If Peter Piper picked a peck of pickled pepper , Where ’ s the peck of pickled pepper Peter Piper picked ?


### Uniform

In [123]:
df['docs']=df['docs'].str.lower()
df['docs']

0    a tree-toad loved a she-toad\nwho lived up in ...
1    bobby bibby bought a bat\nbobby bibby bought a...
2    all i want is a proper cup of coffee,\nmade in...
3    peter piper picked a peck of pickled peppers.\...
4    yellow butter, purple jelly, red jam, black br...
Name: docs, dtype: object

### Html Tag

In [124]:
x='<h>Sonu Vikas </h>'
re.sub(r'<.*?>','',x)

'Sonu Vikas '

In [125]:
def htmltag(x):
    return re.sub(r'<.*?>','',x)

In [126]:
df['docs']=df['docs'].apply(htmltag)
df['docs']

0    a tree-toad loved a she-toad\nwho lived up in ...
1    bobby bibby bought a bat\nbobby bibby bought a...
2    all i want is a proper cup of coffee,\nmade in...
3    peter piper picked a peck of pickled peppers.\...
4    yellow butter, purple jelly, red jam, black br...
Name: docs, dtype: object

### Url Remover

In [127]:
def urlre(x):
    return re.sub(r'https?://\S+|www\.S\+','',x)

In [129]:
x='''A tree-toad loved a she-toad,
Who lived up in a tree. For the https://www.twinkl.co.in/teaching-wiki/tongue-twisters-for-kids'''
urlre(x)

'A tree-toad loved a she-toad,\nWho lived up in a tree. For the '

### New Line Handle

In [130]:
x='A tree-toad loved a she-toad,\n Who lived up in a tree,\n He was a two-toed tree-toad'
re.sub(r'\n','',x)

'A tree-toad loved a she-toad, Who lived up in a tree, He was a two-toed tree-toad'

### Removing Special Character

In [131]:
x='A tree-toad #$! loved a she-toad,@ Who lived up in a tree,_+-* He was a two-toed tree-toad'
re.sub(r'[^a-zA-Z]',' ',x)

'A tree toad     loved a she toad   Who lived up in a tree      He was a two toed tree toad'

### Stopwords

In [134]:
x='A tree-toad loved a she-toad Who lived up in a tree  He was a two-toed tree-toad'.casefold().split()
x

['a',
 'tree-toad',
 'loved',
 'a',
 'she-toad',
 'who',
 'lived',
 'up',
 'in',
 'a',
 'tree',
 'he',
 'was',
 'a',
 'two-toed',
 'tree-toad']

In [135]:
z= [i for i in x if i not in stopwords.words('english')]
z

['tree-toad', 'loved', 'she-toad', 'lived', 'tree', 'two-toed', 'tree-toad']

In [137]:
def stopword(x):
    z=[]
    for i in x.casefold().split():
        if i not in stopwords.words('english'):
            z.append(i)
    return ' '.join(z)

In [138]:
df['docs']=df['docs'].apply(stopword)
df['docs']

0    tree-toad loved she-toad lived tree. two-toed ...
1    bobby bibby bought bat bobby bibby bought ball...
2    want proper cup coffee, made proper copper cof...
3    peter piper picked peck pickled peppers. peck ...
4    yellow butter, purple jelly, red jam, black br...
Name: docs, dtype: object

# Regex Function

In [121]:
import re

def convert_numbers_to_word(text):
    pattern = r'\d+'  # Regular expression pattern to match one or more digits
    
    def replace(match):
        return 'number'
    
    converted_text = re.sub(pattern, replace, text)
    
    return converted_text

# Example usage
text = "There are 10 Bus and 5 Truck"
converted_text = convert_numbers_to_word(text)
print(converted_text)

There are number Bus and number Truck


In [122]:
import re

def extract_numbers(text):
    pattern = r'\d+'  # Matches one or more digits
    
    numbers = re.findall(pattern, text)
    
    return numbers

# Example usage
text = "I have 10 Bus and 5 Truck"

numbers = extract_numbers(text)
print(numbers)

['10', '5']
