## In NLP text processing is the crucial step, which involves cleaning the text before appling any ML algorithm on it. By doing this we can ensure most accurate results and analysis.

In [8]:
import os
import pandas as pd
import numpy as np

df=pd.read_csv('/content/twitter_training.csv')
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [9]:
# We taking only Text Feature to perform the operations
df=df['im getting on borderlands and i will murder you all ,']

In [10]:
df

Unnamed: 0,"im getting on borderlands and i will murder you all ,"
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...
...,...
74676,Just realized that the Windows partition of my...
74677,Just realized that my Mac window partition is ...
74678,Just realized the windows partition of my Mac ...
74679,Just realized between the windows partition of...


# Lower Case-
By lower casing the entire text in the vocabulary, this wil help to detect the duplicate words. Which will reduce redudancy.

In [11]:
df=df.str.lower()
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lower_sent']=df.str.lower()


Unnamed: 0,"im getting on borderlands and i will murder you all ,"
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...


# Remove HTML Tags
If some cases we scrabed our data from website, so there are chances to get HTML tags. By removing the tags we ensure the quality of the data.

In [12]:
import re
def remove_html(text):
   html_removal=re.compile('<.*?>')
   return html_removal.sub(r'',text)

In [14]:
text=''' is not updated during training,
i.e. it remains as a fixed  For a newly constructed Embedding,
the embedding vector at <div>/ <html>'''
print(remove_html(text))

 is not updated during training,
i.e. it remains as a fixed  For a newly constructed Embedding,
the embedding vector at / 


# Remove URLs

In [15]:
def remove_url(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [16]:
url="Pytorch is the most used library in the current year https://docs.pytorch.org/docs/stable/generated/torch.nn.Embedding.html"
remove_url(url)

'Pytorch is the most used library in the current year '

# Remove Punctuations

In [19]:
import string
data = {'text': ['Hello, world!', 'Python is great.', 'Remove: punctuation, please!']}
df = pd.DataFrame(data)

def remove_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Applying the function correctly
df['clean_text'] = df['text'].apply(remove_punct)
print(df.head())


                           text                 clean_text
0                 Hello, world!                Hello world
1              Python is great.            Python is great
2  Remove: punctuation, please!  Remove punctuation please


# Remove Stopwords

In [None]:
import pandas as pd
from nltk.corpus import stopwords

# Sample DataFrame
data = {'text': ['This is a sample sentence.', 'Stopwords should be removed.', 'We are learning NLP.']}
df = pd.DataFrame(data)

# Loading English stopwords
STOPWORDS = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word.lower() not in STOPWORDS])

# Applying the function to the specific text column
df['clean_text'] = df['text'].apply(remove_stopwords)


# Removal of Emoji

In [22]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [24]:
remove_emoji('This is fire🔥')

'This is fire'

# Stemming

In [25]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])



In [None]:
df.apply(stem_words)

# Lemmatization

In [27]:
import spacy
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def lemmatize_words(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])



In [29]:
nlp("running"),nlp("better"),nlp("walking")

(running, better, walking)

# Spell Correction

In [30]:
from textblob import TextBlob
incorrect="To catherize we need contitancy, we are verry hamble "
text=TextBlob(incorrect)
text.correct().string

'To catherine we need constancy, we are very humble '