<a href="https://colab.research.google.com/github/ShakilM26/Pandas/blob/main/data-cleaning-practice/text_cleaning_nltk_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Normalize Case***

In [20]:
# Normalize the case of words 
# Machine doesn't understand different cases so that we have to make it same case.

daenerys = "I am Daenerys Stormborn of House Targaryen, of the blood of Old Valyeria. I am the dragon's daughter, and I swear to you that those who would harm you will die screaming."

daenerys=daenerys.lower()
daenerys

"i am daenerys stormborn of house targaryen, of the blood of old valyeria. i am the dragon's daughter, and i swear to you that those who would harm you will die screaming."

# ***Removing Stopwords***

In [2]:
# Removing stopwords 
# Those words which cannot provide value in document
# Note: Removing stopwords is not always the best idea!

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'do', 'why', "you'd", 'most', "wouldn't", 'they', 'yourself', 'against', 'haven', 'now', 'and', 'at', 'yours', 'aren', 'an', 'once', 'their', 'because', 'when', 'only', "doesn't", 'to', 'after', "isn't", 'is', 'ma', 'these', 'not', "should've", 'needn', 'm', 'those', 'both', "don't", 'had', 'itself', 'under', 'over', 'nor', 'off', 'until', 'wouldn', 'just', "wasn't", "mustn't", 'himself', 'be', 'she', 'a', 'your', 'what', 'its', 'for', 'as', 're', 'some', 'which', 'hadn', "shouldn't", 'our', 'during', 'didn', 'myself', 'having', 'should', "mightn't", 'with', 'into', "aren't", "won't", 'can', 'than', 'it', 'her', 'here', 'then', 've', 'me', 'y', 'there', 'was', 'will', 'if', 'out', 'while', 'but', 'shan', 'between', 'again', 'o', 'ourselves', 'in', 'each', 'no', 'more', 'been', 'how', "haven't", "you're", 'don', 'herself', 'this', "she's", 'am', "it's", 'i', 'further', 'through', 'did', 'any', 'all', 'hasn', 'before', "that'll", 'such', 'so', 'weren', 'my', 'we', 'themselves', 'that', 

In [4]:
x = "The UK lockdown restrictions will be dropped in the summer so we can go partying again!"

y = " ".join([word for word in daenerys.split() if word not in stop_words])
print(y)

daenerys stormborn house targaryen, blood old valyeria. dragon's daughter, swear would harm die screaming.


# ***Stemming and Lemmatization*** 

In [None]:
# Stemming and Lemmatization are both techniques used to normalize text in NLP. 
# walked, walk and walking are all merely different tenses of the same word. 
# we have to normalize this otherwise, theyâ€™d be treated differently.

In [22]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 

words = ['walk', 'walking', 'walks', 'walked', 'ran', 'run', 'runs', 'running']

# Stemming
stemmer = PorterStemmer()
for word in words:
  print(word + "--->" + stemmer.stem(word))

walk--->walk
walking--->walk
walks--->walk
walked--->walk
ran--->ran
run--->run
runs--->run
running--->run


In [None]:
# Lemmatization

lemmatizer = WordNetLemmatizer()
for word in words:
  print(word + "--->" + lemmatizer.lemmatize(word))

***Unicode string*** 

In [5]:
# Remove Unicode string

unicodes ='Python is good \u200c for machine learning'

# encode it to ascii formats
encodes = unicodes.encode(encoding='ascii', errors='ignore')
# decode the text
decode = encodes.decode()

clean_text = ' '.join([word for word in decode.split()])
print(clean_text)

Python is good for machine learning


# **Regular Expression**

In [21]:
# Removing hashtag, html link, punctuation etc

import re 
lines = 'She @farhana is good in speaking. But ritu @ritu is good at speaking, communication, writing and listening'
y = re.sub('@\S+', '', lines)
y

'She  is good in speaking. But ritu  is good at speaking, communication, writing and listening'

In [7]:
# market tickers, sign

sign = 'Tony gave him $2000 in august. Condition was steve payback $1000 in september and $1000 in october.'
sign = re.sub('\$', '', sign)
sign

'Tony gave him 2000 in august. Condition was steve payback 1000 in september and 1000 in october.'

In [8]:
# remove urls

url = 'Download it from any movie https://www.lotr.com site'
url = re.sub(r'https?:\/\/.\S+', '', url)
print(url) 

Download it from any movie  site


In [9]:
# remove '#'

hash = "Shakil #teaching him since 2022"
hash = re.sub('#', '', hash)
hash

'Shakil teaching him since 2022'

In [10]:
import string 

text = "This is amazing! but add some value, then remove it."
punct = set(string.punctuation) 
text = "".join([ch for ch in text if ch not in punct])
print(text)

This is amazing but add some value then remove it
