<a href="https://colab.research.google.com/github/ShakilM26/Pandas/blob/main/data-cleaning-practice/text_clining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

In [None]:
df = pd.DataFrame({"varsity": ["NSU","NSU","AIUB","AIUB","NSU",np.nan,np.nan,"Ahsanullah", "Ahsanullah", "NSU", np.nan],
                   "player": ["Mou","Shakil","Awdrita","Mintu","Farhana","Hadid","Jenny", "Emi", "Zainab", "Esha", "Nuha"],
                   "score": [81, 84, np.nan, 91, np.nan, 86, 94, 89, 70, 80, 86]})

df 

Unnamed: 0,varsity,player,score
0,NSU,Mou,81.0
1,NSU,Shakil,84.0
2,AIUB,Awdrita,
3,AIUB,Mintu,91.0
4,NSU,Farhana,
5,,Hadid,86.0
6,,Jenny,94.0
7,Ahsanullah,Emi,89.0
8,Ahsanullah,Zainab,70.0
9,NSU,Esha,80.0


In [None]:
df['varsity'].value_counts()
# There are some problem. Output doesn't show the nan values. 
# If we want to see the nan values then we have to use dropna parameter.

NSU           4
AIUB          2
Ahsanullah    2
Name: varsity, dtype: int64

In [None]:
df['varsity'].value_counts(dropna=False)

NSU           4
NaN           3
AIUB          2
Ahsanullah    2
Name: varsity, dtype: int64

In [None]:
# Sometimes we have to fill our nan value using others col value. 

df['varsity'].fillna(df['player'], inplace=True)
df

# df.loc[df["varsity"].isna()==True, "varsity"] = df["player"], also use this

Unnamed: 0,varsity,player,score
0,NSU,Mou,81.0
1,NSU,Shakil,84.0
2,AIUB,Awdrita,
3,AIUB,Mintu,91.0
4,NSU,Farhana,
5,Hadid,Hadid,86.0
6,Jenny,Jenny,94.0
7,Ahsanullah,Emi,89.0
8,Ahsanullah,Zainab,70.0
9,NSU,Esha,80.0


In [None]:
# Using Python dictionaries 

items = {'Mou':'NSU', 'Shakil':'Harvard', 'Awdrita':'Brack', 'Mintu':'MIT', 'Farhana':'Toronto', 'Hadid':'BUP', 'Jenny':'Ahsanullah',
         'Emi':'Harvard','Zainab':'Toronto', 'Esha':'Harvard', 'Nuha':'Toronto'}

In [None]:
df['varsity'] = df['varsity'].replace(items)
df

# df["varsity"] = df["varsity"].replace("Nuha","Toronto")


Unnamed: 0,varsity,player,score
0,NSU,Mou,81.0
1,NSU,Shakil,84.0
2,AIUB,Awdrita,
3,AIUB,Mintu,91.0
4,NSU,Farhana,
5,Ahsanullah,Hadid,86.0
6,Ahsanullah,Jenny,94.0
7,Ahsanullah,Emi,89.0
8,Ahsanullah,Zainab,70.0
9,NSU,Esha,80.0


In [None]:
# The default data type for integers does not support null values so the data
# type is upcasted to float. If it is important for you to represent these values
# as integers, you can use the nullable integer data type. 

In [None]:
df["score"] = df["score"].astype(pd.Int64Dtype())
df

Unnamed: 0,varsity,player,score
0,NSU,Mou,81.0
1,NSU,Shakil,84.0
2,AIUB,Awdrita,
3,AIUB,Mintu,91.0
4,NSU,Farhana,
5,Ahsanullah,Hadid,86.0
6,Ahsanullah,Jenny,94.0
7,Ahsanullah,Emi,89.0
8,Ahsanullah,Zainab,70.0
9,NSU,Esha,80.0


In [8]:
# Normalize the case of words 
# Machine doesn't understand different cases so that we have to make it same case.

daenerys = "I am Daenerys Stormborn of House Targaryen, of the blood of Old Valyeria. I am the dragon's daughter, and I swear to you that those who would harm you will die screaming."

daenerys=daenerys.lower()
daenerys

"i am daenerys stormborn of house targaryen, of the blood of old valyeria. i am the dragon's daughter, and i swear to you that those who would harm you will die screaming."

In [6]:
# Removing stopwords 
# Those words which cannot provide value in document
# Note: Removing stopwords is not always the best idea!

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'any', 'm', 'have', 'mightn', 'in', 'both', 'then', "mightn't", 'been', 'shouldn', 'didn', 'for', 'during', 'mustn', 'them', 'not', 'there', 'the', 'on', "doesn't", 'her', 'only', 'he', 'weren', 'you', 'doesn', "you'll", "you'd", 'those', 'when', "hadn't", 'hasn', 'here', 'it', "wasn't", "needn't", 'each', 'few', "should've", 'aren', 'wasn', 'had', 'his', 'yours', "mustn't", 'does', 'this', 'too', 'but', 'against', 'nor', 'is', 'once', 'through', 'my', 'if', 'or', 'of', 'yourselves', 'theirs', 'be', "shan't", 'very', 'no', 'over', 'where', 'couldn', 't', 'we', "you're", 'were', 'hers', 'herself', "that'll", 'before', 'its', "aren't", "haven't", "weren't", 'these', 'an', "it's", 'll', 'down', 'and', 'was', 'they', 'him', 'i', 'from', 'doing', 'haven', 'such', 'don', 'isn', "hasn't", "didn't", 'me', 'are', 'as', "wouldn't", 'about', 'our', 'at', 'some', 'which', 'am', 'under', 'just', 'myself', 'having', 'y', 'because', 'more', 'shan', 'she', 'will', "isn't", "couldn't", 'until', 'can',

In [9]:
x = "The UK lockdown restrictions will be dropped in the summer so we can go partying again!"

y = " ".join([word for word in daenerys.split() if word not in stop_words])
print(y)

daenerys stormborn house targaryen, blood old valyeria. dragon's daughter, swear would harm die screaming.


In [12]:
# Remove Unicode string

unicodes ='Python is good \u200c for machine learning'

# encode it to ascii formats
encodes = unicodes.encode(encoding='ascii', errors='ignore')
# decode the text
decode = encodes.decode()

clean_text = ' '.join([word for word in decode.split()])
print(clean_text)

Python is good for machine learning


In [18]:
# Removing hashtag, html link, punctuation etc

import re 
lines = 'She@farhana is good in speaking. But ritu @ritu is good at speaking, communication, writing and listening'
y = re.sub('@\S+', '', lines)
y

'She is good in speaking. But ritu  is good at speaking, communication, writing and listening'

In [19]:
# market tickers, sign

sign = 'Tony gave him $2000 in august. Condition was steve payback $1000 in september and $1000 in october.'
sign = re.sub('\$', '', sign)
sign

'Tony gave him 2000 in august. Condition was steve payback 1000 in september and 1000 in october.'

In [26]:
# remove urls

url = 'Download it from any movie https://www.lotr.com site'
url = re.sub(r'https?:\/\/.\S+', '', url)
print(url) 

Download it from any movie  site


In [28]:
# remove '#'

hash = "Shakil #teaching him since 2022"
hash = re.sub('#', '', hash)
hash


'Shakil teaching him since 2022'

In [29]:
import string 

text = "This is amazing! but add some value, then remove it."
punct = set(string.punctuation) 
text = "".join([ch for ch in text if ch not in punct])
print(text)

This is amazing but add some value then remove it


In [None]:
# Stemming and Lemmatization are both techniques used to normalize text in NLP. 
# walked, walk and walking are all merely different tenses of the same word. 
# we have to normalize this otherwise, they’d be treated differently.



In [30]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 

words = ['walk', 'walking', 'walks', 'walked', 'ran', 'run', 'runs', 'running']

# Stemming
stemmer = PorterStemmer()
for word in words:
  print(word + "--->" + stemmer.stem(word))

walk--->walk
walking--->walk
walks--->walk
walked--->walk
ran--->ran
run--->run
runs--->run
running--->run


In [33]:
# Lemmatization

lemmatizer = WordNetLemmatizer()
for word in words:
  print(word + "--->" + lemmatizer.lemmatize(word))

LookupError: ignored