# Study of essential text pre-processing techniques. Write python script for the essential text preprocessing techniques. Store the preprocessed data into a separate column of .CSV file. Compare the outcomes with and without using libraries for the same.

## Perform the following task with using inbuilt Python Libraries:

In [1]:
!pip install deep_translator emoji

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, deep_translator
Successfully installed deep_translator-1.11.4 emoji-2.14.0


In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from deep_translator import GoogleTranslator
import emoji
import string
import re

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
cd "drive/My Drive/Sem-6/ML/P4"

/content/drive/My Drive/Sem-6/ML/P4


In [8]:
data = pd.read_csv("PTweet_WWE.csv")
data.head()

Unnamed: 0,link,text,name,username,date,is_rt,n_comment,n_rt,n_quote,n_like
0,https://twitter.com/WWE/status/174201779437427...,IF YA SMELL..... @TheRock has come back to #W...,WWE,@WWE,"Jan 2, 2024 · 2:59 AM UTC",False,1088,9916,1856,49998
1,https://twitter.com/WWE/status/174573989977118...,Who had the best Instagram photo of the week?!...,WWE,@WWE,"Jan 12, 2024 · 9:30 AM UTC",False,47,39,3,342
2,https://twitter.com/WWE/status/174567949971749...,These #RoyalRumble crashers were RUTHLESS! ht...,WWE,@WWE,"Jan 12, 2024 · 5:30 AM UTC",False,46,72,2,624
3,https://twitter.com/WWE/status/174564199274113...,An All Mighty moment in the 2023 Men's #RoyalR...,WWE,@WWE,"Jan 12, 2024 · 3:00 AM UTC",False,58,213,17,2454
4,https://twitter.com/WWE/status/174559668762676...,Outta nowhere! 😲,WWE,@WWE,"Jan 12, 2024 · 12:00 AM UTC",False,70,354,10,3853


In [9]:
df = pd.DataFrame(data['text'])
df.head()

Unnamed: 0,text
0,IF YA SMELL..... @TheRock has come back to #W...
1,Who had the best Instagram photo of the week?!...
2,These #RoyalRumble crashers were RUTHLESS! ht...
3,An All Mighty moment in the 2023 Men's #RoyalR...
4,Outta nowhere! 😲


### 1. Lower Casing

In [10]:
# Task 1: Lowercasing
df['lowercased_text'] = df['text'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,text,lowercased_text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...
4,Outta nowhere! 😲,outta nowhere! 😲


### 2. Tokenization

In [11]:
# Task 2: Tokenization
# df['tokens'] = df['lowercased_text'].apply(lambda x: re.findall(r'\b\w+\b', x))
df['tokens'] = df['lowercased_text'].apply(lambda x: word_tokenize(x))
df.head()

Unnamed: 0,text,lowercased_text,tokens
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ..."
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th..."
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle..."
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ..."
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]"


### 3. Punctuation Mark Removal

In [12]:
# Task 3: Punctuation Mark Removal
df['cleaned_text'] = df['tokens'].apply(lambda x: ''.join(char for char in x if char not in string.punctuation))
df.head()

Unnamed: 0,text,lowercased_text,tokens,cleaned_text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲


### 4. Stop Word Removal

In [13]:
# Task 4: Stop Word Removal
stop_words = set(stopwords.words('english'))
df['filtered_text'] = df['tokens'].apply(lambda x: ' '.join(word for word in x if word not in stop_words))
df.head()

Unnamed: 0,text,lowercased_text,tokens,cleaned_text,filtered_text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw,ya smell ..... @ therock come back # wweraw !
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...,best instagram photo week ? ! https : //www.ww...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...,# royalrumble crashers ruthless ! https : //tu...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch,mighty moment 2023 men 's # royalrumble match !
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲,outta nowhere ! 😲


### 5. Stemming

In [14]:
# Task 5: Stemming
stemmer = PorterStemmer()
df['stemmed_Text'] = df['tokens'].apply(lambda x: ' '.join(stemmer.stem(word) for word in x))
df.head()

Unnamed: 0,text,lowercased_text,tokens,cleaned_text,filtered_text,stemmed_Text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw,ya smell ..... @ therock come back # wweraw !,if ya smell ..... @ therock ha come back to # ...
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...,best instagram photo week ? ! https : //www.ww...,who had the best instagram photo of the week ?...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...,# royalrumble crashers ruthless ! https : //tu...,these # royalrumbl crasher were ruthless ! htt...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch,mighty moment 2023 men 's # royalrumble match !,an all mighti moment in the 2023 men 's # roya...
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲,outta nowhere ! 😲,outta nowher ! 😲


### 6. Lemmatization

In [15]:
# Task 6: Lemmatization
lemmatizer = WordNetLemmatizer()
df['lemmatized_text'] = df['tokens'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x))
df.head()

Unnamed: 0,text,lowercased_text,tokens,cleaned_text,filtered_text,stemmed_Text,lemmatized_text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw,ya smell ..... @ therock come back # wweraw !,if ya smell ..... @ therock ha come back to # ...,if ya smell ..... @ therock ha come back to # ...
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...,best instagram photo week ? ! https : //www.ww...,who had the best instagram photo of the week ?...,who had the best instagram photo of the week ?...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...,# royalrumble crashers ruthless ! https : //tu...,these # royalrumbl crasher were ruthless ! htt...,these # royalrumble crasher were ruthless ! ht...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch,mighty moment 2023 men 's # royalrumble match !,an all mighti moment in the 2023 men 's # roya...,an all mighty moment in the 2023 men 's # roya...
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲,outta nowhere ! 😲,outta nowher ! 😲,outta nowhere ! 😲


In [16]:
for index, row in df.iterrows():
  for token in row['tokens']:
    lemmatized_word = lemmatizer.lemmatize(token)
    print(f"Token: {token}, Lemmatized: {lemmatized_word}, Equal: {token == lemmatized_word}")

Token: if, Lemmatized: if, Equal: True
Token: ya, Lemmatized: ya, Equal: True
Token: smell, Lemmatized: smell, Equal: True
Token: ....., Lemmatized: ....., Equal: True
Token: @, Lemmatized: @, Equal: True
Token: therock, Lemmatized: therock, Equal: True
Token: has, Lemmatized: ha, Equal: False
Token: come, Lemmatized: come, Equal: True
Token: back, Lemmatized: back, Equal: True
Token: to, Lemmatized: to, Equal: True
Token: #, Lemmatized: #, Equal: True
Token: wweraw, Lemmatized: wweraw, Equal: True
Token: !, Lemmatized: !, Equal: True
Token: who, Lemmatized: who, Equal: True
Token: had, Lemmatized: had, Equal: True
Token: the, Lemmatized: the, Equal: True
Token: best, Lemmatized: best, Equal: True
Token: instagram, Lemmatized: instagram, Equal: True
Token: photo, Lemmatized: photo, Equal: True
Token: of, Lemmatized: of, Equal: True
Token: the, Lemmatized: the, Equal: True
Token: week, Lemmatized: week, Equal: True
Token: ?, Lemmatized: ?, Equal: True
Token: !, Lemmatized: !, Equal: Tru

In [17]:
equal_counts = []
unequal_counts = []

for index, row in df.iterrows():
  equal_count = 0
  unequal_count = 0
  for token in row['tokens']:
    lemmatized_word = lemmatizer.lemmatize(token)
    if token == lemmatized_word:
      equal_count += 1
    else:
      unequal_count += 1
  equal_counts.append(equal_count)
  unequal_counts.append(unequal_count)

df['simple_count'] = equal_counts
df['complex_count'] = unequal_counts

df.head(40)

Unnamed: 0,text,lowercased_text,tokens,cleaned_text,filtered_text,stemmed_Text,lemmatized_text,simple_count,complex_count
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw,ya smell ..... @ therock come back # wweraw !,if ya smell ..... @ therock ha come back to # ...,if ya smell ..... @ therock ha come back to # ...,12,1
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...,best instagram photo week ? ! https : //www.ww...,who had the best instagram photo of the week ?...,who had the best instagram photo of the week ?...,15,1
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...,# royalrumble crashers ruthless ! https : //tu...,these # royalrumbl crasher were ruthless ! htt...,these # royalrumble crasher were ruthless ! ht...,10,2
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch,mighty moment 2023 men 's # royalrumble match !,an all mighti moment in the 2023 men 's # roya...,an all mighty moment in the 2023 men 's # roya...,13,0
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲,outta nowhere ! 😲,outta nowher ! 😲,outta nowhere ! 😲,4,0
5,Thanks for stopping by the WWE Performance Cen...,thanks for stopping by the wwe performance cen...,"[thanks, for, stopping, by, the, wwe, performa...",thanksforstoppingbythewweperformancecenterdkelce1,thanks stopping wwe performance center @ dkelc...,thank for stop by the wwe perform center @ dke...,thanks for stopping by the wwe performance cen...,11,0
6,The Tribal Chief is unfazed 🤣 @WWERomanReigns...,the tribal chief is unfazed 🤣 @wweromanreigns...,"[the, tribal, chief, is, unfazed, 🤣, @, wwerom...",thetribalchiefisunfazed🤣wweromanreignsheymanhu...,tribal chief unfazed 🤣 @ wweromanreigns @ heym...,the tribal chief is unfaz 🤣 @ wweromanreign @ ...,the tribal chief is unfazed 🤣 @ wweromanreigns...,10,0
7,It was great to have @dkelce1 come out and vis...,it was great to have @dkelce1 come out and vis...,"[it, was, great, to, have, @, dkelce1, come, o...",itwasgreattohavedkelce1comeoutandvisitthewwepe...,great @ dkelce1 come visit wwe performance cen...,it wa great to have @ dkelce1 come out and vis...,it wa great to have @ dkelce1 come out and vis...,16,1
8,Attention @NHLFlyers fans! @GrittyNHL has the...,attention @nhlflyers fans! @grittynhl has the...,"[attention, @, nhlflyers, fans, !, @, grittynh...",attentionnhlflyersfansgrittynhlhasthewwegolden...,attention @ nhlflyers fans ! @ grittynhl # wwe...,attent @ nhlflyer fan ! @ grittynhl ha the # w...,attention @ nhlflyers fan ! @ grittynhl ha the...,22,2
9,The #DustyClassic continues NEXT WEEK on #WWEN...,the #dustyclassic continues next week on #wwen...,"[the, #, dustyclassic, continues, next, week, ...",thedustyclassiccontinuesnextweekonwwenxtwithth...,# dustyclassic continues next week # wwenxt tw...,the # dustyclass continu next week on # wwenxt...,the # dustyclassic continues next week on # ww...,29,1


### 7. Translation

In [18]:
# Task 7: Translation
# translator = google_translator()
df['translated_text'] = df['lowercased_text'].apply(lambda x: GoogleTranslator(source='auto', target='es').translate(x))  # Translate to Spanish
df.head()

Unnamed: 0,text,lowercased_text,tokens,cleaned_text,filtered_text,stemmed_Text,lemmatized_text,simple_count,complex_count,translated_text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw,ya smell ..... @ therock come back # wweraw !,if ya smell ..... @ therock ha come back to # ...,if ya smell ..... @ therock ha come back to # ...,12,1,Si hueles... ¡@therock ha vuelto a #wweraw!
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...,best instagram photo week ? ! https : //www.ww...,who had the best instagram photo of the week ?...,who had the best instagram photo of the week ?...,15,1,¿Quién tuvo la mejor foto de Instagram de la s...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...,# royalrumble crashers ruthless ! https : //tu...,these # royalrumbl crasher were ruthless ! htt...,these # royalrumble crasher were ruthless ! ht...,10,2,¡Estos intrusos del #royalrumble fueron despia...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch,mighty moment 2023 men 's # royalrumble match !,an all mighti moment in the 2023 men 's # roya...,an all mighty moment in the 2023 men 's # roya...,13,0,¡Un momento todopoderoso en el partido #RoyalR...
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲,outta nowhere ! 😲,outta nowher ! 😲,outta nowhere ! 😲,4,0,¡de la nada! 😲


### 8. Emoji to text

In [19]:
# Task 8: Emoji to Text
df['emoji_to_text'] = df['text'].apply(lambda x: emoji.demojize(x))
df.head()

Unnamed: 0,text,lowercased_text,tokens,cleaned_text,filtered_text,stemmed_Text,lemmatized_text,simple_count,complex_count,translated_text,emoji_to_text
0,IF YA SMELL..... @TheRock has come back to #W...,if ya smell..... @therock has come back to #w...,"[if, ya, smell, ....., @, therock, has, come, ...",ifyasmell.....therockhascomebacktowweraw,ya smell ..... @ therock come back # wweraw !,if ya smell ..... @ therock ha come back to # ...,if ya smell ..... @ therock ha come back to # ...,12,1,Si hueles... ¡@therock ha vuelto a #wweraw!,IF YA SMELL..... @TheRock has come back to #W...
1,Who had the best Instagram photo of the week?!...,who had the best instagram photo of the week?!...,"[who, had, the, best, instagram, photo, of, th...",whohadthebestinstagramphotooftheweekhttps//www...,best instagram photo week ? ! https : //www.ww...,who had the best instagram photo of the week ?...,who had the best instagram photo of the week ?...,15,1,¿Quién tuvo la mejor foto de Instagram de la s...,Who had the best Instagram photo of the week?!...
2,These #RoyalRumble crashers were RUTHLESS! ht...,these #royalrumble crashers were ruthless! ht...,"[these, #, royalrumble, crashers, were, ruthle...",theseroyalrumblecrasherswereruthlesshttps//tub...,# royalrumble crashers ruthless ! https : //tu...,these # royalrumbl crasher were ruthless ! htt...,these # royalrumble crasher were ruthless ! ht...,10,2,¡Estos intrusos del #royalrumble fueron despia...,These #RoyalRumble crashers were RUTHLESS! ht...
3,An All Mighty moment in the 2023 Men's #RoyalR...,an all mighty moment in the 2023 men's #royalr...,"[an, all, mighty, moment, in, the, 2023, men, ...",anallmightymomentinthe2023men'sroyalrumblematch,mighty moment 2023 men 's # royalrumble match !,an all mighti moment in the 2023 men 's # roya...,an all mighty moment in the 2023 men 's # roya...,13,0,¡Un momento todopoderoso en el partido #RoyalR...,An All Mighty moment in the 2023 Men's #RoyalR...
4,Outta nowhere! 😲,outta nowhere! 😲,"[outta, nowhere, !, 😲]",outtanowhere😲,outta nowhere ! 😲,outta nowher ! 😲,outta nowhere ! 😲,4,0,¡de la nada! 😲,Outta nowhere! :astonished_face:


## Perform the following task without using inbuilt Python Libraries (The last two task (Translation and Emoji) are not possible without libraies):

In [20]:
import re
import string

# Sample text data
text_data = data.head()['text']

# Task 1: Lowercasing
lowercased_texts = [text.lower() for text in text_data]

# Task 2: Tokenization
tokenized_texts = [re.findall(r'\b\w+\b', text) for text in text_data]

# Task 3: Punctuation Mark Removal
cleaned_texts = [''.join(char for char in text if char not in string.punctuation) for text in text_data]

# Task 4: Stop Word Removal
stop_words = set(["a", "an", "the", "is", "from", "this"])
filtered_texts = [' '.join(word for word in text.split() if word.lower() not in stop_words) for text in text_data]

# Task 5: Stemming
def simple_stemming(text):
    return ' '.join(word[:4] if len(word) > 4 else word for word in text.split())

stemmed_texts = [simple_stemming(text) for text in text_data]

# Task 6: Lemmatization
def simple_lemmatization(text):
    return ' '.join(word[:-2] if word.endswith("es") else word for word in text.split())

lemmatized_texts = [simple_lemmatization(text) for text in text_data]

# Display results
for i in range(len(text_data)):
    print(f"\nOriginal Text: {text_data[i]}")
    print(f"Lowercased Text: {lowercased_texts[i]}")
    print(f"Tokenized Text: {tokenized_texts[i]}")
    print(f"Cleaned Text: {cleaned_texts[i]}")
    print(f"Filtered Text: {filtered_texts[i]}")
    print(f"Stemmed Text: {stemmed_texts[i]}")
    print(f"Lemmatized Text: {lemmatized_texts[i]}")



Original Text: IF YA SMELL.....  @TheRock has come back to #WWERaw!
Lowercased Text: if ya smell.....  @therock has come back to #wweraw!
Tokenized Text: ['IF', 'YA', 'SMELL', 'TheRock', 'has', 'come', 'back', 'to', 'WWERaw']
Cleaned Text: IF YA SMELL  TheRock has come back to WWERaw
Filtered Text: IF YA SMELL..... @TheRock has come back to #WWERaw!
Stemmed Text: IF YA SMEL @The has come back to #WWE
Lemmatized Text: IF YA SMELL..... @TheRock has come back to #WWERaw!

Original Text: Who had the best Instagram photo of the week?!  https://www.wwe.com/gallery/the-25-best-instagram-photos-of-the-week-january-7-2024#fid-40650941
Lowercased Text: who had the best instagram photo of the week?!  https://www.wwe.com/gallery/the-25-best-instagram-photos-of-the-week-january-7-2024#fid-40650941
Tokenized Text: ['Who', 'had', 'the', 'best', 'Instagram', 'photo', 'of', 'the', 'week', 'https', 'www', 'wwe', 'com', 'gallery', 'the', '25', 'best', 'instagram', 'photos', 'of', 'the', 'week', 'january

In [21]:
!pip install inflect



In [22]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

sentence = "The cat sat on the mat."
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
print("POS Tagging:")
for token, tag in tagged_tokens:
    print(f'{token} -> {tag}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


POS Tagging:
The -> DT
cat -> NN
sat -> VBD
on -> IN
the -> DT
mat -> NN
. -> .


In [23]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
for tokens in tokenized_texts:
  tagged_tokens = pos_tag(tokens)
  print("POS Tagging:")
  for token, tag in tagged_tokens:
      print(f'{token} -> {tag}')
  print("\nWordNet Exploration:")
  word = tokens[0]

  synsets = wn.synsets(word)
  print(f"\nSynsets of '{word}':")
  for synset in synsets:
      print(f"- {synset.name()}: {synset.definition()}")
  print(f"\nLemmas of '{word}':")
  for synset in synsets:
      lemmas = synset.lemmas()
      for lemma in lemmas:
          print(f"- {lemma.name()}")
  print(f"\nHypernyms of '{word}':")
  for synset in synsets:
      hypernyms = synset.hypernyms()
      for hypernym in hypernyms:
          print(f"- {synset.name()} -> {hypernym.name()}")
  print(f"\nHyponyms of '{word}':")
  for synset in synsets:
      hyponyms = synset.hyponyms()
      for hyponym in hyponyms:
          print(f"- {synset.name()} -> {hyponym.name()}")
  print(f"\nExample sentences for '{word}':")
  for synset in synsets:
      examples = synset.examples()
      for example in examples:
          print(f"- {synset.name()}: {example}")

POS Tagging:
IF -> NNP
YA -> NNP
SMELL -> NNP
TheRock -> NNP
has -> VBZ
come -> VBN
back -> RB
to -> TO
WWERaw -> NNP

WordNet Exploration:

Synsets of 'IF':

Lemmas of 'IF':

Hypernyms of 'IF':

Hyponyms of 'IF':

Example sentences for 'IF':
POS Tagging:
Who -> WP
had -> VBD
the -> DT
best -> JJS
Instagram -> NNP
photo -> NN
of -> IN
the -> DT
week -> NN
https -> NN
www -> NN
wwe -> NN
com -> NN
gallery -> VBD
the -> DT
25 -> CD
best -> JJS
instagram -> NN
photos -> NN
of -> IN
the -> DT
week -> NN
january -> JJ
7 -> CD
2024 -> CD
fid -> NN
40650941 -> CD

WordNet Exploration:


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



Synsets of 'Who':
- world_health_organization.n.01: a United Nations agency to coordinate international health activities and to help governments improve health services

Lemmas of 'Who':
- World_Health_Organization
- WHO

Hypernyms of 'Who':
- world_health_organization.n.01 -> united_nations_agency.n.01

Hyponyms of 'Who':

Example sentences for 'Who':
POS Tagging:
These -> DT
RoyalRumble -> JJ
crashers -> NNS
were -> VBD
RUTHLESS -> NNP
https -> NN
tube -> NN
mint -> NN
lgbt -> NN
VV5fxHfxCE4 -> NNP
si -> NN
naZCLWedRVreRISE -> NN

WordNet Exploration:

Synsets of 'These':

Lemmas of 'These':

Hypernyms of 'These':

Hyponyms of 'These':

Example sentences for 'These':
POS Tagging:
An -> DT
All -> DT
Mighty -> NNP
moment -> NN
in -> IN
the -> DT
2023 -> CD
Men -> NNP
s -> VBD
RoyalRumble -> JJ
Match -> NN

WordNet Exploration:

Synsets of 'An':
- associate_in_nursing.n.01: an associate degree in nursing

Lemmas of 'An':
- Associate_in_Nursing
- AN

Hypernyms of 'An':
- associate_in_n

In [24]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from time import time

nltk.download('treebank')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

pos_full_form = {
    'CC': 'Coordinating conjunction', 'CD': 'Cardinal digit', 'DT': 'Determiner',
    'EX': 'Existential there', 'FW': 'Foreign word', 'IN': 'Preposition/subordinating conjunction',
    'JJ': 'Adjective', 'JJR': 'Adjective, comparative', 'JJS': 'Adjective, superlative',
    'LS': 'List item marker', 'MD': 'Modal', 'NN': 'Noun, singular or mass', 'NNS': 'Noun, plural',
    'NNP': 'Proper noun, singular', 'NNPS': 'Proper noun, plural', 'PDT': 'Predeterminer',
    'POS': 'Possessive ending', 'PRP': 'Personal pronoun', 'PRP$': 'Possessive pronoun',
    'RB': 'Adverb', 'RBR': 'Adverb, comparative', 'RBS': 'Adverb, superlative', 'RP': 'Particle',
    'TO': 'To', 'UH': 'Interjection', 'VB': 'Verb, base form', 'VBD': 'Verb, past tense',
    'VBG': 'Verb, gerund or present participle', 'VBN': 'Verb, past participle',
    'VBP': 'Verb, non-3rd person singular present', 'VBZ': 'Verb, 3rd person singular present',
    'WDT': 'Wh-determiner', 'WP': 'Wh-pronoun', 'WP$': 'Possessive wh-pronoun', 'WRB': 'Wh-adverb'
}

def wordnet_pos_code(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return None

tokenization = tokenized_texts

total_treebank_time = 0
total_wordnet_time = 0
total_precision = 0
total_sentences = len(tokenization)

def get_pos_full_form(tag):
    return pos_full_form.get(tag, "Unknown")

for tokens in tokenization:
    print(f"\nProcessing sentence: {' '.join(tokens)}")

    # Treebank
    start_treebank = time()
    treebank_tags = pos_tag(tokens)
    end_treebank = time()

    print("Treebank POS tagging results:")
    for token, tag in treebank_tags:
        print(f'{token} -> {tag} ({get_pos_full_form(tag)})')

    # WordNet
    start_wordnet = time()
    wordnet_tags = []
    for token, tag in treebank_tags:
        wn_tag = wordnet_pos_code(tag)
        if wn_tag:
            synsets = wn.synsets(token, wn_tag)
            if synsets:
                wn_tagged = synsets[0].pos()
            else:
                wn_tagged = "unknown"
        else:
            wn_tagged = "unknown"
        wordnet_tags.append((token, wn_tagged))
    end_wordnet = time()

    print("\nWordNet-based tagging results:")
    for token, tag in wordnet_tags:
        print(f'{token} -> {tag}')

    treebank_time = end_treebank - start_treebank
    wordnet_time = end_wordnet - start_wordnet
    total_treebank_time += treebank_time
    total_wordnet_time += wordnet_time

    correct_matches = sum(1 for t1, t2 in zip(treebank_tags, wordnet_tags) if t1[1].startswith(t2[1].upper()))
    precision = correct_matches / len(treebank_tags)
    total_precision += precision

avg_precision = total_precision / total_sentences
avg_treebank_time = total_treebank_time / total_sentences
avg_wordnet_time = total_wordnet_time / total_sentences
print(f"\nOverall average time for Treebank POS tagging: {avg_treebank_time:.6f} seconds")
print(f"Overall average time for WordNet-based tagging: {avg_wordnet_time:.6f} seconds")
print(f"Overall precision (accuracy): {avg_precision:.2f}")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.



Processing sentence: IF YA SMELL TheRock has come back to WWERaw
Treebank POS tagging results:
IF -> NNP (Proper noun, singular)
YA -> NNP (Proper noun, singular)
SMELL -> NNP (Proper noun, singular)
TheRock -> NNP (Proper noun, singular)
has -> VBZ (Verb, 3rd person singular present)
come -> VBN (Verb, past participle)
back -> RB (Adverb)
to -> TO (To)
WWERaw -> NNP (Proper noun, singular)

WordNet-based tagging results:
IF -> unknown
YA -> unknown
SMELL -> n
TheRock -> unknown
has -> v
come -> v
back -> r
to -> unknown
WWERaw -> unknown

Processing sentence: Who had the best Instagram photo of the week https www wwe com gallery the 25 best instagram photos of the week january 7 2024 fid 40650941
Treebank POS tagging results:
Who -> WP (Wh-pronoun)
had -> VBD (Verb, past tense)
the -> DT (Determiner)
best -> JJS (Adjective, superlative)
Instagram -> NNP (Proper noun, singular)
photo -> NN (Noun, singular or mass)
of -> IN (Preposition/subordinating conjunction)
the -> DT (Determiner)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
import nltk
from nltk import pos_tag
from prettytable import PrettyTable
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

tokenization = tokenized_texts
pos_column_headers = ['Noun', 'Verb', 'Adjective', 'Adverb', 'Determiner', 'Preposition', 'Pronoun', 'Conjunction', 'Modal']

def categorize_pos(tag):
    if tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return 'Noun'
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return 'Verb'
    elif tag in ['JJ', 'JJR', 'JJS']:
        return 'Adjective'
    elif tag in ['RB', 'RBR', 'RBS']:
        return 'Adverb'
    elif tag == 'DT':
        return 'Determiner'
    elif tag == 'IN':
        return 'Preposition'
    elif tag in ['PRP', 'PRP$']:
        return 'Pronoun'
    elif tag == 'CC':
        return 'Conjunction'
    elif tag == 'MD':
        return 'Modal'
    else:
        return None

table = PrettyTable(['Sentence'] + pos_column_headers)
for tokens in tokenization:
    tagged_tokens = pos_tag(tokens)
    pos_dict = {category: [] for category in pos_column_headers}
    for token, tag in tagged_tokens:
        category = categorize_pos(tag)
        if category:
            pos_dict[category].append(token)

    table.add_row([
        ' '.join(tokens),
        pos_dict['Noun'],
        pos_dict['Verb'],
        pos_dict['Adjective'],
        pos_dict['Adverb'],
        pos_dict['Determiner'],
        pos_dict['Preposition'],
        pos_dict['Pronoun'],
        pos_dict['Conjunction'],
        pos_dict['Modal']
    ])

print(table)


+---------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+--------------------+-----------------------------+-------------+------------------------------+--------------+---------+-------------+-------+
|                                                                   Sentence                                                                  |                                                Noun                                                |        Verb        |          Adjective          |    Adverb   |          Determiner          | Preposition  | Pronoun | Conjunction | Modal |
+---------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
