In [12]:
# Install & Import Libraries

import re
import nltk
import spacy
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [13]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Sample Data

texts = [
    "I loved this movie! The acting was amazing ðŸ˜Š",
    "Worst experience ever. Totally waste of time!!!",
    "The product is okay, not great but not bad either.",
    "Absolutely fantastic service, will buy again!",
    "Terrible support. I'm very disappointed ðŸ˜¡"
]

df = pd.DataFrame({"text": texts})
df

Unnamed: 0,text
0,I loved this movie! The acting was amazing ðŸ˜Š
1,Worst experience ever. Totally waste of time!!!
2,"The product is okay, not great but not bad eit..."
3,"Absolutely fantastic service, will buy again!"
4,Terrible support. I'm very disappointed ðŸ˜¡


In [15]:
# Text Cleaning Function

def clean_text(text):
    text = text.lower()                         # lowercase
    text = re.sub(r"http\S+", "", text)         # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)        # remove punctuation/emojis
    text = re.sub(r"\s+", " ", text).strip()    # remove extra spaces
    return text

df["clean_text"] = df["text"].apply(clean_text)
df


Unnamed: 0,text,clean_text
0,I loved this movie! The acting was amazing ðŸ˜Š,i loved this movie the acting was amazing
1,Worst experience ever. Totally waste of time!!!,worst experience ever totally waste of time
2,"The product is okay, not great but not bad eit...",the product is okay not great but not bad either
3,"Absolutely fantastic service, will buy again!",absolutely fantastic service will buy again
4,Terrible support. I'm very disappointed ðŸ˜¡,terrible support im very disappointed


In [16]:
# Tokenization using NLTK

df["tokens"] = df["clean_text"].apply(word_tokenize)
df


Unnamed: 0,text,clean_text,tokens
0,I loved this movie! The acting was amazing ðŸ˜Š,i loved this movie the acting was amazing,"[i, loved, this, movie, the, acting, was, amaz..."
1,Worst experience ever. Totally waste of time!!!,worst experience ever totally waste of time,"[worst, experience, ever, totally, waste, of, ..."
2,"The product is okay, not great but not bad eit...",the product is okay not great but not bad either,"[the, product, is, okay, not, great, but, not,..."
3,"Absolutely fantastic service, will buy again!",absolutely fantastic service will buy again,"[absolutely, fantastic, service, will, buy, ag..."
4,Terrible support. I'm very disappointed ðŸ˜¡,terrible support im very disappointed,"[terrible, support, im, very, disappointed]"


In [17]:
# Stopword Removal

stop_words = set(stopwords.words("english"))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df["no_stopwords"] = df["tokens"].apply(remove_stopwords)
df


Unnamed: 0,text,clean_text,tokens,no_stopwords
0,I loved this movie! The acting was amazing ðŸ˜Š,i loved this movie the acting was amazing,"[i, loved, this, movie, the, acting, was, amaz...","[loved, movie, acting, amazing]"
1,Worst experience ever. Totally waste of time!!!,worst experience ever totally waste of time,"[worst, experience, ever, totally, waste, of, ...","[worst, experience, ever, totally, waste, time]"
2,"The product is okay, not great but not bad eit...",the product is okay not great but not bad either,"[the, product, is, okay, not, great, but, not,...","[product, okay, great, bad, either]"
3,"Absolutely fantastic service, will buy again!",absolutely fantastic service will buy again,"[absolutely, fantastic, service, will, buy, ag...","[absolutely, fantastic, service, buy]"
4,Terrible support. I'm very disappointed ðŸ˜¡,terrible support im very disappointed,"[terrible, support, im, very, disappointed]","[terrible, support, im, disappointed]"


In [18]:
# Lemmatization

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word,pos = 'v') for word in tokens]

df["lemmatized"] = df["no_stopwords"].apply(lemmatize_tokens)
df


Unnamed: 0,text,clean_text,tokens,no_stopwords,lemmatized
0,I loved this movie! The acting was amazing ðŸ˜Š,i loved this movie the acting was amazing,"[i, loved, this, movie, the, acting, was, amaz...","[loved, movie, acting, amazing]","[love, movie, act, amaze]"
1,Worst experience ever. Totally waste of time!!!,worst experience ever totally waste of time,"[worst, experience, ever, totally, waste, of, ...","[worst, experience, ever, totally, waste, time]","[worst, experience, ever, totally, waste, time]"
2,"The product is okay, not great but not bad eit...",the product is okay not great but not bad either,"[the, product, is, okay, not, great, but, not,...","[product, okay, great, bad, either]","[product, okay, great, bad, either]"
3,"Absolutely fantastic service, will buy again!",absolutely fantastic service will buy again,"[absolutely, fantastic, service, will, buy, ag...","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]"
4,Terrible support. I'm very disappointed ðŸ˜¡,terrible support im very disappointed,"[terrible, support, im, very, disappointed]","[terrible, support, im, disappointed]","[terrible, support, im, disappoint]"


In [22]:
# Complete NLP Pipeline

def nlp_pipeline_nltk(text):
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_tokens(tokens)
    return tokens


df["final_tokens_nltk"] = df["text"].apply(nlp_pipeline_nltk)
df


Unnamed: 0,text,clean_text,tokens,no_stopwords,lemmatized,final_tokens_spacy,final_tokens_nltk
0,I loved this movie! The acting was amazing ðŸ˜Š,i loved this movie the acting was amazing,"[i, loved, this, movie, the, acting, was, amaz...","[loved, movie, acting, amazing]","[love, movie, act, amaze]","[love, movie, acting, amazing]","[love, movie, act, amaze]"
1,Worst experience ever. Totally waste of time!!!,worst experience ever totally waste of time,"[worst, experience, ever, totally, waste, of, ...","[worst, experience, ever, totally, waste, time]","[worst, experience, ever, totally, waste, time]","[bad, experience, totally, waste, time]","[worst, experience, ever, totally, waste, time]"
2,"The product is okay, not great but not bad eit...",the product is okay not great but not bad either,"[the, product, is, okay, not, great, but, not,...","[product, okay, great, bad, either]","[product, okay, great, bad, either]","[product, okay, great, bad]","[product, okay, great, bad, either]"
3,"Absolutely fantastic service, will buy again!",absolutely fantastic service will buy again,"[absolutely, fantastic, service, will, buy, ag...","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]"
4,Terrible support. I'm very disappointed ðŸ˜¡,terrible support im very disappointed,"[terrible, support, im, very, disappointed]","[terrible, support, im, disappointed]","[terrible, support, im, disappoint]","[terrible, support, disappointed]","[terrible, support, im, disappoint]"


In [23]:
# spaCy Pipeline

nlp = spacy.load("en_core_web_sm")

def nlp_pipeline_spacy(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and token.is_alpha
    ]
    return tokens


df["final_tokens_spacy"] = df["text"].apply(nlp_pipeline_spacy)
df


Unnamed: 0,text,clean_text,tokens,no_stopwords,lemmatized,final_tokens_spacy,final_tokens_nltk
0,I loved this movie! The acting was amazing ðŸ˜Š,i loved this movie the acting was amazing,"[i, loved, this, movie, the, acting, was, amaz...","[loved, movie, acting, amazing]","[love, movie, act, amaze]","[love, movie, acting, amazing]","[love, movie, act, amaze]"
1,Worst experience ever. Totally waste of time!!!,worst experience ever totally waste of time,"[worst, experience, ever, totally, waste, of, ...","[worst, experience, ever, totally, waste, time]","[worst, experience, ever, totally, waste, time]","[bad, experience, totally, waste, time]","[worst, experience, ever, totally, waste, time]"
2,"The product is okay, not great but not bad eit...",the product is okay not great but not bad either,"[the, product, is, okay, not, great, but, not,...","[product, okay, great, bad, either]","[product, okay, great, bad, either]","[product, okay, great, bad]","[product, okay, great, bad, either]"
3,"Absolutely fantastic service, will buy again!",absolutely fantastic service will buy again,"[absolutely, fantastic, service, will, buy, ag...","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]"
4,Terrible support. I'm very disappointed ðŸ˜¡,terrible support im very disappointed,"[terrible, support, im, very, disappointed]","[terrible, support, im, disappointed]","[terrible, support, im, disappoint]","[terrible, support, disappointed]","[terrible, support, im, disappoint]"


In [24]:
comparison_df = df[["text", "final_tokens_nltk", "final_tokens_spacy"]]
comparison_df


Unnamed: 0,text,final_tokens_nltk,final_tokens_spacy
0,I loved this movie! The acting was amazing ðŸ˜Š,"[love, movie, act, amaze]","[love, movie, acting, amazing]"
1,Worst experience ever. Totally waste of time!!!,"[worst, experience, ever, totally, waste, time]","[bad, experience, totally, waste, time]"
2,"The product is okay, not great but not bad eit...","[product, okay, great, bad, either]","[product, okay, great, bad]"
3,"Absolutely fantastic service, will buy again!","[absolutely, fantastic, service, buy]","[absolutely, fantastic, service, buy]"
4,Terrible support. I'm very disappointed ðŸ˜¡,"[terrible, support, im, disappoint]","[terrible, support, disappointed]"
