# ***Bonus exercise: advanced grade***

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import re
import pandas as pd

from utils import emojis_unicode, emoticons, slang_words

In [22]:
def lowercase(text):
    out = text.lower()
    print(f"After lowercase: {out}")
    return out

def remove_punctuation(text):
    PUNCT_TO_REMOVE = string.punctuation
    out = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    print(f"After removing punctuation: {out}")
    return out

def stopwords_removal(text):
    STOPWORDS = set(stopwords.words('english'))
    out = " ".join([word for word in str(text).split() if word not in STOPWORDS])
    print(f"After stopwords removal: {out}")
    return out

def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    pos_tagged_text = nltk.pos_tag(text.split())
    out = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
    print(f"After lematizing: {out}")
    return out

def convert_emojis(text):
    EMO_UNICODE = emojis_unicode()
    UNICODE_EMO = {v: k for k, v in EMO_UNICODE.items()}
    print(text)
    for emoticon, description in UNICODE_EMO.items():
        cleaned_description = description.replace(",", "").replace(":", "").split()
        replacement = "_".join(cleaned_description)
        text = text.replace(emoticon, replacement)
    print(f"After converting emojis: {text}")
    return text

def convert_emoticons(text):
    EMOTICONS = emoticons()
    for emoticon, description in EMOTICONS.items():
        cleaned_description = description.replace(",", "").split()
        cleaned_description_joined = "_".join(cleaned_description)
        # replace the emojis by the cleaned description within the given text
        out = re.sub(u'('+emoticon+')', cleaned_description_joined, text)
    print(f"After converting emoticons: {out}")
    return out

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    out = url_pattern.sub(r'', text)
    print(f"After removing urls: {out}")
    return out

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    out = html_pattern.sub(r'', text)
    print(f"After removing html: {out}")
    return out

def chat_words_conversion(text):
    slang_words_list = slang_words()
    chat_words_list = list(slang_words_list.keys())
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(slang_words_list[w.upper()])
        else:
            new_text.append(w)
    out = " ".join(new_text)
    print(f"After converting slang: {out}")
    return out 

# Create FunctionTransformer for each custom function
lowercase_transformer = FunctionTransformer(func=lowercase, validate=False)
remove_punctuation_transformer = FunctionTransformer(func=remove_punctuation, validate=False)
stopwords_removal_transformer = FunctionTransformer(func=stopwords_removal, validate=False)
lemmatize_words_transformer = FunctionTransformer(func=lemmatize_words, validate=False)
convert_emojis_transformer = FunctionTransformer(func=convert_emojis, validate=False)
convert_emoticons_transformer = FunctionTransformer(func=convert_emoticons, validate=False)
remove_urls_transformer = FunctionTransformer(func=remove_urls, validate=False)
remove_html_transformer = FunctionTransformer(func=remove_html, validate=False)
chat_words_conversion_transformer = FunctionTransformer(func=chat_words_conversion, validate=False)

# Create an scikit-learn pipeline
pipeline = Pipeline([
    ('remove_urls', remove_urls_transformer),
    ('remove_html', remove_html_transformer),
    ('convert_emojis', convert_emojis_transformer),
    ('convert_emoticons', convert_emoticons_transformer),
    ('chat_words_conversion', chat_words_conversion_transformer),
    ('lowercase', lowercase_transformer),
    ('remove_punctuation', remove_punctuation_transformer),
    ('stopwords_removal', stopwords_removal_transformer),
    ('lemmatize_words', lemmatize_words_transformer),
    
])

def clean(text, pipeline=pipeline):
    return pipeline.transform(text)

In [23]:
pipeline.transform("Hello Amazon - my package never arrived :( https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first PLEASE FIX ASAP ⏰! @AmazonHelp <test/>")

After removing urls: Hello Amazon - my package never arrived :(  PLEASE FIX ASAP ⏰! @AmazonHelp <test/>
After removing html: Hello Amazon - my package never arrived :(  PLEASE FIX ASAP ⏰! @AmazonHelp 
Hello Amazon - my package never arrived :(  PLEASE FIX ASAP ⏰! @AmazonHelp 
After converting emojis: Hello Amazon - my package never arrived :(  PLEASE FIX ASAP alarm_clock! @AmazonHelp 
After converting emoticons: Hello Amazon - my package never arrived :(  PLEASE FIX ASAP alarm_clock! @AmazonHelp 
After converting slang: Hello Amazon - my package never arrived :( PLEASE FIX As Soon As Possible alarm_clock! @AmazonHelp
After lowercase: hello amazon - my package never arrived :( please fix as soon as possible alarm_clock! @amazonhelp
After removing punctuation: hello amazon  my package never arrived  please fix as soon as possible alarmclock amazonhelp
After stopwords removal: hello amazon package never arrived please fix soon possible alarmclock amazonhelp
After lematizing: hello amazon 

'hello amazon package never arrive please fix soon possible alarmclock amazonhelp'

In [25]:
df = pd.read_csv("./to_clean.csv", index_col=0)
df

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
1,Hello Amazon - my package never arrived :( htt...
2,Hello! 😊 This is an example text with emojis! 👍
3,<p>This is a <b>sample</b> text with <a href='...
4,The quick brown fox jumps over the lazy dog.
5,Visit our website at https://www.example.com f...
6,I'm feeling 😄 today. Don't worry 😉.
7,This text contains special characters #$%&@*!
8,LOL BRB and OMG are common chat abbreviations.
9,😂😍👏 Just saw the funniest movie ever! 😂😍👏
10,<a href='https://www.example.com'>Click here</...


In [26]:
%%capture
df["cleaned_text"] = df.text.apply(lambda x: clean(x))

In [27]:
for idx, row in df.iterrows():
    print(f"Base text: {row.text}")
    print(f"Cleaned text: {row.cleaned_text}")
    print("\n")

Base text: Hello Amazon - my package never arrived :( https://www.amazon.com/gp/css/order-history?ref_=nav_orders_first PLEASE FIX ASAP ⏰! @AmazonHelp <test/>
Cleaned text: hello amazon package never arrive please fix soon possible alarmclock amazonhelp


Base text: Hello! 😊 This is an example text with emojis! 👍
Cleaned text: hello smilingfacewithsmilingeyes example text emojis thumbsup


Base text: <p>This is a <b>sample</b> text with <a href='https://www.example.com'>HTML</a> tags.</p>
Cleaned text: sample text


Base text: The quick brown fox jumps over the lazy dog.
Cleaned text: quick brown fox jump lazy dog


Base text: Visit our website at https://www.example.com for more information
Cleaned text: visit website information


Base text: I'm feeling 😄 today. Don't worry 😉.
Cleaned text: im feel smilingfacewithopenmouthsmilingeyes today dont worry winkingface


Base text: This text contains special characters #$%&@*!
Cleaned text: text contain special character


Base text: LOL BR