In [35]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
from spellchecker import SpellChecker

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Import data

In [36]:
# Load the train and test splits into DataFrames. Each line is split on ';' and
# the two resulting columns are named explicitly ('text', 'emotion'); because
# `names` is supplied, pandas treats the first line of each file as data.
train_data = pd.read_csv('data/raw_data/train.txt', names=['text', 'emotion'], sep=';')
test_data = pd.read_csv('data/raw_data/test.txt', names=['text', 'emotion'], sep=';')

# Start pre-proc

We will first inspect the data.  
Are there any links, emojis, emoticons, or other content that we need to remove or convert?  
We will probably also need to remove the stop words, convert the text to lower case, and lemmatize it.

## Look for links

In [37]:
# Helper that retrieves the first link-like token from a review, if any
def get_link(review):
    """Return the first whitespace-separated token of ``review`` starting with 'http'.

    Args:
        review (str): text to scan.

    Returns:
        The first token that starts with 'http', or None when no token does.
    """
    # next() with a default stops at the first match and yields None otherwise
    return next(
        (token for token in review.split() if token.startswith('http')),
        None,
    )

In [42]:
def count_links(data, name):
    """Print how many entries of ``data['text']`` contain at least one link.

    Args:
        data: object indexable by 'text' that yields the texts to scan.
        name: label used in the printed message.
    """
    # Each text counts once, however many links it holds (get_link returns
    # only the first one, or None).
    count = sum(1 for text in data['text'] if get_link(text) is not None)
    print(f"Number of links in {name}: {count}")

In [43]:
count_links(train_data, 'train_data')
count_links(test_data, 'test_data')

Number of links in train_data: 199
Number of links in test_data: 26


## Look for emojis

In [48]:
# create a function that takes a text and returns emojis or emoticons if they exist
import emoji
EMOJIS = emoji.EMOJI_DATA
def extract_emojis(rev):
    """Return the characters of ``rev`` that are members of EMOJIS, in order.

    The text is examined one character at a time; the result is an empty
    string when no character is found in EMOJIS.
    """
    found = [char for char in rev if char in EMOJIS]
    return ''.join(found)

In [45]:
# Scan every text of a dataframe and report the emojis found
def count_emojis(data, name):
    """Print the emojis found in each entry of ``data['text']``.

    A header line with ``name`` is always printed; after it, one line is
    printed per text in which extract_emojis finds something. Nothing is
    returned.
    """
    print(f"Emojis in {name}:")
    for emojis in map(extract_emojis, data['text']):
        # An empty string means nothing was found in this text
        if emojis:
            print(f"emojis: {(emojis)}")

In [46]:
count_emojis(train_data, 'train_data')
count_emojis(test_data, 'test_data')

Emojis in train_data:
Emojis in test_data:


There are no emojis in the data, so we do not need to remove them.

## Look for emoticons

In [49]:
# lets look for emoticons
from emot.emo_unicode import EMOTICONS_EMO
def extract_emoticons(rev, emoticons=None):
    """Return the emoticons found in ``rev``, separated by single spaces.

    Emoticons such as ":)" span several characters, so the text is compared
    token by token (split on whitespace) rather than character by character;
    a per-character membership test can never match a multi-character key.

    Args:
        rev (str): text to scan.
        emoticons: optional container of emoticon strings to look for.
            Defaults to ``EMOTICONS_EMO`` (presumably keyed by complete
            emoticon strings -- confirm against the installed ``emot``
            version).

    Returns:
        str: the matching tokens in order of appearance joined by a space,
        or an empty string when none is found.

    Note:
        Only stand-alone tokens are detected; an emoticon glued to a word
        (e.g. "hello:)") is not reported.
    """
    if emoticons is None:
        emoticons = EMOTICONS_EMO
    return ' '.join(token for token in rev.split() if token in emoticons)

In [50]:
# Scan every text of a dataframe and report the emoticons found
def count_emoticons(data, name):
    """Print the emoticons found in each entry of ``data['text']``.

    A header line with ``name`` is always printed; after it, one line is
    printed per text in which extract_emoticons finds something. Nothing is
    returned.
    """
    print(f"Emoticons in {name}:")
    for emots in map(extract_emoticons, data['text']):
        # An empty string means nothing was found in this text
        if emots:
            print(f"emots: {(emots)}")

In [51]:
count_emoticons(train_data, 'train_data')
count_emoticons(test_data, 'test_data')

Emoticons in train_data:
Emoticons in test_data:


So there are no emoticons in the data either.

## Other information

Naturally, we will need to remove the punctuation, and convert the text to lower case.  
We will also need to remove the stop words, lemmatize the text, and check the spelling.  
Lemmatization is important because it will allow us to reduce the number of words in the vocabulary, and therefore the number of features in the model.

## Functions

1 - Lower Casing

In [15]:
def lower_case(text: str) -> str:
    """Return a lowercase copy of ``text``.

    Args:
        text (str): text to convert.

    Returns:
        str: ``text`` with every cased character lowered.
    """
    lowered = text.lower()
    return lowered

In [16]:
assert lower_case("Hello World!") == "hello world!"

2 - Remove stop words

In [17]:
from nltk.corpus import stopwords
# Build the English stop-word lookup once; a set makes the per-word membership
# test in remove_stop_words cheap. NOTE(review): this presumably requires the
# NLTK 'stopwords' corpus to be downloaded beforehand -- confirm in the setup.
stop_words = set(stopwords.words("english"))

In [22]:
def remove_stop_words(text: str) -> str:
    """Remove English stop words from ``text``.

    The text is split on whitespace and every word found in the module-level
    ``stop_words`` set is dropped. The comparison is case-insensitive, so
    capitalised stop words such as "The" are removed as well; the words that
    are kept retain their original casing.

    Args:
        text (str): text to remove stop words from. Non-string values are
            converted with ``str()`` first.

    Returns:
        str: the remaining words joined by single spaces.
    """
    # Lower-case only for the lookup: the stop-word list is lower-case, while
    # the caller's casing of the kept words must be preserved.
    kept_words = [
        word for word in str(text).split() if word.lower() not in stop_words
    ]
    return " ".join(kept_words)

In [19]:
assert remove_stop_words("Hello the World!") == "Hello World!"

3 - Remove Numbers

In [21]:
def remove_numbers(text: str) -> str:
    """Strip every digit character from ``text``.

    Args:
        text (str): text to remove numbers from.

    Returns:
        str: ``text`` without the characters for which ``str.isdigit()`` is
        true. Surrounding whitespace is left untouched, so removing a number
        between two spaces leaves both spaces in place.
    """
    non_digits = filter(lambda char: not char.isdigit(), text)
    return ''.join(non_digits)

In [23]:
assert remove_numbers("Hello 123 World!") == "Hello  World!"

4 - Remove Punctuation

In [25]:
# ASCII punctuation plus the Arabic comma, semicolon and question mark.
# Compiled once so the character class is not rebuilt on every call.
_PUNCTUATION_RE = re.compile("[%s]" % re.escape(string.punctuation + "،؛؟"))


def remove_punctuations(text: str) -> str:
    """Replace punctuation with spaces and normalise whitespace.

    Every ASCII punctuation character and the Arabic comma, semicolon and
    question mark is replaced by a space, so words separated only by
    punctuation are not fused together. Runs of whitespace are then collapsed
    to a single space and leading/trailing whitespace is removed.

    Args:
        text (str): text to remove punctuation from.

    Returns:
        str: the cleaned text.
    """
    without_punctuation = _PUNCTUATION_RE.sub(" ", text)
    # split() without arguments splits on any whitespace run and drops empty
    # strings, which collapses repeated spaces and trims both ends in one go.
    return " ".join(without_punctuation.split())

In [26]:
assert remove_punctuations("Hello! World!") == "Hello World"

5 - Remove URLs

In [27]:
def remove_urls(text: str) -> str:
    """Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character, so punctuation glued to the
    end of a URL is removed with it.

    Args:
        text (str): text to remove URLs from.

    Returns:
        str: ``text`` with every match removed; surrounding whitespace is
        left as it was.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [28]:
assert remove_urls("Hello https://www.world.com!") == "Hello "

6 - Lemmatization

In [29]:
def lemmatization(text: str) -> str:
    """Lemmatize every whitespace-separated word of ``text``.

    Args:
        text (str): text to lemmatize.

    Returns:
        str: the result of ``WordNetLemmatizer.lemmatize`` for each word,
        joined by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # lemmatize is called with its default arguments for every word
    lemmas = map(wordnet_lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

In [33]:
assert lemmatization("Hello foot Worlds!") == "Hello foot Worlds!"

# Create the pipeline for the script

As the pre-processing will be a Python script, we will create the pipeline here but we will not run it.

In [34]:
def preprocessing_pipeline(text: str) -> str:
    """Apply every cleaning function to ``text``, one after the other.

    The steps run in this order: lower-casing, stop-word removal, digit
    removal, URL removal, punctuation removal, lemmatization.

    The functions are chained directly instead of through a scikit-learn
    ``Pipeline``: they operate on one string, not on an array of samples, and
    building a pipeline of ``FunctionTransformer`` objects for every call
    (then calling ``transform`` on it without fitting) added overhead and
    relied on the input being passed through unvalidated.

    Args:
        text (str): raw text to clean.

    Returns:
        str: the cleaned text.
    """
    # NOTE(review): stop words are removed before punctuation, so a stop word
    # glued to punctuation (e.g. "the,") survives this step. The order is kept
    # as it was; confirm whether punctuation removal should come first.
    cleaning_steps = (
        lower_case,
        remove_stop_words,
        remove_numbers,
        remove_urls,
        remove_punctuations,
        lemmatization,
    )

    cleaned_text = text
    for step in cleaning_steps:
        cleaned_text = step(cleaned_text)

    return cleaned_text

The end of the pre-processing will be the creation of a csv file with the pre-processed data.

```python
if __name__ == "__main__":
    # Forward slashes resolve on every platform; a backslash-separated path in
    # a normal string only works on Windows and contains the invalid escape
    # sequences "\c" and "\d".
    df = pd.read_csv("data/clean_data/dataset.csv", index_col=0)
    # The function is passed directly; wrapping it in a lambda adds nothing.
    df["text"] = df["text"].apply(preprocessing_pipeline)
```