In [34]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
from spellchecker import SpellChecker

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Import data

In [2]:
# Load the cleaned reviews. Forward slashes keep the path portable across
# operating systems and avoid the invalid "\c" / "\M" escape sequences that the
# backslash form contains.
reviews = pd.read_csv('data/clean_data/Musical_instruments_reviews_clean.csv')

# Start pre-proc

We will first look for information about the data.  
Are there any links to remove, or any emojis, emoticons or other information that we do not need or that we need to convert?  
We will also probably need to remove the stop words, and convert the text to lower case and lemmatize it.

## Look for links

In [3]:
# Return the first link found in a review, if there is one
def get_link(review):
    """
    Returns the first whitespace-delimited token of `review` that starts with 'http'.

    Returns None when the review contains no such token, or when `review` is
    not a string (for example a missing value that pandas reads as NaN).
    """
    if not isinstance(review, str):
        return None
    # Stop at the first match instead of collecting every link beforehand
    return next((word for word in review.split() if word.startswith('http')), None)

In [4]:
# Print the first link of every review that contains one
for link in map(get_link, reviews['review']):
    if link is not None:
        print(link)

http://www.amazon.com/LEVYS-LEATHERS-MMGXL-2-5-BRN-STRAP-EXTENDER/dp/B00BH5N91E/ref=sr_1_2?ie=UTF8&qid;=1361565768&sr;=8-2&keywords;=MMGXL-2.5
http://www.amazon.com/gp/product/B0018TC7BW/ref=cm_cr_rev_prod_title


We can see two different links in the data.  
This means the pipeline will have to remove them.

## Look for emojis

In [5]:
# Collect the emojis contained in a review
import emoji
EMOJIS = emoji.EMOJI_DATA  # kept as a module-level name; extract_emojis no longer reads it
def extract_emojis(rev):
    """
    Returns the emojis found in `rev`, concatenated in order of appearance.

    Relies on `emoji.emoji_list` instead of a per-character lookup so that
    emojis made of several code points are returned as whole emojis.
    """
    return ''.join(match['emoji'] for match in emoji.emoji_list(rev))

In [6]:
# Print the emojis of every review that contains at least one
for emojis in map(extract_emojis, reviews['review']):
    if emojis:
        print(f"emojis: {(emojis)}")

There are no emojis in the data, so we do not need to remove them.

## Look for emoticons

In [7]:
# Collect the emoticons contained in a review
from emot.emo_unicode import EMOTICONS_EMO
def extract_emoticons(rev):
    """
    Returns the emoticons found in `rev`, separated by single spaces.

    EMOTICONS_EMO is keyed by whole emoticon strings (per the emot package),
    so each whitespace-delimited token is looked up as a whole; testing the
    characters one by one cannot match an emoticon of two or more characters.
    Emoticons glued to a word or to punctuation (e.g. "great:)") are not detected.
    """
    return ' '.join(token for token in rev.split() if token in EMOTICONS_EMO)

In [8]:
# Print the emoticons of every review that contains at least one
for emots in map(extract_emoticons, reviews['review']):
    if emots:
        print(f"emots: {(emots)}")

So there are no emoticons in the data either.

## Other information

Naturally, we will need to remove the punctuation, and convert the text to lower case.  
We will also need to remove the stop words, lemmatize the text and check for spell correction.  
Lemmatization is important because it will allow us to reduce the number of words in the vocabulary, and therefore the number of features in the model.

## Functions

1 - Remove links

In [9]:
def remove_urls(text: str) -> str:
    """
    Strips every URL (starting with http://, https:// or www.) out of the text.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [10]:
# A string made of a single URL is reduced to the empty string
assert "" == remove_urls("https://www.google.com")

2 - Remove tags

In [11]:
def remove_http_tags(text: str) -> str:
    """
    Strips HTML markup from the input text and returns only its textual content.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [12]:
# The surrounding <p> element is dropped and its text is kept
assert "hello world" == remove_http_tags("<p>hello world</p>")

3 - Spell correction

In [23]:
_SPELL_CHECKER = None  # shared SpellChecker instance, created on first use


def _get_spell_checker():
    """Returns the shared SpellChecker, building it only once."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def spell_correction(text: str) -> str:
    """
    Corrects spelling errors in the input text.

    The text is split on whitespace; every word the spell checker does not
    know is replaced by its suggested correction, and words without any
    suggestion are kept unchanged. The result is re-joined with single spaces.

    The SpellChecker is created once and reused, because building it on every
    call is expensive when the function is applied to many reviews.
    """
    words = text.split()
    spell = _get_spell_checker()

    # One lookup per distinct unknown word instead of one per occurrence
    corrections = {}
    for unknown_word in spell.unknown(words):
        suggestion = spell.correction(unknown_word)
        if suggestion is not None:
            corrections[unknown_word] = suggestion

    # SpellChecker.unknown() reports words in lower case (default,
    # case-insensitive mode), so the lookup lower-cases each word; otherwise
    # capitalised typos would never be corrected.
    return " ".join(corrections.get(word.lower(), word) for word in words)

In [32]:
# The word without any suggestion ("cpasunmot") is kept; the other typos are corrected
assert spell_correction(
    "cpasunmot hopfullly it works welld fr whhat I trys to do"
) == "cpasunmot hopefully it works well for what I try to do"

4 - Lower casing

In [15]:
def lower_case(text: str) -> str:
    """
    Returns a lowercase copy of the given text.
    """
    lowered = text.lower()
    return lowered

In [16]:
# Letters are lower-cased; punctuation is left untouched
assert "hello world!" == lower_case("Hello World!")

5 - Punctuation

In [17]:
def remove_punctuation(text: str) -> str:
    """
    Drops every character listed in string.punctuation from the input text.
    """
    return "".join(char for char in text if char not in string.punctuation)

In [18]:
# The comma and the exclamation mark are removed; case is preserved
assert "Hello World" == remove_punctuation("Hello, World!")

6 - Stopwords

In [19]:
_STOPWORDS_CACHE = {}  # language -> frozenset of stopwords, filled on first use


def remove_stopwords(text: str, language: str = 'english') -> str:
    """
    Removes stopwords from the input text.

    Args:
        text: The text to filter; it is split on whitespace.
        language: Name of the NLTK stopword list to use (defaults to 'english').

    Returns:
        The remaining words joined by single spaces.

    The stopword set of each language is built once and reused, instead of
    being re-read from the corpus on every call. Words are compared in lower
    case because the NLTK lists are lower case, so "The" is removed like "the".
    """
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    stop_set = _STOPWORDS_CACHE[language]
    return " ".join(word for word in text.split() if word.lower() not in stop_set)

In [20]:
# "the" is an English stopword and is dropped; the other tokens are kept as they are
assert "Hello World!" == remove_stopwords("Hello the World!", 'english')

7 - Lemmatization

In [21]:
def lemmatize(text: str) -> str:
    """
    Replaces each token of the text by its WordNet lemma, chosen according to
    the token's part-of-speech tag, and joins the lemmas with single spaces.
    """
    # First letter of the tag returned by nltk.pos_tag -> WordNet part of speech
    tag_to_pos = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
        'V': wordnet.VERB,
    }
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        # Tags outside the mapping fall back to the noun lemma
        lemmas.append(lemmatizer.lemmatize(token, tag_to_pos.get(tag[0], wordnet.NOUN)))
    return " ".join(lemmas)

In [35]:
# An irregular plural and an -ing form are both reduced to their lemma
assert "foot care" == lemmatize("feet caring")

# Create the pipeline for the script

Since the pre-processing will live in a Python script, we define the pipeline here but do not run it.

In [41]:
def preprocessing_pipeline(text: str) -> str:
    """
    Chains all the cleaning functions together and applies them to one text.

    The steps run in this order: lower-casing, URL removal, HTML tag removal,
    punctuation removal, English stopword removal, lemmatization and spell
    correction.

    The steps are plain str -> str functions, so they are composed directly.
    The previous version rebuilt a scikit-learn Pipeline of FunctionTransformers
    on every call and called `transform` on it without ever fitting it, which
    recent scikit-learn releases warn about.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # NOTE(review): spell correction runs last, i.e. after stopword removal and
    # lemmatization have already processed the misspelled words - confirm that
    # this order is intended.
    preprocessing_steps = (
        lower_case,
        remove_urls,
        remove_http_tags,
        remove_punctuation,
        lambda value: remove_stopwords(value, 'english'),
        lemmatize,
        spell_correction,
    )

    # Feed the output of each step into the next one
    cleaned_text = text
    for step in preprocessing_steps:
        cleaned_text = step(cleaned_text)

    return cleaned_text

The end of the pre-processing will be the creation of a csv file with the pre-processed data.

```python
if __name__ == "__main__":
    # Forward slashes keep the path portable and avoid the invalid "\c" / "\M" escapes
    df = pd.read_csv("data/clean_data/Musical_instruments_reviews_clean.csv", index_col=0)
    # NOTE(review): the exploration cells read the raw text from the 'review'
    # column of this file - confirm the column name before running the script.
    df["cleaned_text"] = df["review"].apply(preprocessing_pipeline)
```