In [4]:
import re  # noqa: F401
import string  # noqa: F401

import nltk  # noqa: F401
import pandas as pd
from nltk.corpus import stopwords, wordnet  # noqa: F401
from nltk.stem import WordNetLemmatizer  # noqa: F401
from sklearn.pipeline import Pipeline  # noqa: F401
from sklearn.preprocessing import FunctionTransformer  # noqa: F401
from utils import emojis_unicode, emoticons, slang_words  # noqa: F401

from bs4 import BeautifulSoup
from spellchecker import SpellChecker
nltk.download('stopwords')
from collections import Counter

In [5]:
# Declare your cleaning functions here
# Chain those functions together inside the preprocessing pipeline
# You can use (or not) Sklearn pipelines and functionTransformer for readability
# and modularity
# --- Documentation ---
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [6]:
def lower_case(text: str) -> str:
    """
    Converts the input text to lowercase.
    """
    return text.lower()

def remove_punctuation(text: str) -> str:
    """
    Removes punctuation from the input text.
    """
    PUNCT_TO_REMOVE = string.punctuation
    translation_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    return text.translate(translation_table)

def remove_stopwords(text: str,language: str) -> str:
    """
    Removes stopwords from the input text.
    """
    STOPWORDS = set(stopwords.words(language))
    split = text.split()
    filtered_words = [word for word in split if word not in STOPWORDS]
    return " ".join(filtered_words)

def remove_frequent_words(text: str, freq_words: list) -> str:
    """
    Removes frequent words from the input text.
    """
    split = text.split()
    filtered_words = [word for word in split if word not in freq_words]
    return " ".join(filtered_words)

def remove_rare_words(text: str, rare_words: list) -> str:
    """
    Removes rare words from the input text.
    """
    split = text.split()
    filtered_words = [word for word in split if word not in rare_words]
    return " ".join(filtered_words)

def stemming(text: str, stemmer) -> str:
    """
    Applies stemming to words in the input text.
    """
    split = text.split()
    filtered_words = [stemmer.stem(word) for word in split]
    return " ".join(filtered_words)

def lemmatize(text: str, lemmatizer) -> str:
    """
    Lemmatizes words in the input text.
    """
    wordnet_map = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ
    }
    pos_tagged_text = nltk.pos_tag(nltk.word_tokenize(text))
    lemmatized_words = [lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text]
    return " ".join(lemmatized_words)

def convert_emoticons(text: str, EMOTICONS: dict) -> str:
    """
    Converts emoticons to text in the input text.
    """
    for emoticon, description in EMOTICONS.items():
        text = re.sub(emoticon, "_".join(description.replace(",", "").split()), text)
    return text

def convert_emojis(text: str, EMO_UNICODE: dict) -> str:
    """
    Converts emojis to text in the input text.
    """
    for description, emoji in EMO_UNICODE.items():
        text = text.replace(emoji, "_".join(description.replace(",", "").replace(":", "").split()))
    return text

def remove_urls(text: str) -> str:
    """
    Removes URLs from the input text.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_http_tags(text: str) -> str:
    """
    Removes HTTP tags from the input text.
    """
    return BeautifulSoup(text, "html.parser").text

def chat_words_conversion(text: str, slang_words_list: dict) -> str:
    """
    Converts chat words to standard words in the input text.
    """
    chat_words_list = list(slang_words_list.keys())
    new_text = []
    for word in text.split():
        new_text.append(slang_words_list.get(word.upper(), word))
    return " ".join(new_text)

def spell_correction(text: str, spell: SpellChecker) -> str:
    """
    Corrects spelling errors in the input text.
    """
    corrected_text = []
    for word in text.split():
        corrected_text.append(spell.correction(word))
    return " ".join(corrected_text)

In [7]:
# here we test our functions one by one
assert lower_case("Hello World!") == "hello world!"
assert remove_punctuation("Hello, World!") == "Hello World"
assert remove_stopwords("Hello the World!") == "World!"

TypeError: remove_stopwords() missing 1 required positional argument: 'STOPWORDS'

In [None]:
def preprocessing_pipeline(text: str) -> str:
    """
    Chains all the cleaning functions together using scikit-learn pipelines.
    """
    # Define your global variables (e.g., STOPWORDS, FREQWORDS, RAREWORDS) here

    # Define the preprocessing steps as a list of tuples with (step_name, transformer_function)
    preprocessing_steps = [
        ('lower_case', FunctionTransformer(lower_case)),
        ('remove_punctuation', FunctionTransformer(remove_punctuation)),
        ('remove_stopwords', FunctionTransformer(lambda x: remove_stopwords(x, STOPWORDS))),
        ('remove_frequent_words', FunctionTransformer(lambda x: remove_frequent_words(x, FREQWORDS))),
        ('remove_rare_words', FunctionTransformer(lambda x: remove_rare_words(x, RAREWORDS))),
        ('stemming', FunctionTransformer(lambda x: stemming(x, stemmer))),  # Replace 'stemmer' with your stemmer object
        ('lemmatize', FunctionTransformer(lambda x: lemmatize(x, lemmatizer))),  # Replace 'lemmatizer' with your lemmatizer object
        ('convert_emoticons', FunctionTransformer(lambda x: convert_emoticons(x, EMOTICONS))),
        ('convert_emojis', FunctionTransformer(lambda x: convert_emojis(x, EMO_UNICODE))),
        ('remove_urls', FunctionTransformer(remove_urls)),
        ('remove_http_tags', FunctionTransformer(remove_http_tags)),
        ('chat_words_conversion', FunctionTransformer(lambda x: chat_words_conversion(x, slang_words_list))),  # Replace 'slang_words_list' with your dictionary
        ('spell_correction', FunctionTransformer(lambda x: spell_correction(x, spell)))  # Replace 'spell' with your SpellChecker object
    ]

    # Create the pipeline
    preprocessing_pipeline = Pipeline(preprocessing_steps)

    # Apply the pipeline to the input text
    cleaned_text = preprocessing_pipeline.transform([text])[0]

    return cleaned_text

In [None]:
if __name__ == "__main__":
    df = pd.read_csv("nlp_courses/tp_1_text_cleaning/to_clean.csv", index_col=0)
    df["cleaned_text"] = df.text.apply(lambda x: preprocessing_pipeline(x))
    for idx, row in df.iterrows():
        print(f"\nBase text: {row.text}")
        print(f"Cleaned text: {row.cleaned_text}\n")