<a href="https://colab.research.google.com/github/PawelJakubczyk/ml_data_preprocessing_utilities/blob/main/ml_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install numpy
%pip install pandas
%pip install language_tool_python
%pip install textaugment

Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language_tool_python
Successfully installed language_tool_python-2.7.1
Collecting textaugment
  Downloading textaugment-2.0.0-py3-none-any.whl (19 kB)
Collecting googletrans>=2 (from textaugment)
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans>=2->textaugment)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans>=2->textaugment)
  Downloading hstspreload-2024.2.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans>=2->textaugment)
  Downloading cha

In [None]:
# Reading and managing data
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

# Data preprocessing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from re import sub, findall
import string
import nltk
import spacy
from language_tool_python import LanguageTool
from collections import Counter

# data augmentation
from textaugment import Wordnet

In [None]:
# Prior to running the functions, the required resources need to be downloaded/loaded.
# NLTK 'punkt' tokenizer data — presumably required by word_tokenize(), which stemming() calls.
nltk.download('punkt')
# NLTK stop-word corpus.
# NOTE(review): the functions visible in this section take their stop words from spaCy
# (STOP_WORDS below), not from NLTK — confirm whether this download is still needed.
nltk.download('stopwords')
# Shared spaCy pipeline used by lemmatize() and remove_pos_tags().
# Assumes the 'en_core_web_sm' model is already installed — TODO confirm (no install step is visible here).
nlp = spacy.load('en_core_web_sm')
# Stop-word set used by remove_stopwords() and lemmatize().
STOP_WORDS = nlp.Defaults.stop_words
# Shared Porter stemmer instance used by stemming().
stemmer = PorterStemmer()

def normalize_text(input_text: str) -> str:
    """Return the text lower-cased with surrounding whitespace stripped.

    Args:
        input_text: Raw text to normalize.

    Returns:
        The lower-cased text without leading or trailing whitespace.
    """
    # Lower-case first, then trim both ends in a single chained expression.
    return input_text.lower().strip()

def remove_punctuation(input_string: str) -> str:
    """Return the string with every character of ``string.punctuation`` deleted.

    Args:
        input_string: Text to strip of ASCII punctuation.

    Returns:
        The text with all ASCII punctuation characters removed; everything
        else (including whitespace) is left untouched.
    """
    # Translation table that maps each punctuation character to "delete".
    deletion_table = str.maketrans('', '', string.punctuation)
    return input_string.translate(deletion_table)

def replace_tabs_enters_and_spaces(input_string: str) -> str:
    """Collapse every run of whitespace into a single space.

    Tabs, newlines and repeated spaces all end up as one space character.
    Leading and trailing whitespace is collapsed to a single space as well,
    not removed.

    Args:
        input_string: Text whose whitespace should be normalized.

    Returns:
        The text with each whitespace run replaced by one space.
    """
    # Tabs and newlines are whitespace themselves, so a single pass over
    # whitespace runs yields the same result as replacing them first.
    return sub(r'\s+', ' ', input_string)

def remove_html_tags(input_text: str) -> str:
    """Remove HTML tags from the input text.

    Everything from a ``<`` up to the next ``>`` is deleted, including tags
    whose attributes are spread over several lines. The text between tags is
    kept unchanged.

    Args:
        input_text: Text that may contain HTML markup.

    Returns:
        The text with all ``<...>`` spans removed.
    """
    # `[^>]*` (unlike `.*?`) also matches newlines, so tags that span
    # multiple lines are stripped too; single-line tags behave as before.
    return sub(r'<[^>]*>', '', input_text)

def remove_stopwords(input_string: str) -> str:
    """Drop every whitespace-separated word that is a stop word.

    Words are compared in lower case against the module-level ``STOP_WORDS``
    set; the surviving words keep their original casing.

    Args:
        input_string: Text to filter.

    Returns:
        The remaining words joined by single spaces.
    """
    kept = []
    for token in input_string.split():
        # Skip stop words regardless of their casing in the text.
        if token.lower() in STOP_WORDS:
            continue
        kept.append(token)
    return ' '.join(kept)

def lemmatize(input_string: str) -> str:
    """Lemmatize the input text and drop stop words.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    whose lower-cased text is in ``STOP_WORDS`` are discarded; for every
    other token its lemma is emitted.

    Args:
        input_string: Text to lemmatize.

    Returns:
        The lemmas of the non-stop-word tokens joined by single spaces.
    """
    doc = nlp(input_string)
    # Compare in lower case so capitalised stop words ("The", "And") are
    # filtered as well, consistent with remove_stopwords().
    lemmas = [
        token.lemma_
        for token in doc
        if token.text.lower() not in STOP_WORDS
    ]
    return ' '.join(lemmas)

def stemming(input_string: str) -> str:
    """Stem every token of the input text.

    The text is split with ``word_tokenize`` and each token is passed through
    the module-level ``stemmer``.

    Args:
        input_string: Text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Apply the stemmer to each token lazily and join the results.
    return ' '.join(map(stemmer.stem, word_tokenize(input_string)))

def remove_pos_tags(input_string: str) -> str:
    """Keep only the tokens that spaCy tags as ``'NN'``.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token whose ``tag_`` is not exactly ``'NN'`` is discarded (other noun
    tags are therefore dropped as well).

    Args:
        input_string: Text to filter.

    Returns:
        The texts of the remaining tokens joined by single spaces.
    """
    nouns = []
    for token in nlp(input_string):
        if token.tag_ == 'NN':
            nouns.append(token.text)
    return ' '.join(nouns)


# Lazily created LanguageTool instance shared by all correct_grammar() calls.
_GRAMMAR_TOOL = None


def _get_grammar_tool():
    """Return the shared en-US LanguageTool, creating it on first use."""
    global _GRAMMAR_TOOL
    if _GRAMMAR_TOOL is None:
        _GRAMMAR_TOOL = LanguageTool('en-US')
    return _GRAMMAR_TOOL


def correct_grammar(text: str) -> str:
    """Correct grammar mistakes in the input text.

    Args:
        text: Text to correct.

    Returns:
        The text as returned by ``LanguageTool.correct``.
    """
    # Reuse one LanguageTool object instead of building a new one per call:
    # the original constructed a fresh instance for every text and never
    # closed it, which is wasteful when the function is applied row by row.
    return _get_grammar_tool().correct(text)

def remove_common_words(text: str, common_threshold: int) -> str:
    """Remove words that occur more than ``common_threshold`` times.

    The text is lower-cased and split into ``\\w+`` words, so punctuation and
    original casing are not preserved in the result.

    Args:
        text: Text to filter.
        common_threshold: Maximum number of occurrences a word may have and
            still be kept.

    Returns:
        The kept words, in their original order, joined by single spaces.
    """
    # Tokenize the lower-cased text into words
    tokens = findall(r'\b\w+\b', text.lower())
    frequencies = Counter(tokens)
    # A word survives when it is not "too common", i.e. count <= threshold.
    kept = [token for token in tokens if frequencies[token] <= common_threshold]
    return ' '.join(kept)

def remove_rare_words(text: str, rare_threshold: int) -> str:
    """Remove words that occur at most ``rare_threshold`` times.

    The text is lower-cased and split into ``\\w+`` words, so punctuation and
    original casing are not preserved in the result.

    Args:
        text: Text to filter.
        rare_threshold: Words with this many occurrences or fewer are
            removed.

    Returns:
        The kept words, in their original order, joined by single spaces.
    """
    # Tokenize the lower-cased text into words
    tokens = findall(r'\b\w+\b', text.lower())
    frequencies = Counter(tokens)
    # A word survives only when it occurs strictly more often than the threshold.
    kept = [token for token in tokens if frequencies[token] > rare_threshold]
    return ' '.join(kept)

def clean_empty_data(df: pd.DataFrame, columns_to_check: list) -> pd.DataFrame:
    """Drop rows that have a missing or empty-string value in the given columns.

    Args:
        df: Input DataFrame; it is not modified.
        columns_to_check: Names of the columns that must be non-null and
            different from ``""`` for a row to be kept.

    Returns:
        A new DataFrame containing only the rows that pass the check for
        every listed column; the original index labels are preserved.
    """
    # Work on a copy so the caller's DataFrame is never touched.
    result = df.copy()

    for name in columns_to_check:
        values = result[name]
        # Keep rows that are neither missing nor the empty string.
        keep = values.notna() & (values != "")
        result = result[keep]

    return result

# Only the first two preprocessing functions are used because they have the highest percentage of coverage in the model.
# The remaining functions are left in the notebook to demonstrate various preprocessing options.