In [1]:
import os
from pathlib import Path
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sqlalchemy import create_engine


# NLP libs
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import contractions
import torch
import spacy
from langdetect import detect
from transformers import MarianMTModel, MarianTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ND.COM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ND.COM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ND.COM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ND.COM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
model_name = 'Helsinki-NLP/opus-mt-mul-en'
tokenizer_mt = MarianTokenizer.from_pretrained(model_name)
model_mt = MarianMTModel.from_pretrained(model_name)

In [5]:
def translate_to_english(text):
    try:
        batch = tokenizer_mt(text, return_tensors="pt", truncation=True, padding=True)
        gen = model_mt.generate(**batch)
        return tokenizer_mt.batch_decode(gen, skip_special_tokens=True)[0]
    except:
        return text             

Loading Data

In [6]:
# Use the current working directory as the base
project_root = Path.cwd().parent.parent  # adjust if needed to point to ML_DB_PROJECT

# Output directory
out_dir = project_root / "data/ml/reviews/reviews_preprocessed"
out_dir.mkdir(parents=True, exist_ok=True)

# Load parquet file
df = pd.read_parquet(project_root / "data/ml/reviews/reviews.parquet")
df.fillna("", inplace=True)

print("Data loaded. Shape:", df.shape)

Data loaded. Shape: (300000, 19)


In [7]:
def remove_html(x):
    return BeautifulSoup(x, "lxml").get_text()

In [8]:
def remove_emojis(text):
    emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [9]:
def remove_special_chars(text):
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [10]:
def lower(x): 
    return x.lower()

In [11]:
def fix_contractions(x): 
    return contractions.fix(x)

In [12]:
def remove_stopwords(tokens):
    return [w for w in tokens if w not in stop_words]

In [13]:
def tokenize_spacy(text):
    return [token.text for token in nlp(text)]

In [14]:
def lemmatize_spacy(text):
    return " ".join([token.lemma_ for token in nlp(text)])

In [23]:
from nltk.tokenize import sent_tokenize
import nltk

def split_sentences(text):
    try:
        return sent_tokenize(text)
    except LookupError:
        nltk.download('punkt')
        nltk.download('punkt_tab')
        return sent_tokenize(text)

In [16]:
def detect_language(x):
    try: 
        return detect(x)
    except:
        return "unknown"

In [17]:
def detect_spam(text):
    if len(text.split()) < 2: 
        return True
    if len(set(text.split())) < 2: 
        return True
    if re.search(r"(.)\1{3,}", text): 
        return True
    return False

In [18]:
def deep_clean(text):
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [19]:
clean_texts = []
tokens_list = []
lemmas_list = []
sentences_list = []
languages = []
translations = []
spam_flags = []
final_clean_text = []

In [24]:
for txt in tqdm(df.review_text.tolist()):

    t = txt

    # 1 remove html, emoji, special chars
    t = remove_html(t)
    t = remove_emojis(t)
    t = remove_special_chars(t)

    # 2 lowercase
    t = lower(t)

    # 3 contractions
    t = fix_contractions(t)

    # 5 tokenization
    tokens = tokenize_spacy(t)

    # 4 stopwords
    tokens_nostop = remove_stopwords(tokens)

    # 6 lemmatization
    lemma_text = lemmatize_spacy(" ".join(tokens_nostop))

    # 7 sentence splitting
    sents = split_sentences(lemma_text)

    # 8 language detection
    lang = detect_language(txt)
    if lang != "en":
        translated = translate_to_english(txt)
    else:
        translated = t

    # 9 spam detection
    spam = detect_spam(lemma_text)

    # 10 deep cleaning
    cleaned_final = deep_clean(translated)

    clean_texts.append(t)
    tokens_list.append(tokens_nostop)
    lemmas_list.append(lemma_text)
    sentences_list.append(sents)
    languages.append(lang)
    translations.append(translated)
    spam_flags.append(spam)
    final_clean_text.append(cleaned_final)

100%|██████████| 300000/300000 [5:58:03<00:00, 13.96it/s]     


In [25]:
df["clean_text"] = clean_texts
df["tokens"] = tokens_list
df["lemmas"] = lemmas_list
df["sentences"] = sentences_list
df["language_detected"] = languages
df["translated_text"] = translations
df["is_spam"] = spam_flags
df["final_text_for_ml"] = final_clean_text

In [28]:
df.isna().sum()

review_id                    0
customer_id                  0
article_id                   0
category_id                  0
rating                       0
review_text                  0
created_at                   0
verified_purchase            0
helpful_votes                0
synthetic_sentiment_label    0
aspect_terms                 0
language                     0
review_length                0
review_source                0
review_age_days              0
clean_text                   0
vader_score                  0
vader_label                  0
aspect_terms_list            0
tokens                       0
lemmas                       0
sentences                    0
language_detected            0
translated_text              0
is_spam                      0
final_text_for_ml            0
dtype: int64

In [26]:
# impute missing values
df.replace("", np.nan, inplace=True)
df.fillna({"translated_text": "", "final_text_for_ml": ""}, inplace=True)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [29]:
df["text_for_training"] = df["final_text_for_ml"]

In [31]:
print(df.columns.tolist())

['review_id', 'customer_id', 'article_id', 'category_id', 'rating', 'review_text', 'created_at', 'verified_purchase', 'helpful_votes', 'synthetic_sentiment_label', 'aspect_terms', 'language', 'review_length', 'review_source', 'review_age_days', 'clean_text', 'vader_score', 'vader_label', 'aspect_terms_list', 'tokens', 'lemmas', 'sentences', 'language_detected', 'translated_text', 'is_spam', 'final_text_for_ml', 'text_for_training']


In [36]:
# Balancing — Transformers use class weights
from sklearn.utils.class_weight import compute_class_weight

classes = df["synthetic_sentiment_label"].unique()
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=df["synthetic_sentiment_label"])
print("Class Weights:", dict(zip(classes, class_weights)))


Class Weights: {'neutral': 1.667834150572067, 'positive': 0.5121979952570466, 'negative': 2.2318937618569357}


In [37]:
from sklearn.model_selection import train_test_split

train, temp = train_test_split(df, test_size=0.3, stratify=df["synthetic_sentiment_label"], random_state=42)
val, test = train_test_split(temp, test_size=0.5, stratify=temp["synthetic_sentiment_label"], random_state=42)

In [None]:
train.to_parquet(out_dir / "train.parquet", index=False)
val.to_parquet(out_dir / "val.parquet", index=False)
test.to_parquet(out_dir / "test.parquet", index=False)
df.to_parquet(out_dir / "reviews_preprocessed.parquet", index=False)

print(f"All files saved successfully to:\n{out_dir.resolve()}")

All files saved successfully to:
C:\Users\ND.COM\Desktop\ML DB Project\data\ml\reviews\reviews_preprocessed


In [41]:
print("Preprocessing complete. Rows:", len(df))

Preprocessing complete. Rows: 300000
