In [2]:
import pandas as pd
import re
import html
import os
from sklearn.model_selection import train_test_split
import sys
# Put the parent of the current working directory on sys.path so the
# project-local `src` package can be imported from this notebook.
# NOTE(review): assumes the kernel's working directory is a direct child of
# the project root -- confirm if the notebook is launched from elsewhere.
notebook_dir = os.path.abspath(os.curdir)
project_root = os.path.split(notebook_dir)[0]

# Append only once so re-running the cell does not grow sys.path.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)
import src.config as config

In [3]:
# Patterns are compiled once at cell-definition time instead of being looked
# up again for every review.
_HTML_TAG_RE = re.compile(r'<.*?>')
_QUOTE_RE = re.compile(r'[“”‘’\'"`]')
_DASH_RE = re.compile(r'[-–—]')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_review_text(text):
    """Normalise one raw review string; non-string input yields ""."""
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)            # decode entities such as &amp;
    text = _HTML_TAG_RE.sub(' ', text)    # replace markup like <br /> with a space
    text = text.lower()
    # Quotes and dashes become spaces (not nothing) so the words on either
    # side of them stay separate tokens: "so-so" -> "so so", "it's" -> "it s".
    text = _QUOTE_RE.sub(' ', text)
    text = _DASH_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)     # all remaining punctuation is deleted
    return _WHITESPACE_RE.sub(' ', text).strip()


def clean_and_standardize_df(df, text_col, label_col=None, positive_label_val=1):
    """Build a cleaned frame with a ``text`` column and an optional ``label`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is only read, never modified.
    text_col : str
        Name of the column in ``df`` holding the raw review text.
    label_col : str or None, optional
        Name of the label column. When it is falsy or not present in
        ``df.columns`` the result has no ``label`` column.
    positive_label_val : object, optional
        Value of ``df[label_col]`` treated as the positive class. Rows equal
        to it get label 1, every other row (including rows whose label is
        missing) gets 0.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the cleaned ``text`` column and, if
        requested, an ``int8`` ``label`` column. Rows whose text is empty
        after cleaning (including non-string / missing text) are dropped;
        the remaining rows keep their original index labels.

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    # No defensive copy of ``df`` is needed: every step below creates new
    # objects rather than writing into the input.
    final_df = pd.DataFrame({'text': df[text_col].apply(_clean_review_text)})

    if label_col and label_col in df.columns:
        final_df['label'] = (df[label_col] == positive_label_val).astype('int8')

    # The cleaner maps non-string values to "", so filtering on the empty
    # string also removes rows whose text was missing; a separate dropna
    # would never drop anything. ``.copy()`` hands the caller a frame that
    # can be modified without SettingWithCopyWarning.
    return final_df.loc[final_df['text'] != ''].copy()

In [4]:
# Carve a stratified validation set out of the labelled IMDB training data so
# that the train and validation splits keep the same label proportions.
imdb_full_train_raw = pd.read_csv(config.RAW_IMDB_TRAIN_PATH)
imdb_train_raw, imdb_val_raw = train_test_split(
    imdb_full_train_raw,
    test_size=0.2,      # 20% of the rows go to validation
    random_state=42,    # fixed seed keeps the split reproducible across runs
    stratify=imdb_full_train_raw['label']
)

# One entry per output file. A task either carries an in-memory frame ('df')
# or the path of a raw CSV to read ('source_path'); 'label_col' is None for
# the unlabelled data.
tasks = [
    {'df': imdb_train_raw, 'dest_path': config.CLEAN_IMDB_TRAIN_PATH, 'text_col': 'text', 'label_col': 'label'},
    {'df': imdb_val_raw,   'dest_path': config.CLEAN_IMDB_VAL_PATH,   'text_col': 'text', 'label_col': 'label'},

    {'source_path': config.RAW_IMDB_TEST_PATH, 'dest_path': config.CLEAN_IMDB_TEST_PATH, 'text_col': 'text', 'label_col': 'label'},
    {'source_path': config.RAW_IMDB_UNSUPERVISED_PATH, 'dest_path': config.CLEAN_IMDB_UNSUPERVISED_PATH, 'text_col': 'text', 'label_col': None},

    {'source_path': config.RAW_RT_TRAIN_PATH, 'dest_path': config.CLEAN_RT_TRAIN_PATH, 'text_col': 'review', 'label_col': 'sentiment'},
    {'source_path': config.RAW_RT_VAL_PATH, 'dest_path': config.CLEAN_RT_VAL_PATH, 'text_col': 'review', 'label_col': 'sentiment'},
    {'source_path': config.RAW_RT_TEST_PATH, 'dest_path': config.CLEAN_RT_TEST_PATH, 'text_col': 'review', 'label_col': 'sentiment'},
]


for task in tasks:
    df_to_process = pd.read_csv(task['source_path']) if 'source_path' in task else task['df']

    # NOTE(review): positive_label_val=1 is applied to every labelled source;
    # this assumes the 'sentiment' column also encodes the positive class as
    # 1 -- confirm against the raw files, otherwise all labels become 0.
    clean_df = clean_and_standardize_df(
        df=df_to_process,
        text_col=task['text_col'],
        label_col=task.get('label_col'),
        positive_label_val=1
    )

    # to_csv does not create missing parent directories, so make sure the
    # destination folder exists before writing (relevant on a fresh checkout).
    dest_dir = os.path.dirname(task['dest_path'])
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    clean_df.to_csv(task['dest_path'], index=False)
    print(f"  -> Saved clean file with {len(clean_df)} rows.")



  -> Saved clean file with 20000 rows.
  -> Saved clean file with 5000 rows.
  -> Saved clean file with 25000 rows.
  -> Saved clean file with 50000 rows.
  -> Saved clean file with 7035 rows.
  -> Saved clean file with 449 rows.
  -> Saved clean file with 3178 rows.
