In [10]:
# -----------------------------
# Preprocessing Notebook - SentimentSense

import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Transformers
from transformers import AutoTokenizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rajit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rajit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rajit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Load dataset from previous EDA step
data_path = "../data/synthetic_uae_reviews.csv"
df = pd.read_csv(data_path)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (5000, 11)


Unnamed: 0,review_id,review_text,product_category,rating,sentiment,length_chars,length_tokens,review_source,contains_slang,review_date,city
0,700d5322-6701-48f1-a7ca-c8432efbe3f3,u! Bought this for for electronics. highly rec...,Electronics,5,positive,95,16,Android App,True,2025-09-28 06:41:55,Abu Dhabi
1,6e34c983-b9e1-48de-adef-0e90345c2513,Bought this for for electronics. five stars. B...,Electronics,4,positive,86,15,iOS App,False,2025-03-11 12:37:57,Ajman
2,8625b4c2-386b-4769-8d66-3aa58831a541,Bought this for for home. not bad.,Home,3,neutral,34,7,iOS App,False,2025-10-24 11:12:22,Ajman
3,9b42823a-aeb0-4565-80e8-3a264ebf0f50,Bought this for for grocery. highly recommend.,Grocery,5,positive,46,7,iOS App,False,2025-05-14 03:04:01,Sharjah
4,60d25eca-4575-4810-8388-724acf444f83,I bought this for electronics. highly recommen...,Electronics,5,positive,90,15,Android App,False,2025-03-01 10:37:05,Al Ain


In [12]:
# Function to clean text
def clean_text(text):
    text = str(text).lower()                           # Lowercase
    text = re.sub(r'\b(?:lol|omg|btw|u|luv|thx|gr8|wtf)\b', '', text)  # Remove slang
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # URs
    text = re.sub(r'\@\w+|\#','', text)               # Mentions /hashtags
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)       # Special chars
    text = re.sub(r'\s+', ' ', text).strip()          # Extrawhitespace
    return text

# Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = clean_text(text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)


In [13]:
# Apply preprocessing (can take some time for 5000+ rows)
df['cleaned_text'] = df['review_text'].apply(preprocess_text)

# to Preview
df[['review_text','cleaned_text']].head()


Unnamed: 0,review_text,cleaned_text
0,u! Bought this for for electronics. highly rec...,bought electronics highly recommend battery li...
1,Bought this for for electronics. five stars. B...,bought electronics five star battery life good...
2,Bought this for for home. not bad.,bought home bad
3,Bought this for for grocery. highly recommend.,bought grocery highly recommend
4,I bought this for electronics. highly recommen...,bought electronics highly recommend battery li...


In [14]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (5000, 233)


In [6]:
# Load RoBERTa tokenizer
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Example: tokenize first 5 reviews
tokens = roberta_tokenizer(df['review_text'].tolist()[:5], 
                           padding='max_length', 
                           truncation=True, 
                           max_length=128,
                           return_tensors='pt')
tokens


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



{'input_ids': tensor([[    0,   100,  2162,    42,    13,  2734,     4,  2200,  5940,     4,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [15]:
# Map sentiment to numerical labels
sentiment_map = {"negative":0, "neutral":1, "positive":2}
df['label'] = df['sentiment'].map(sentiment_map)

# Train/validation/test split
X_train, X_temp, y_train, y_temp = train_test_split(df['cleaned_text'], df['label'], 
                                                    test_size=0.3, random_state=42, stratify=df['label'])
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, 
                                                test_size=0.5, random_state=42, stratify=y_temp)

print("Train:", X_train.shape, "Validation:", X_val.shape, "Test:", X_test.shape)

# Optional to handle class imbalance (upsampling minority classes)
train_df = pd.concat([X_train, y_train], axis=1)
max_size = train_df['label'].value_counts().max()
lst = [train_df]
for class_index, group in train_df.groupby('label'):
    lst.append(group.sample(max_size-len(group), replace=True))
df_train_balanced = pd.concat(lst)
print("Balanced training class distribution:\n", df_train_balanced['label'].value_counts())


Train: (3500,) Validation: (750,) Test: (750,)
Balanced training class distribution:
 0    2425
2    2425
1    2425
Name: label, dtype: int64


In [16]:
# Save processed CSV
processed_path = "../data/processed/processed_reviews.csv"
df.to_csv(processed_path, index=False)
print(f"✅ Preprocessed data saveding to {processed_path}")


✅ Preprocessed data saveding to ../data/processed/processed_reviews.csv
