In [2]:
import random
import csv
import os
import uuid
import pandas as pd
from datetime import datetime, timedelta

# -----------------------------
# CONFIG (You can modify these)
# -----------------------------
N = 5000
OUT_PATH = "../data/synthetic_uae_reviews.csv"

# Ensure folder exists
os.makedirs("data", exist_ok=True)

# -----------------------------
# Static lookup lists
# -----------------------------
CITIES = ["Dubai", "Abu Dhabi", "Sharjah", "Ajman", "Al Ain", "Ras Al Khaimah"]
PRODUCT_CATEGORIES = ["Electronics", "Fashion", "Beauty", "Home", "Grocery", "Sports", "Toys"]
REVIEW_SOURCES = ["Web", "iOS App", "Android App", "ThirdParty"]
SLANG_WORDS = ["lol", "omg", "btw", "u", "luv", "thx", "gr8", "wtf"]

POSITIVE_PHRASES = [
    "loved it", "highly recommend", "works great", "exceeded expectations",
    "very happy", "perfect", "five stars", "would buy again"
]

NEGATIVE_PHRASES = [
    "very disappointed", "stopped working", "not worth it", "poor quality",
    "waste of money", "one star", "do not buy"
]

NEUTRAL_PHRASES = [
    "average", "okay", "not bad", "as expected", "decent for the price"
]

# -----------------------------
# Helper functions
# -----------------------------
def random_date(days_back=180):
    base = datetime.now()
    rand_days = random.randint(0, days_back)
    rand_seconds = random.randint(0, 86400)
    return (base - timedelta(days=rand_days, seconds=rand_seconds)).strftime("%Y-%m-%d %H:%M:%S")


def map_rating_to_sentiment(rating: int) -> str:
    if rating <= 2:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"


def generate_review_text(sentiment, category, contains_slang):
    if sentiment == "positive":
        template = random.choice(POSITIVE_PHRASES)
    elif sentiment == "negative":
        template = random.choice(NEGATIVE_PHRASES)
    else:
        template = random.choice(NEUTRAL_PHRASES)

    intro = "I bought this" if random.random() > 0.5 else "Bought this for"
    category_phrase = f"for {category.lower()}"
    text = f"{intro} {category_phrase}. {template}."

    if category == "Electronics" and sentiment != "negative":
        text += " Battery life is good and it charges fast."
    if category == "Fashion" and sentiment == "negative":
        text += " The size runs small and the fabric feels cheap."

    if contains_slang:
        slang = random.choice(SLANG_WORDS)
        if random.random() > 0.5:
            text = f"{slang}! " + text
        else:
            text = text + f" {slang}"

    return text


# -----------------------------
# Generate dataset
# -----------------------------
rows = []

for _ in range(N):
    category = random.choice(PRODUCT_CATEGORIES)

    if category == "Electronics":
        rating = random.choices([1,2,3,4,5], weights=[5,10,20,30,35])[0]
    else:
        rating = random.choices([1,2,3,4,5], weights=[5,5,20,30,40])[0]

    sentiment = map_rating_to_sentiment(rating)
    contains_slang = random.random() < 0.12
    text = generate_review_text(sentiment, category, contains_slang)

    rows.append({
        "review_id": str(uuid.uuid4()),
        "review_text": text,
        "product_category": category,
        "rating": rating,
        "sentiment": sentiment,
        "length_chars": len(text),
        "length_tokens": len(text.split()),
        "review_source": random.choice(REVIEW_SOURCES),
        "contains_slang": contains_slang,
        "review_date": random_date(days_back=365),
        "city": random.choice(CITIES)
    })

# Convert to DataFrame
df = pd.DataFrame(rows)

# Save
df.to_csv(OUT_PATH, index=False)
print(f"✅ Saved synthetic dataset: {OUT_PATH}")

df.head()


✅ Saved synthetic dataset: ../data/synthetic_uae_reviews.csv


Unnamed: 0,review_id,review_text,product_category,rating,sentiment,length_chars,length_tokens,review_source,contains_slang,review_date,city
0,700d5322-6701-48f1-a7ca-c8432efbe3f3,u! Bought this for for electronics. highly rec...,Electronics,5,positive,95,16,Android App,True,2025-09-28 06:41:55,Abu Dhabi
1,6e34c983-b9e1-48de-adef-0e90345c2513,Bought this for for electronics. five stars. B...,Electronics,4,positive,86,15,iOS App,False,2025-03-11 12:37:57,Ajman
2,8625b4c2-386b-4769-8d66-3aa58831a541,Bought this for for home. not bad.,Home,3,neutral,34,7,iOS App,False,2025-10-24 11:12:22,Ajman
3,9b42823a-aeb0-4565-80e8-3a264ebf0f50,Bought this for for grocery. highly recommend.,Grocery,5,positive,46,7,iOS App,False,2025-05-14 03:04:01,Sharjah
4,60d25eca-4575-4810-8388-724acf444f83,I bought this for electronics. highly recommen...,Electronics,5,positive,90,15,Android App,False,2025-03-01 10:37:05,Al Ain
