In [2]:
import pandas as pd
import numpy as np
import uuid
import os
from faker import Faker
import geopandas as gpd
from shapely.geometry import Point
from datetime import datetime, timedelta
import zipfile

# ------------------------------
# Global setup: Faker instance, output directory, dataset size
# ------------------------------
fake = Faker()

# Every generated file is written under this directory; it is created up
# front so later writes cannot fail on a missing folder.
output_dir = "city_datasets"
os.makedirs(output_dir, exist_ok=True)

# Row count shared by every dataset generated below.
n = 10_000

# ------------------------------
# Tamil Nadu locations and transit stops sampled by the generators below
# ------------------------------
# Place names; list order is significant for seeded sampling, so keep it.
tn_locations = [
    "Chennai",
    "Coimbatore",
    "Madurai",
    "Tiruchirappalli",
    "Tirunelveli",
    "Salem",
    "Erode",
    "Vellore",
    "Thoothukudi",
    "Dindigul",
    "Kanchipuram",
    "Hosur",
    "Cuddalore",
    "Karur",
    "Nagapattinam",
    "Sivakasi",
    "Tiruvannamalai",
    "Krishnagiri",
    "Namakkal",
    "Virudhunagar",
    "Perambalur",
    "Ariyalur",
    "Dharmapuri",
    "Tiruvarur",
    "Ramanathapuram",
]

# Transit stop names; list order is significant for seeded sampling.
tn_stops = [
    "Chennai Central",
    "Egmore",
    "Tambaram",
    "Koyambedu",
    "Vadapalani",
    "Coimbatore Junction",
    "Salem Town",
    "Madurai Junction",
    "Trichy Fort",
    "Tirunelveli",
    "Nagercoil",
    "Vellore Town",
    "Erode Junction",
    "Thanjavur",
    "Kanchipuram Stop",
    "Hosur Stop",
    "Cuddalore Junction",
    "Karur Stop",
    "Nagapattinam Stop",
    "Sivakasi Stop",
    "Tiruvannamalai Stop",
    "Krishnagiri Stop",
    "Namakkal Junction",
    "Virudhunagar Stop",
    "Perambalur Stop",
    "Ariyalur Stop",
    "Dharmapuri Stop",
    "Tiruvarur Stop",
    "Ramanathapuram Stop",
]

# Complaint texts grouped under the category they are filed as.
# Group order (and text order inside a group) determines the key order of
# complaint_category_map, so keep both stable.
_complaints_by_category = {
    "Garbage": (
        "Garbage not cleared for 3 days",
        "Overflowing garbage bins",
    ),
    "Streetlight": ("Streetlight not working in main road",),
    "Pothole": ("Pothole near bus stop causing accidents",),
    "Water Leakage": ("Water leakage from underground pipe",),
    "Transport Delay": (
        "Bus delay causing inconvenience",
        "Metro late arrival issue",
    ),
    "Blocked Drain": (
        "Open drainage complaint",
        "Blocked stormwater drain",
    ),
    "Traffic Signal Issue": ("Traffic signal not functioning",),
    "Illegal Parking": ("Illegal parking complaint",),
    "Water Supply Issue": ("No water supply issue",),
    "Road Damage": ("Road surface broken causing accidents",),
    "Public Toilet Issue": ("Public toilet maintenance issue",),
    "Street Cleaning Issue": ("Street cleaning not done",),
    "Noise Complaint": ("Noise pollution complaint",),
    "Encroachment": ("Encroachment complaint",),
    "Unauthorized Billboard": ("Unauthorized billboard issue",),
    "Air Pollution": ("Air pollution from industries",),
}

# Flat lookup: complaint text -> category label.
complaint_category_map = {
    complaint_text: category
    for category, complaint_texts in _complaints_by_category.items()
    for complaint_text in complaint_texts
}


# ------------------------------
# Generate Neighborhoods
# ------------------------------
# One random UUID string and one sequential display name per neighborhood.
neigh_ids = list(map(str, (uuid.uuid4() for _ in range(n))))
neigh_names = [f"Neighborhood-{idx}" for idx in range(1, n + 1)]

# Uniformly random coordinates inside a fixed lat/lon box
# (presumably a rough bounding box for Tamil Nadu -- confirm).
latitudes = np.random.uniform(low=8.0, high=13.5, size=n)
longitudes = np.random.uniform(low=76.5, high=80.5, size=n)

# Timestamps generated later are offsets from this point, one year back.
start_date = datetime.now() - timedelta(days=365)

# --------------------------
# 1. 311 Service Request Data
# --------------------------
# Each request is stamped at start_date plus a whole number of minutes
# drawn from [0, 525600), i.e. within 365 days (365 * 24 * 60 minutes).
service_dates = [
    start_date + timedelta(minutes=np.random.randint(0, 525600))
    for _ in range(n)
]

# np.random.choice needs a 1-D sequence, so the dict keys are listed first.
complaint_texts = list(complaint_category_map.keys())
selected_complaints = np.random.choice(complaint_texts, n)

# Derived columns: formatted timestamp and the category of each complaint.
request_timestamps = [stamp.strftime("%d-%m-%Y %H:%M:%S") for stamp in service_dates]
request_categories = [complaint_category_map[text] for text in selected_complaints]

service_df = pd.DataFrame({
    "Neighborhood_ID": neigh_ids,
    "Request_ID": [str(uuid.uuid4()) for _ in range(n)],
    "DateTime": request_timestamps,
    "Complaints": selected_complaints,
    "Category": request_categories,
    "Location": np.random.choice(tn_locations, n),
    "Stop_Name": np.random.choice(tn_stops, n),
    "Latitude": latitudes,
    "Longitude": longitudes,
    "Status": np.random.choice(["Open", "Closed"], n),
    "Resolution_Time(hrs)": np.random.randint(1, 200, n),
})

service_csv_path = os.path.join(output_dir, "311_service_requests.csv")
service_df.to_csv(service_csv_path, index=False)

print("✅ 311 Service Requests generated with complaint-related categories")


# --------------------------
# 2. Public Transport Usage Data
# --------------------------
# Scheduled times: start_date plus a random whole-minute offset in
# [0, 525600), i.e. within 365 days.
transport_dates = [start_date + timedelta(minutes=np.random.randint(0, 525600)) for _ in range(n)]
scheduled_times = pd.to_datetime(transport_dates)

# Sample each delay once (0-59 whole minutes) and reuse it for both the
# actual time and the Delay column. Re-deriving it through
# `(actual - scheduled).seconds` would only read the seconds *component* of
# the timedelta, which silently drops whole days and misreports negative
# deltas if the delay range is ever widened.
delay_minutes = np.random.randint(0, 60, n)
actual_times = scheduled_times + pd.to_timedelta(delay_minutes, unit='m')

transport_df = pd.DataFrame({
    "Neighborhood_ID": neigh_ids,
    # Zero-padded to at least three digits: R000, R001, ..., R999, R1000, ...
    "Route_ID": [f"R{i:03d}" for i in range(n)],
    "Stop_Name": np.random.choice(tn_stops, n),
    "Scheduled_Time": scheduled_times.strftime("%d-%m-%Y %H:%M:%S"),
    "Actual_Time": actual_times.strftime("%d-%m-%Y %H:%M:%S"),
    "Delay(mins)": delay_minutes,
    "Daily_Ridership": np.random.randint(500, 5000, n),
})
transport_df.to_csv(os.path.join(output_dir, "public_transport_usage.csv"), index=False)

# --------------------------
# 3. Social Media Sentiment Data
# --------------------------
# Post timestamps: start_date plus a random whole-minute offset within 365 days.
social_dates = [
    start_date + timedelta(minutes=np.random.randint(0, 525600))
    for _ in range(n)
]

post_ids = [str(uuid.uuid4()) for _ in range(n)]
post_locations = np.random.choice(tn_locations, n)
post_timestamps = [stamp.strftime("%d-%m-%Y %H:%M:%S") for stamp in social_dates]
# np.random.choice needs a 1-D sequence, so sample from the listed dict keys
# rather than from the dict itself.
post_texts = np.random.choice(list(complaint_category_map.keys()), n)
# Despite the column name, the values are categorical labels, not numbers.
post_sentiments = np.random.choice(["Positive", "Negative", "Neutral"], n)

social_df = pd.DataFrame({
    "Neighborhood_ID": neigh_ids,
    "Post_ID": post_ids,
    "User_Location": post_locations,
    "Timestamp": post_timestamps,
    "Text_Content": post_texts,
    "Sentiment_Score": post_sentiments,
})

social_csv_path = os.path.join(output_dir, "social_media_sentiment.csv")
social_df.to_csv(social_csv_path, index=False)

# --------------------------
# 4. Demographic Data
# --------------------------
# One row per neighborhood; columns are added in the order they appear in
# the exported CSV. Every attribute is sampled independently.
demo_df = pd.DataFrame({"Neighborhood_ID": neigh_ids})
demo_df["Total_Population"] = np.random.randint(1000, 100000, n)
demo_df["Population_Density"] = np.random.randint(5000, 20000, n)
demo_df["Income_Level"] = np.random.choice(["Low", "Medium", "High"], n)
demo_df["Age_Group"] = np.random.choice(["0-14", "15-24", "25-44", "45-64", "65+"], n)
# Wards are numbered sequentially from 1, one per row.
demo_df["Ward_ID"] = [f"Ward-{ward_no}" for ward_no in range(1, n + 1)]
demo_df["Zone_ID"] = np.random.choice(["North", "South", "East", "West", "Central"], n)

demo_csv_path = os.path.join(output_dir, "demographics.csv")
demo_df.to_csv(demo_csv_path, index=False)

# --------------------------
# 5. Neighborhood Shapefile (disabled: the export code below is commented out)
# --------------------------
# neigh_info_df = pd.DataFrame({
#     "Neighborhood_ID": neigh_ids,
#     "Neighborhood_Name": neigh_names,
#     "Latitude": latitudes,
#     "Longitude": longitudes
# })
# gdf = gpd.GeoDataFrame(neigh_info_df, geometry=gpd.points_from_xy(neigh_info_df.Longitude, neigh_info_df.Latitude))
# gdf.set_crs(epsg=4326, inplace=True)
# gdf.to_file(os.path.join(output_dir, "neighborhoods.shp"))

# --------------------------
# 6. Real-Time TN Public Complaints (expanded version; disabled: the code below is commented out)
# --------------------------
# timestamps = [start_date + timedelta(minutes=np.random.randint(0, 525600)) for _ in range(n)]
# status_options = ["Open", "Closed"]
# resolution_times = np.random.randint(1, 200, n)

# real_time_df = pd.DataFrame({
#     "Neighborhood_ID": neigh_ids,
#     "Request_ID": [str(uuid.uuid4()) for _ in range(n)],
#     "DateTime": [ts.strftime("%d-%m-%Y %H:%M:%S") for ts in timestamps],
#     "Category": np.random.choice(["Garbage", "Pothole", "Streetlight", "Water Leakage"], n),
#     "Complaints": np.random.choice(complaint_category_map, n),
#     "Location": np.random.choice(tn_locations, n),
#     "Stop_Name": np.random.choice(tn_stops, n),
#     "Latitude": latitudes,
#     "Longitude": longitudes,
#     "Status": np.random.choice(status_options, n),
#     "Resolution_Time(hrs)": resolution_times
# })
# real_time_df.to_csv(os.path.join(output_dir, "real_time_tn_public_complaints.csv"), index=False)

# --------------------------
# Optional: ZIP all datasets
# --------------------------
# Only dataset files are archived. The extension filter also keeps the
# archive itself (which is created inside output_dir) out of the walk results.
archive_extensions = (".csv", ".shp", ".shx", ".dbf", ".prj")
zip_path = os.path.join(output_dir, "city_datasets_10000_realistic.zip")

# ZIP_DEFLATED actually compresses the members; the ZipFile default
# (ZIP_STORED) would only bundle them uncompressed.
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, files in os.walk(output_dir):
        for file in files:
            if file.endswith(archive_extensions):
                file_path = os.path.join(root, file)
                # Store paths relative to output_dir so the archive does not
                # embed the output folder name.
                zipf.write(file_path, os.path.relpath(file_path, output_dir))

print(f"✅ All datasets saved independently & compressed into: {zip_path}")


✅ 311 Service Requests generated with complaint-related categories
✅ All datasets saved independently & compressed into: city_datasets\city_datasets_10000_realistic.zip
