In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


def add_nulls(series, ratio=0.05):
    n = int(len(series) * ratio)
    idx = np.random.choice(series.index, n, replace=False)
    series.loc[idx] = None
    return series


In [2]:
def generate_weather_data(rows=5000):
    start_date = datetime(2024, 1, 1)
    data = []

    for i in range(rows):
        # Generate valid timestamp
        dt = start_date + timedelta(hours=i)

        # Randomly break some timestamps
        dt_bad = random.choice([
            dt.strftime("%Y-%m-%d %H:%M"),
            dt.strftime("%d/%m/%Y %I%p"),
            dt.strftime("%Y-%m-%dT%H:%MZ"),
            "Unknown",
            "2099-13-40 25:61"
        ])

        temperature = np.random.choice([
            np.random.normal(15, 8),     # normal values
            np.random.uniform(-30, -10), # extreme low
            np.random.uniform(40, 60)    # extreme high
        ], p=[0.9, 0.05, 0.05])

        humidity = np.random.choice([
            np.random.randint(20, 100),
            np.random.randint(-10, 150)  # outliers
        ], p=[0.9, 0.1])

        rain = np.random.choice([
            np.random.uniform(0, 30),
            np.random.uniform(50, 150)   # outlier heavy rain
        ], p=[0.9, 0.1])

        wind = np.random.choice([
            np.random.uniform(0, 80),
            np.random.uniform(100, 250)  # extreme winds
        ], p=[0.9, 0.1])

        visibility = np.random.choice([
            np.random.randint(50, 10000),
            np.random.randint(20000, 50000)  # extreme visibility
        ], p=[0.9, 0.1])

        data.append([
            i+1,
            dt_bad,
            random.choice(["London", None]),
            random.choice(["Winter", "Spring", "Summer", "Autumn", None]),
            temperature,
            humidity,
            rain,
            wind,
            visibility,
            random.choice(["Clear", "Rain", "Storm", "Fog", "Snow", None]),
            np.random.uniform(950, 1050)
        ])

    df = pd.DataFrame(data, columns=[
        "weather_id","date_time","city","season","temperature_c","humidity",
        "rain_mm","wind_speed_kmh","visibility_m","weather_condition","air_pressure_hpa"
    ])

    # Insert random NULLs
    for col in df.columns:
        df[col] = add_nulls(df[col], ratio=0.03)

    # Duplicate some rows
    df = pd.concat([df, df.sample(50)], ignore_index=True)

    df.to_csv("Weather.csv", index=False)
    print("✔ Weather.csv generated successfully!")



In [3]:
def generate_traffic_data(rows=5000):
    start_date = datetime(2024, 1, 1)
    areas = ["Camden", "Chelsea", "Islington", "Southwark", "Kensington"]

    data = []

    for i in range(rows):
        dt = start_date + timedelta(hours=i)

        dt_bad = random.choice([
            dt.strftime("%Y-%m-%d %H:%M"),
            dt.strftime("%d/%m/%Y %I%p"),
            dt.strftime("%Y-%m-%dT%H:%MZ"),
            "TBD",
            "2099-00-00 99:99"
        ])

        vehicle_count = np.random.choice([
            np.random.randint(0, 5000),
            np.random.randint(10000, 30000)  # extreme outliers
        ], p=[0.9, 0.1])

        avg_speed = np.random.choice([
            np.random.uniform(3, 120),
            np.random.uniform(-20, -1)  # negative speed (invalid)
        ], p=[0.9, 0.1])

        accident_count = np.random.choice([
            np.random.randint(0, 10),
            np.random.randint(20, 60)  # extreme
        ], p=[0.95, 0.05])

        visibility = np.random.choice([
            np.random.randint(50, 9000),
            np.random.randint(10000, 50000)
        ], p=[0.9, 0.1])

        data.append([
            i+1,
            dt_bad,
            random.choice(["London", None]),
            random.choice(areas + [None]),
            vehicle_count,
            avg_speed,
            accident_count,
            random.choice(["Low", "Medium", "High", None]),
            random.choice(["Dry", "Wet", "Snowy", "Damaged", None]),
            visibility
        ])

    df = pd.DataFrame(data, columns=[
        "traffic_id","date_time","city","area","vehicle_count","avg_speed_kmh",
        "accident_count","congestion_level","road_condition","visibility_m"
    ])

    # Insert random NULLs
    for col in df.columns:
        df[col] = add_nulls(df[col], ratio=0.03)

    # Duplicate rows
    df = pd.concat([df, df.sample(50)], ignore_index=True)

    df.to_csv("Traffic.csv", index=False)
    print("✔ Traffic.csv generated successfully!")




In [4]:
generate_weather_data()


✔ Weather.csv generated successfully!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the c

In [5]:
generate_traffic_data()

✔ Traffic.csv generated successfully!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[idx] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the c