# Libraries

In [84]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# pyarrow 
import pyarrow.parquet as pq


### Simulation Scenarios Must Include:  
- Heavy rain
- Temperature extremes
- High humidity
- Low visibility
- Strong winds  
### Main Outputs:  
- Probability of:
    - Traffic jams
    - Accidents under bad weather
- Distribution of congestion probabilities across thousands of simulation runs

In [85]:
# Load the merged traffic/weather table, keeping Arrow-backed column dtypes.
data = pd.read_parquet(
    "../clean_data/merged_data.parquet",
    engine="pyarrow",
    dtype_backend="pyarrow",
)

In [86]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,traffic_id,date_time,city,area,vehicle_count,avg_speed_kmh,accident_count,congestion_level,road_condition,visibility_m_x,weather_id,season,temperature_c,humidity,rain_mm,wind_speed_kmh,visibility_m_y,weather_condition,air_pressure_hpa
0,5,2024-01-01 04:00:00,London,Chelsea,3748,107.906517,2,Low,Snowy,8711,5,Summer,15.360175,91.0,16.867431,24.270452,4666,Rain,951.177326
1,8,2024-01-01 07:00:00,London,Kensington,3218,95.727944,6,Medium,Snowy,6887,8,Spring,17.661852,59.0,9.826941,78.793561,1508,Clear,1033.559398
2,9,2024-01-01 08:00:00,London,Kensington,259,88.98006,1,High,Dry,1372,9,Winter,12.073337,50.0,26.247984,62.388994,2597,Snow,956.031503
3,10,2024-01-01 09:00:00,London,Chelsea,1404,36.028368,1,Medium,Dry,3294,10,Winter,8.638003,18.0,16.867431,45.055543,6285,Fog,963.563889
4,11,2024-01-01 10:00:00,London,Camden,41,72.182852,0,High,Snowy,3225,11,Winter,9.430213,92.0,24.425748,68.345731,7919,Clear,960.12186


In [87]:
# Show the column labels together with the (rows, columns) shape.
(data.columns, data.shape)

(Index(['traffic_id', 'date_time', 'city', 'area', 'vehicle_count',
        'avg_speed_kmh', 'accident_count', 'congestion_level', 'road_condition',
        'visibility_m_x', 'weather_id', 'season', 'temperature_c', 'humidity',
        'rain_mm', 'wind_speed_kmh', 'visibility_m_y', 'weather_condition',
        'air_pressure_hpa'],
       dtype='object'),
 (1512, 19))

1. Define a model that represents the process
2. Specify the probability distributions
3. Generate random samples
4. Run the simulation
5. Analyze the simulation results

In [88]:
# Print per-column dtypes and non-null counts for the loaded table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1512 entries, 0 to 1511
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype                 
---  ------             --------------  -----                 
 0   traffic_id         1512 non-null   int64[pyarrow]        
 1   date_time          1512 non-null   timestamp[ns][pyarrow]
 2   city               1512 non-null   string[pyarrow]       
 3   area               1512 non-null   string[pyarrow]       
 4   vehicle_count      1512 non-null   int64[pyarrow]        
 5   avg_speed_kmh      1512 non-null   double[pyarrow]       
 6   accident_count     1512 non-null   int64[pyarrow]        
 7   congestion_level   1512 non-null   string[pyarrow]       
 8   road_condition     1512 non-null   string[pyarrow]       
 9   visibility_m_x     1512 non-null   int64[pyarrow]        
 10  weather_id         1512 non-null   int64[pyarrow]        
 11  season             1512 non-null   string[pyarrow]       
 12  temper

In [89]:
# Frequency table for each categorical traffic column.
for column in ("city", "area", "congestion_level", "road_condition"):
    print(data[column].value_counts())

city
London    1512
Name: count, dtype: int64[pyarrow]
area
Chelsea       571
Southwark     258
Kensington    233
Islington     226
Camden        224
Name: count, dtype: int64[pyarrow]
congestion_level
Low       799
High      357
Medium    356
Name: count, dtype: int64[pyarrow]
road_condition
Wet        634
Dry        322
Snowy      285
Damaged    271
Name: count, dtype: int64[pyarrow]


In [90]:
# Frequency table for each categorical weather column.
for column in ("season", "weather_condition"):
    print(data[column].value_counts())

season
Autumn    658
Winter    294
Summer    281
Spring    279
Name: count, dtype: int64[pyarrow]
weather_condition
Rain     531
Storm    252
Snow     250
Clear    242
Fog      237
Name: count, dtype: int64[pyarrow]


In [91]:
# Columns of the merged table grouped under the traffic name.
traffic_col = [
    "traffic_id",
    "date_time",
    "city",
    "area",
    "vehicle_count",
    "avg_speed_kmh",
    "accident_count",
    "congestion_level",
    "road_condition",
    "visibility_m_x",
]

# Columns of the merged table grouped under the weather name.
weather_col = [
    "weather_id",
    "season",
    "temperature_c",
    "humidity",
    "rain_mm",
    "wind_speed_kmh",
    "visibility_m_y",
    "weather_condition",
    "air_pressure_hpa",
]

## Data Generation

In [92]:
# Categorical weather columns: sampled from observed category frequencies.
cat_col = [
    "season",
    "weather_condition",
]

# Numeric weather columns: sampled from a kernel density estimate.
num_col = [
    "temperature_c",
    "humidity",
    "rain_mm",
    "wind_speed_kmh",
    "visibility_m_y",
    "air_pressure_hpa",
]

In [93]:
# generate new day
def _draw_numeric(observed, count, rng, kde_seed, floor_at_zero):
    """Draw ``count`` values resembling the 1-D float array ``observed``.

    Uses a Gaussian KDE when ``observed`` has at least two distinct values.
    Otherwise the KDE cannot be fitted, so values are bootstrapped from
    ``observed`` (or NaN is returned when there is nothing to sample from).
    """
    if observed.size == 0:
        return np.full(count, np.nan)
    if observed.size < 2 or np.ptp(observed) == 0:
        # gaussian_kde raises on a single point / zero-variance sample.
        draws = rng.choice(observed, size=count)
    else:
        draws = stats.gaussian_kde(observed).resample(size=count, seed=kde_seed).ravel()
    return np.maximum(draws, 0) if floor_at_zero else draws


def generate_new_day(n_days=1, source=None, numeric_cols=None, categorical_cols=None, seed=None):
    """Simulate ``n_days`` synthetic weather records from historical data.

    A weather condition is drawn for every simulated day from the empirical
    frequencies of ``weather_condition``. Conditional on that draw, each
    numeric column is sampled from a Gaussian KDE fitted to the matching
    historical rows, and each categorical column from its empirical
    frequencies within those rows.

    Parameters
    ----------
    n_days : int, default 1
        Number of records to simulate. ``0`` yields an empty frame that still
        carries every output column.
    source : pandas.DataFrame, optional
        Historical table to learn from. Defaults to the notebook-level ``data``.
    numeric_cols : sequence of str, optional
        Numeric columns to simulate. Defaults to the notebook-level ``num_col``.
    categorical_cols : sequence of str, optional
        Categorical columns to simulate. Defaults to the notebook-level
        ``cat_col``. ``weather_condition`` is skipped if listed, because it is
        the conditioning variable and is already fixed for each row.
    seed : int or numpy.random.Generator, optional
        Makes the draw reproducible. When omitted, NumPy's global random state
        is used (so a prior ``np.random.seed(...)`` is still honoured).

    Returns
    -------
    pandas.DataFrame
        ``n_days`` rows with columns ``weather_condition``, then the numeric
        columns, then the categorical columns.

    Raises
    ------
    ValueError
        If ``n_days`` is negative.

    Notes
    -----
    KDE tails can produce negative values. A numeric column is floored at zero
    only when its history contains no negative values; columns that do go
    negative in the history (e.g. sub-zero temperatures) are left unclipped so
    the simulated distribution is not truncated at 0.
    """
    if n_days < 0:
        raise ValueError("n_days must be non-negative")

    condition_col = "weather_condition"
    history_all = data if source is None else source
    numeric = list(num_col if numeric_cols is None else numeric_cols)
    categorical = [
        col
        for col in (cat_col if categorical_cols is None else categorical_cols)
        if col != condition_col
    ]

    # Seeded runs get a private Generator; unseeded runs keep using the global state.
    rng = np.random if seed is None else np.random.default_rng(seed)
    kde_seed = None if seed is None else rng

    condition_freq = history_all[condition_col].value_counts(normalize=True)
    sim_weather = rng.choice(
        condition_freq.index.to_numpy(dtype=object),
        size=n_days,
        p=condition_freq.to_numpy(dtype=float),
    )

    # Decide once, over the whole history, which columns are non-negative quantities.
    floor_at_zero = {
        col: not np.any(history_all[col].to_numpy(dtype=float, na_value=np.nan) < 0)
        for col in numeric
    }

    # Pre-allocate every output column so the frame has a stable schema and order.
    simulated = {condition_col: sim_weather}
    for col in numeric:
        simulated[col] = np.full(len(sim_weather), np.nan)
    for col in categorical:
        simulated[col] = np.empty(len(sim_weather), dtype=object)

    for condition in np.unique(sim_weather):
        rows = sim_weather == condition
        count = int(rows.sum())
        history = history_all[history_all[condition_col] == condition]

        for col in numeric:
            observed = history[col].to_numpy(dtype=float, na_value=np.nan)
            observed = observed[~np.isnan(observed)]  # KDE cannot handle missing values
            simulated[col][rows] = _draw_numeric(
                observed, count, rng, kde_seed, floor_at_zero[col]
            )

        for col in categorical:
            shares = history[col].value_counts(normalize=True)
            if shares.empty:
                continue  # nothing observed for this condition; leave as missing
            simulated[col][rows] = rng.choice(
                shares.index.to_numpy(dtype=object),
                size=count,
                p=shares.to_numpy(dtype=float),
            )

    return pd.DataFrame(simulated)

In [94]:
# Draw 1,000 synthetic records and preview the first rows.
test = generate_new_day(n_days=1000)
test.head()

Unnamed: 0,weather_condition,temperature_c,humidity,rain_mm,wind_speed_kmh,visibility_m_y,air_pressure_hpa,season
0,Rain,16.63436,79.900912,11.37105,81.281788,821.800054,1006.89361,Summer
1,Clear,31.557567,70.191356,0.0,7.366339,6077.986981,1021.574805,Summer
2,Clear,20.796236,29.578074,19.111202,62.792147,8064.627082,1041.214161,Spring
3,Storm,10.152353,74.415668,9.177467,7.045272,4588.708562,1012.114771,Autumn
4,Clear,11.958205,29.345603,9.2335,5.801015,2143.916531,1042.284893,Winter
