#### Requirements
Using Python 3.11

**Necessary**
- Numpy 
- Pandas
- Matplotlib
- Beautiful Soup
- Seaborn (nice graphs)

_Might Need_
- Scipy
- Scikit-Learn
- tqdm (decoration) 

`pip install numpy pandas matplotlib beautifulsoup4 seaborn scipy scikit-learn tqdm --no-cache-dir`


In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import os
import csv
import random
import glob
import json
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Data

## Inspection

Inspect the candidate datasets and decide which one to use.

Viable Datasets
- [Traffic flow in Romanian cities during and around lifting of COVID19 restrictions](https://data.mendeley.com/datasets/g64s8h9k57/3)
- [New Delhi Traffic Probe & Analytics 2024](https://www.kaggle.com/datasets/rawsi18/new-delhi-traffic-probe-and-analytics-2024)

In [None]:
# Sanity check on a single export: total `distance` over its features.
# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform's locale default.
with open("data/nd_traffic/new_delhi__2024-08-13_to_2024-08-13_.geojson", encoding="utf-8") as f:
    jsondata = json.load(f)

# NOTE(review): the first feature is skipped on purpose here — presumably it
# is not a road segment; confirm against the dataset description.
lis = [fea["properties"]["distance"] for fea in jsondata["features"][1:]]

sum(lis)

1403784.3000000007

In [None]:


# Sort the matches: glob returns paths in arbitrary, filesystem-dependent
# order, and later cells index the per-file results by position.
files = sorted(glob.glob("data/nd_traffic/*.geojson"))

In [None]:
# Build one per-segment DataFrame per GeoJSON file and collect them in all_df
# (same order as `files`).
all_df = []

for file in files:
    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    # NOTE(review): the first feature of every file is skipped — presumably it
    # is not a road segment; confirm against the dataset description.
    for feat in data["features"][1:]:
        # GeoJSON allows `"geometry": null`; treat a missing/null geometry
        # like missing coordinates instead of crashing on None.get(...).
        coords = (feat.get("geometry") or {}).get("coordinates")
        if not coords:
            continue
        # Representative point of the segment: mean of its vertices.
        # GeoJSON positions are [longitude, latitude].
        # Assumes a flat list of positions (LineString) — nested geometries
        # such as MultiLineString would need flattening first.
        lat = np.mean([i[1] for i in coords])
        long = np.mean([i[0] for i in coords])

        props = feat["properties"]

        # Segments without any probe counts carry no signal; skip them.
        seg_counts = props.get("segmentProbeCounts") or []
        if len(seg_counts) < 1:
            continue
        record = {}
        record['lat'] = lat
        record['long'] = long
        # abs() folds ids that differ only in sign into a single key —
        # presumably the two travel directions of one segment; confirm.
        record["segment_id"] = abs(props['segmentId'])
        record["street_name"] = props.get('streetName', f"Street_w_ID_{record['segment_id']}")
        record["frc"] = props['frc']
        record["speed_limit"] = props['speedLimit']
        record["length"] = props['distance']

        # One column per time set, e.g. pc_timeset_2, pc_timeset_3, ...
        for seg in seg_counts:
            record[f"pc_timeset_{seg['timeSet']}"] = seg.get("probeCount", 0)

        rows.append(record)

    if not rows:
        # No usable segment in this file: an empty frame has no "segment_id"
        # column, so the groupby below would raise KeyError.
        continue

    df = pd.DataFrame(rows)
    probecols = [c for c in df.columns if c.startswith("pc_")]
    # NOTE(review): the variable names say "sum"/"summed" but the aggregation
    # is a mean over rows sharing a segment_id; confirm which is intended.
    sum_probe_cols_df = df.groupby("segment_id")[probecols].mean().reset_index()
    # Static attributes: keep the first value seen for each segment_id.
    props_df = df.groupby("segment_id")[["lat", "long", "street_name", "frc", "speed_limit", "length"]].first().reset_index()
    df_dir_summed = sum_probe_cols_df.merge(props_df, on="segment_id")
    all_df.append(df_dir_summed)


In [90]:
# Total of the `length` column for the first per-file frame
# (all_df is filled in the iteration order of `files`).
all_df[0]['length'].sum()

np.float64(1229687.28)

In [None]:
# Stack every per-file frame into one long table with a fresh 0..n-1 index.
cat_df = pd.concat(all_df, ignore_index=True)

# Probe-count columns can hold NaN (e.g. a time set that is absent for a
# segment or a file); count those as 0 probes.
probecols = list(filter(lambda name: name.startswith("pc_"), cat_df.columns))
cat_df[probecols] = cat_df[probecols].fillna(0)


In [None]:
# Collapse the stacked table to one row per segment_id.
by_segment = cat_df.groupby("segment_id")

# Probe counts: mean over every row in which the segment appears.
avgd_probe_cols_df = by_segment[probecols].mean().reset_index()

# Static attributes: keep the first non-null value seen for each segment.
static_cols = ["lat", "long", "street_name", "frc", "speed_limit", "length"]
props_df = by_segment[static_cols].first().reset_index()

# Both frames carry exactly one row per segment_id, so this is a 1:1 join.
df_avgd_all = pd.merge(avgd_probe_cols_df, props_df, on="segment_id")

In [62]:
# (rows, columns) of the merged per-segment table.
df_avgd_all.shape

(24938, 31)

In [64]:
# Display the merged per-segment table (pandas truncates the rendered rows and columns).
df_avgd_all

Unnamed: 0,segment_id,pc_timeset_2,pc_timeset_3,pc_timeset_4,pc_timeset_5,pc_timeset_6,pc_timeset_7,pc_timeset_8,pc_timeset_9,pc_timeset_10,...,pc_timeset_22,pc_timeset_23,pc_timeset_24,pc_timeset_25,lat,long,street_name,frc,speed_limit,length
0,-13560345653226,0.000000,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.333333,...,0.000000,0.333333,0.000000,0.000000,28.618965,77.191500,Street_w_ID_-13560345653226,2,28,6.88
1,-13560345653148,25.333333,13.666667,8.000000,7.333333,8.000000,4.333333,3.000000,5.000000,23.333333,...,2.000000,1.000000,4.333333,7.666667,28.612253,77.249280,Bhairon Marg,2,60,50.26
2,-13560345652995,15.666667,13.000000,8.666667,6.666667,5.333333,4.000000,8.333333,22.000000,36.333333,...,98.000000,58.333333,33.000000,25.333333,28.545775,77.249540,Outer Ring Road,2,45,5.70
3,-13560345652992,25.000000,11.000000,10.333333,9.333333,4.000000,7.000000,11.333333,25.333333,33.666667,...,84.000000,66.666667,35.333333,24.333333,28.568843,77.214100,Mahatma Gandhi Marg,1,60,9.13
4,-13560345652964,26.333333,19.666667,13.666667,12.666667,13.666667,13.000000,24.333333,44.000000,86.666667,...,88.666667,73.666667,60.000000,47.333333,28.643507,77.270663,Marginal Bund Road,3,50,4.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24933,13560345653439,129.000000,88.000000,72.000000,68.000000,72.000000,73.666667,108.666667,225.000000,252.333333,...,333.666667,263.666667,193.000000,172.333333,28.545176,77.248156,Outer Ring Road,2,50,242.28
24934,13560345653514,2.333333,3.000000,2.000000,1.000000,0.333333,1.000000,1.666667,2.000000,2.000000,...,2.666667,2.666667,4.333333,1.666667,28.569755,77.198385,Mahatma Gandhi Marg,4,45,4.78
24935,13560345653561,77.666667,64.000000,52.000000,36.666667,35.666667,31.000000,28.333333,70.666667,116.666667,...,192.666667,147.333333,116.666667,84.000000,28.545990,77.250220,Lala Lajpat Rai Marg,3,45,4.95
24936,13560345653573,2.333333,3.000000,2.000000,1.000000,0.333333,1.000000,1.666667,2.000000,2.000000,...,2.666667,2.666667,4.333333,1.666667,28.569710,77.198390,Mahatma Gandhi Marg,4,45,5.27


In [75]:
# Total of the `length` column for the third per-file frame
# (all_df is filled in the iteration order of `files`).
all_df[2]['length'].sum()

np.float64(1403784.2999999998)

In [22]:
# Peek at the second feature of `data`, i.e. of whichever file the loading loop read last.
data["features"][1]

{'type': 'Feature',
 'geometry': {'type': 'LineString',
  'coordinates': [[77.24376, 28.64596],
   [77.2437, 28.64601],
   [77.24362, 28.64611]]},
 'properties': {'segmentId': -13560111507837,
  'newSegmentId': '-00004e30-3400-0400-0000-0000008bad86',
  'speedLimit': 45,
  'frc': 6,
  'streetName': 'Ghata Masjid Road',
  'distance': 20.87,
  'segmentProbeCounts': [{'timeSet': 2, 'dateRange': 1, 'probeCount': 3},
   {'timeSet': 3, 'dateRange': 1, 'probeCount': 4},
   {'timeSet': 4, 'dateRange': 1, 'probeCount': 4},
   {'timeSet': 5, 'dateRange': 1, 'probeCount': 2},
   {'timeSet': 6, 'dateRange': 1, 'probeCount': 1},
   {'timeSet': 7, 'dateRange': 1, 'probeCount': 1},
   {'timeSet': 8, 'dateRange': 1, 'probeCount': 0},
   {'timeSet': 9, 'dateRange': 1, 'probeCount': 4},
   {'timeSet': 10, 'dateRange': 1, 'probeCount': 6},
   {'timeSet': 11, 'dateRange': 1, 'probeCount': 7},
   {'timeSet': 12, 'dateRange': 1, 'probeCount': 13},
   {'timeSet': 13, 'dateRange': 1, 'probeCount': 15},
   {'t

## Sanitation

The data needs to be exported to `.csv` in a clean, tidy format.

# Algorithm

In [None]:
def gmm(X: np.ndarray, max_iter:int, n_clusters:int, seed:int = 0):
    """Work-in-progress EM fit of a Gaussian mixture model to ``X``.

    Only the parameter initialisation and the allocation of the
    responsibility matrix are implemented so far; the E and M steps are
    still TODO, so the function currently returns ``None``.

    Parameters
    ----------
    X : np.ndarray
        Two-dimensional data matrix of shape ``(n, d)``.
    max_iter : int
        Number of EM iterations to run.
    n_clusters : int
        Number of mixture components ``k``.
    seed : int, default 0
        Seed for the random generator used to initialise the means.
    """
    n, d = X.shape
    k = n_clusters
    # Local generator: reproducible per call without touching global state.
    rng = np.random.default_rng(seed)
    # Uniform mixing weights.
    c_pi = (1 / k) * np.ones(k)
    # Placeholder random means, one row per component, matching the
    # (k, d, d) layout of the covariance stack below.
    c_mu = rng.standard_normal((k, d))
    # Identity covariance for every component.
    c_sigma = np.array([np.eye(d) for i in range(k)])

    for i in range(max_iter):
        # E step
        # Responsibilities, one row per sample and one column per component
        # (np.zeros takes the shape as a single tuple).
        cp_resp = np.zeros((n, k))
        # TODO: fill cp_resp, then implement the M step and return the fit.
        



    

In [37]:
# Broadcasting scratchpad: 5 samples with 3 features vs. 2 reference rows.
X = np.arange(1, 16).reshape(5, 3)
k = np.repeat([[1], [2]], 3, axis=1)

# (5, 1, 3) - (1, 2, 3) -> (5, 2, 3): every sample minus every reference row.
X[:, np.newaxis, :] - k[np.newaxis, :, :]

array([[[ 0,  1,  2],
        [-1,  0,  1]],

       [[ 3,  4,  5],
        [ 2,  3,  4]],

       [[ 6,  7,  8],
        [ 5,  6,  7]],

       [[ 9, 10, 11],
        [ 8,  9, 10]],

       [[12, 13, 14],
        [11, 12, 13]]])

In [5]:
# Row sums with keepdims=True keep the summed axis, giving a (2, 1) column instead of a flat (2,) vector.
np.array([[1,2],[1,1]]).sum(axis=1, keepdims=True)


array([[3],
       [2]])

# Graphs and Interpretations