#### Requirements
Using Python 3.11

**Necessary**
- Numpy 
- Pandas
- Matplotlib
- Beautiful Soup
- Seaborn (nice graphs)

_Might Need_
- Scipy
- Scikit-Learn
- tqdm (decoration) 

`pip install numpy pandas matplotlib beautifulsoup4 lxml seaborn scipy scikit-learn tqdm --no-cache-dir`


In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import os
import csv
import random

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Data

## Inspection

Inspect the candidate datasets and decide which one to use.

Viable Datasets
- [Traffic flow in Romanian cities during and around lifting of COVID19 restrictions](https://data.mendeley.com/datasets/g64s8h9k57/3)
- [New Delhi Traffic Probe & Analytics 2024](https://www.kaggle.com/datasets/rawsi18/new-delhi-traffic-probe-and-analytics-2024)

In [2]:
def cleanUnicode(s):
    """Replace a fixed set of escaped non-ASCII sequences in ``s`` with ASCII text.

    The patterns are raw strings, so each one matches the *literal* escape
    text (a backslash followed by e.g. ``u0163``), not the decoded character.
    Anything not listed in the table is left untouched.

    Args:
        s: The string to clean.

    Returns:
        A copy of ``s`` with every listed escape sequence replaced.
    """
    # (escaped text, ASCII replacement), applied in this order.
    substitutions = (
        (r"\u0163", "t"),
        (r"\u015f", "s"),
        (r"\u0219", "s"),
        (r"\xe2", "a"),
        (r"\u015e", "S"),
        (r"\xf3k", "a"),
        (r"\xe9", "e"),
        (r"\u0103", "a"),
    )
    cleaned = s
    for escaped, plain in substitutions:
        cleaned = cleaned.replace(escaped, plain)
    return cleaned


In [3]:
import glob
# glob.glob("data/sample_data/*.xml")

In [4]:
def _round_hhmm_to_5min(hhmm):
    """Round an HHMM clock value to the nearest 5 minutes and return it as HHMM.

    Rounding the raw HHMM integer directly turns minutes 58/59 into the
    invalid value ``hh60``; working in minutes-of-day carries into the hour
    instead. The result is capped at 2355 so it never spills into the next day
    (the date comes from the file name and is not adjusted here).
    """
    hours, minutes = divmod(int(hhmm), 100)
    total_minutes = 5 * round((hours * 60 + minutes) / 5)
    total_minutes = min(total_minutes, 23 * 60 + 55)
    return (total_minutes // 60) * 100 + total_minutes % 60


def parse_data(folder_path):
    """Flatten every ``*.xml`` file in ``folder_path`` into ``dump_<city>.csv``.

    File names are expected to look like ``<city>_<date>_<HHMM>.xml``. For each
    ``fi`` element in a file, one CSV row is written containing:

    * ``date`` and ``time`` from the file name (``time`` rounded to 5 minutes);
    * ``de``, ``le``, ``pc``, ``qd`` from the element's ``tmc`` child;
    * ``fc``: the smallest ``fc`` attribute over the element's ``shp`` children;
    * ``la``, ``lo``: the mean of the first / second number of every
      ``a,b`` coordinate pair found in the ``shp`` text
      (presumably latitude / longitude -- confirm against the data source);
    * ``cn``, ``ff``, ``jf``, ``sp``, ``su``, ``ty`` from the ``cf`` child.

    The CSV is written to the current working directory and is named after the
    city prefix of the last file processed (files are handled in sorted order).

    Args:
        folder_path: Directory that contains the XML files.

    Raises:
        FileNotFoundError: If ``folder_path`` contains no ``.xml`` files.
    """
    header = ["date", "time", "de", "le", "pc", "qd", "fc", "la", "lo",
              "cn", "ff", "jf", "sp", "su", "ty"]

    # glob order is OS-dependent; sorting makes the row order reproducible.
    xml_files = sorted(glob.glob(os.path.join(folder_path, "*.xml")))
    if not xml_files:
        raise FileNotFoundError(f"No .xml files found in {folder_path!r}")

    rows = [header]
    city = None
    for xml_path in xml_files:
        # basename() copes with both "/" and "\\" separators.
        file_name = os.path.basename(xml_path)
        name_parts = file_name.split("_")
        city = name_parts[0]
        date = name_parts[1]
        time = _round_hhmm_to_5min(name_parts[2].split(".")[0])

        with open(xml_path, "r", encoding="utf-8") as handle:
            content = handle.read()
        soup = BeautifulSoup(content, "lxml")

        for road in soup.find_all("fi"):
            tmc = road.find("tmc")
            shps = road.find_all("shp")
            cf = road.find("cf")

            # Every field defaults to "" so a missing child leaves blanks.
            de = le = pc = qd = fc = la = lo = cn = ff = jf = sp = su = ty = ""

            if tmc:
                de = tmc.get("de", "N/A")
                le = tmc.get("le", "-1")
                pc = tmc.get("pc", "-1")
                qd = tmc.get("qd", "")

            if shps:
                fc = 6  # starting value; also the fallback when a shp has no fc
                coords = []
                for shp in shps:
                    fc = min(fc, int(shp.get("fc", 6)))
                    for point in shp.text.strip().split():
                        coords.append(tuple(map(float, point.split(","))))
                if coords:
                    la = sum(c[0] for c in coords) / len(coords)
                    lo = sum(c[1] for c in coords) / len(coords)
                else:
                    la = "N/A"
                    lo = "N/A"

            if cf:
                cn = cf.get("cn", "")
                ff = cf.get("ff", "")
                jf = cf.get("jf", "")
                sp = cf.get("sp", "")
                su = cf.get("su", "")
                ty = cf.get("ty", "")

            de = cleanUnicode(de)
            rows.append([date, time, de, le, pc, qd, fc, la, lo, cn, ff, jf, sp, su, ty])

    with open(f"dump_{city}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
    


        
                
                


    

In [5]:
# Parse every XML file in the sample folder; parse_data writes dump_<city>.csv
# into the current working directory.
parse_data("data/sample_data")

## Sanitization

We need the data as a `.csv` file in a tidy format.

In [6]:
# Load the parsed dump and trim stray whitespace from the header names.
df = pd.read_csv("./dump_bacau.csv", encoding="utf-8").rename(columns=str.strip)

In [7]:
# Derive the timestamp, a per-direction road key and the hour of day, then
# drop the raw columns that are no longer needed.
df = df.assign(
    # "date" + zero-padded "time" -> timestamp; unparseable values become NaT.
    datetime=lambda frame: pd.to_datetime(
        frame["date"].astype(str) + " " + frame["time"].astype(str).str.zfill(4),
        format="%d-%m-%Y %H%M",
        errors="coerce",
    ),
    # qd == "+" is labelled "up"; every other value (including missing) is "down".
    road_id=lambda frame: (
        frame["pc"].astype(str)
        + "-"
        + frame["qd"].eq("+").map({True: "up", False: "down"})
    ),
    hour=lambda frame: frame["datetime"].dt.hour,
)

clean_df = df.drop(columns=["date", "time", "sp", "ty", "pc", "qd"])


In [8]:
# Number of distinct road_id values in the cleaned frame.
clean_df['road_id'].nunique()

373

In [9]:
# Preview the first rows of the cleaned frame.
clean_df.head()

Unnamed: 0,de,le,fc,la,lo,cn,ff,jf,su,datetime,road_id,hour
0,DN2,0.04634,1,46.539543,26.916972,0.97,31.5,1.33671,23.66,2020-06-01 06:30:00,3763-up,6
1,Strada Bucegi,0.08475,1,46.539415,26.916098,0.96,40.4,3.14478,22.79,2020-06-01 06:30:00,18042-up,6
2,Strada Victor Babes,0.20342,1,46.539132,26.914116,0.93,20.0,0.0,28.16,2020-06-01 06:30:00,18041-up,6
3,Strada Aeroportului,0.45421,1,46.538463,26.909395,0.91,20.0,0.0,27.26,2020-06-01 06:30:00,18040-up,6
4,Strada Alexei Tolstoi,0.70002,1,46.539362,26.90484,0.77,29.4,0.0,30.86,2020-06-01 06:30:00,14603-up,6


In [10]:
# Average ff, jf and su for every (road, hour) combination.
hourly_group_keys = ["road_id", "de", "le", "la", "lo", "hour"]
hourly_agg_df = clean_df.groupby(hourly_group_keys, as_index=False).agg(
    ff=("ff", "mean"),
    jf=("jf", "mean"),
    su=("su", "mean"),
)

In [13]:
# Preview the first 30 rows of the hourly aggregates.
hourly_agg_df.head(30)

Unnamed: 0,road_id,de,le,la,lo,hour,ff,jf,su
0,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,0,46.9,0.496862,44.22
1,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,1,46.9,1.018533,40.655
2,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,2,46.9,0.16686,46.0
3,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,3,46.9,1.329482,38.98
4,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,4,46.9,1.078993,42.1425
5,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,5,46.9,1.284973,39.22
6,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,6,46.9,0.753665,43.2425
7,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,7,46.9,2.153598,33.785
8,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,8,46.9,2.610873,30.57
9,13639-down,Calea Marasesti,0.50153,46.557819,26.911716,9,46.9,3.019057,28.3675


In [22]:
# Go wide: one row per (road_id, de, le, ff) and one column per
# (variable, hour) pair. "mean" is pivot_table's default aggregation.
fin_df = pd.pivot_table(
    hourly_agg_df,
    index=["road_id", "de", "le", "ff"],
    columns="hour",
    values=["jf", "su"],
    aggfunc="mean",
)

In [24]:
# Flatten the (variable, hour) column MultiIndex into names such as "jf_7" and
# move the index levels back into ordinary columns.
# The guard makes the cell safe to re-run: once the columns are flat, running
# the flattening again would fail while unpacking the plain string names.
if isinstance(fin_df.columns, pd.MultiIndex):
    fin_df.columns = [f"{var}_{hour}" for var, hour in fin_df.columns]
    fin_df = fin_df.reset_index()

In [25]:
# Preview the first 10 rows of the wide per-road table.
fin_df.head(10)

Unnamed: 0,road_id,de,le,ff,jf_0,jf_1,jf_2,jf_3,jf_4,jf_5,...,su_14,su_15,su_16,su_17,su_18,su_19,su_20,su_21,su_22,su_23
0,13639-down,Calea Marasesti,0.50153,46.9,0.496862,1.018533,0.16686,1.329482,1.078993,1.284973,...,32.895,30.34,39.7925,33.4825,34.9625,41.8275,32.875,43.125,47.15,45.28
1,13639-up,Calea Marasesti,0.57544,43.3,0.06024,0.06024,0.03012,0.763127,1.100038,2.16381,...,24.625,21.7925,32.065,26.2,30.1525,40.0475,41.9125,48.61,43.0,38.8
2,13640-down,Strada Ionita Sandu Sturza,0.57544,42.5,0.1023,0.21313,0.1023,0.09409,1.789005,1.237302,...,22.5775,23.42,27.3725,25.1025,24.1025,39.8,39.315,41.3725,42.0,40.39
3,13640-up,Strada Ionita Sandu Sturza,0.43693,43.2,0.04025,0.04025,0.191218,0.569725,1.171325,0.964165,...,20.065,28.215,34.8825,30.3425,33.3025,40.2575,40.985,42.705,42.0,37.46
4,13641-down,DN2G,0.45878,39.4,0.08828,0.074978,0.047745,0.69207,1.590383,0.643452,...,20.22,22.5775,27.94,19.075,30.015,35.0,38.525,38.895,38.2375,34.91
5,13641-up,DN2G,0.39176,38.5,0.11293,0.066433,0.105152,0.46301,1.282075,1.240585,...,24.4175,19.6225,24.865,24.69,27.3725,33.8125,35.2625,40.3775,37.135,36.8725
6,13642-down,DN15,0.33088,39.6,0.05869,0.120457,0.186805,1.98613,1.334963,2.372248,...,17.5725,20.4325,19.5,26.31,29.9725,37.065,39.335,39.2575,39.34,39.01
7,13642-up,DN15,0.52862,44.2,0.0,0.0,0.155277,2.353575,2.875167,2.212077,...,23.5075,25.4075,34.4175,34.1275,42.745,41.4975,39.735,44.1,42.2575,41.6575
8,13643-down,Strada I. L. Caragiale,0.54963,44.8,0.049232,0.0,0.124367,2.068763,2.82552,2.751215,...,24.0675,28.7875,33.21,38.05,40.3275,44.3025,45.615,43.5925,45.045,41.5
9,13643-up,Strada I. L. Caragiale,0.78983,48.1,0.185713,0.031227,0.0,0.34554,0.0,0.43141,...,45.735,45.8375,48.45,51.7175,48.8225,48.0725,50.615,49.2575,45.65,47.1125


In [123]:
# Column dtypes and non-null counts of the wide table.
fin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 52 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   road_id  373 non-null    object 
 1   de       373 non-null    object 
 2   le       373 non-null    float64
 3   ff       373 non-null    float64
 4   jf_0     373 non-null    float64
 5   jf_1     373 non-null    float64
 6   jf_2     373 non-null    float64
 7   jf_3     373 non-null    float64
 8   jf_4     373 non-null    float64
 9   jf_5     373 non-null    float64
 10  jf_6     373 non-null    float64
 11  jf_7     373 non-null    float64
 12  jf_8     373 non-null    float64
 13  jf_9     373 non-null    float64
 14  jf_10    373 non-null    float64
 15  jf_11    373 non-null    float64
 16  jf_12    373 non-null    float64
 17  jf_13    373 non-null    float64
 18  jf_14    373 non-null    float64
 19  jf_15    373 non-null    float64
 20  jf_16    373 non-null    float64
 21  jf_17    373 non

## Preprocessing


# Algorithm

In [None]:
def gmm(X: np.ndarray, max_iter: int, n_clusters: int, seed: int = 0):
    """Fit a full-covariance Gaussian mixture model with the EM algorithm.

    Initialisation: equal mixing weights, means set to ``n_clusters`` distinct
    rows of ``X`` chosen with a generator seeded by ``seed``, and identity
    covariances. Exactly ``max_iter`` EM iterations are run (there is no
    convergence check).

    Args:
        X: Data of shape ``(n_samples, n_features)``.
        max_iter: Number of EM iterations to run.
        n_clusters: Number of mixture components ``k``.
        seed: Seed for the mean initialisation, so results are reproducible.

    Returns:
        Tuple ``(c_pi, c_mu, c_sigma, cp_resp)``:
            * ``c_pi``    -- mixing weights, shape ``(k,)``, summing to 1;
            * ``c_mu``    -- component means, shape ``(k, d)``;
            * ``c_sigma`` -- component covariances, shape ``(k, d, d)``;
            * ``cp_resp`` -- responsibilities from the last E step, shape
              ``(n, k)``, each row summing to 1 (uniform if ``max_iter`` is 0).

    Raises:
        ValueError: If ``X`` is not 2-D or ``n_clusters`` is not in ``[1, n]``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n, d = X.shape
    k = n_clusters
    if not 1 <= k <= n:
        raise ValueError("n_clusters must be between 1 and the number of samples")

    rng = np.random.default_rng(seed)
    # Small ridge added to every covariance so it stays positive definite
    # (needed for the Cholesky factorisation in the E step).
    ridge = 1e-6 * np.eye(d)

    c_pi = np.full(k, 1.0 / k)
    c_mu = X[rng.choice(n, size=k, replace=False)].copy()
    c_sigma = np.array([np.eye(d) for _ in range(k)])
    cp_resp = np.full((n, k), 1.0 / k)

    for _ in range(max_iter):
        # E step: log(pi_j * N(x_i | mu_j, sigma_j)) for every sample/component.
        log_prob = np.empty((n, k))
        for j in range(k):
            centered = X - c_mu[j]
            chol = np.linalg.cholesky(c_sigma[j])
            whitened = np.linalg.solve(chol, centered.T)  # shape (d, n)
            mahalanobis = np.sum(whitened ** 2, axis=0)
            log_det = 2.0 * np.sum(np.log(np.diag(chol)))
            log_prob[:, j] = np.log(c_pi[j]) - 0.5 * (
                d * np.log(2.0 * np.pi) + log_det + mahalanobis
            )
        # Subtract the row maximum before exponentiating to avoid underflow.
        log_prob -= log_prob.max(axis=1, keepdims=True)
        cp_resp = np.exp(log_prob)
        cp_resp /= cp_resp.sum(axis=1, keepdims=True)

        # M step: re-estimate weights, means and covariances.
        # The floor keeps an empty component from causing a division by zero.
        n_k = np.maximum(cp_resp.sum(axis=0), 1e-12)
        c_pi = n_k / n_k.sum()
        c_mu = (cp_resp.T @ X) / n_k[:, None]
        centered = X[:, None, :] - c_mu[None, :, :]  # shape (n, k, d)
        c_sigma = (
            np.einsum("nk,nki,nkj->kij", cp_resp, centered, centered)
            / n_k[:, None, None]
            + ridge
        )

    return c_pi, c_mu, c_sigma, cp_resp



    

In [37]:
# Toy check of the broadcasting pattern: subtract each of the 2 rows of k from
# each of the 5 rows of X, giving an array of shape (5, 2, 3).
X = np.arange(1, 16).reshape(5, 3)
k = np.array([[1] * 3, [2] * 3])

X[:, np.newaxis, :] - k[np.newaxis, :, :]

array([[[ 0,  1,  2],
        [-1,  0,  1]],

       [[ 3,  4,  5],
        [ 2,  3,  4]],

       [[ 6,  7,  8],
        [ 5,  6,  7]],

       [[ 9, 10, 11],
        [ 8,  9, 10]],

       [[12, 13, 14],
        [11, 12, 13]]])

# Graphs and Interpretations