# Description of the Notebook
In this notebook we did some investigation of the UTD-19 dataset

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
import sys
import os

# Add the directory containing the module to the system path
module_path = os.path.abspath(os.path.join('C:\\Users\\samue\\OneDrive\\AIML\\HS2024\\Data Sicence Projekt\\HSLU_DSPRO1_TrafficStatus\\data'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import the module
import data_engineering_library as dlib

In [3]:
# Load the data form seperate dictionary, because the data is too big to load onto github
# Dataload restricted to 1 million rows, because of speed and memory issues
# Just for testing purposes
dataframe_London_UTD19 = dlib.load_data(path=r"C:\Users\samue\OneDrive\AIML\HS2024\Data Sicence Projekt\Data\London_UTD19.csv", nrows=1000000)
dataframe_London_UTD19.head()

Unnamed: 0,day,interval,detid,flow,occ,error,city,speed
0,2015-05-16,0,EAST_N04/161x1,144.0,0.028333,0.0,london,
1,2015-05-16,300,EAST_N04/161x1,204.0,0.04,0.0,london,
2,2015-05-16,600,EAST_N04/161x1,216.0,0.039167,0.0,london,
3,2015-05-16,900,EAST_N04/161x1,168.0,0.029167,0.0,london,
4,2015-05-16,1200,EAST_N04/161x1,144.0,0.024167,0.0,london,


In [4]:
anomalies = dlib.load_data(path=r"C:\Users\samue\OneDrive\AIML\HS2024\Data Sicence Projekt\Data\Datasets\Anomalies_18.10.2024.csv")
anomalies.head()

Unnamed: 0,detid
0,CNTR_N00/005b1
1,CNTR_N00/005g1
2,CNTR_N00/005g2
3,CNTR_N00/005x1
4,CNTR_N00/005x2


In [5]:
dataframe_detectors = dlib.load_data(path=r"C:\Users\samue\OneDrive\AIML\HS2024\Data Sicence Projekt\Data\London_detectors.csv")
dataframe_detectors.head()

Unnamed: 0,detid,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
0,EAST_N04/161x1,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929
1,EAST_N04/161y1,0.103679,0.063417,primary,Eastway,,london,1.0,5091.0,-0.020899,51.550704
2,EAST_N04/162a1,0.260623,0.117906,secondary,Homerton Road,,london,1.0,5083.0,-0.022649,51.550907
3,EAST_N04/162a2,0.216874,0.117942,secondary,Homerton Road,,london,1.0,5084.0,-0.022617,51.55088
4,EAST_N04/163f1,0.344754,0.329789,primary,Eastway,,london,1.0,5092.0,-0.019288,51.552281


In [6]:
dataframe_links = dlib.load_data(path=r"C:\Users\samue\OneDrive\AIML\HS2024\Data Sicence Projekt\Data\London_links.csv")
dataframe_links.head()

Unnamed: 0,long,lat,order,piece,linkid,group,citycode
0,-0.073746,51.509116,1,1,0,0.1,london
1,-0.073782,51.508952,2,1,0,0.1,london
2,-0.073879,51.508815,3,1,0,0.1,london
3,-0.073941,51.50869,4,1,0,0.1,london
4,-0.074041,51.508261,5,1,0,0.1,london


In [7]:
dataframe_anomalies = dataframe_detectors[dataframe_detectors['detid'].isin(anomalies['detid'])]
dataframe_anomalies.head()

Unnamed: 0,detid,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
0,EAST_N04/161x1,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929
1,EAST_N04/161y1,0.103679,0.063417,primary,Eastway,,london,1.0,5091.0,-0.020899,51.550704
3,EAST_N04/162a2,0.216874,0.117942,secondary,Homerton Road,,london,1.0,5084.0,-0.022617,51.55088
4,EAST_N04/163f1,0.344754,0.329789,primary,Eastway,,london,1.0,5092.0,-0.019288,51.552281
5,EAST_N04/237c2,0.441917,0.075087,primary,Eastway,,london,1.0,3485.0,-0.023171,51.550043


In [8]:
dataframe_without_anomalies = dataframe_detectors[~dataframe_detectors['detid'].isin(anomalies['detid'])]
dataframe_without_anomalies.head()

Unnamed: 0,detid,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
2,EAST_N04/162a1,0.260623,0.117906,secondary,Homerton Road,,london,1.0,5083.0,-0.022649,51.550907
7,EAST_N04/161a1,0.063615,0.03903,secondary,Homerton Road,,london,1.0,5086.0,-0.020662,51.551381
10,EAST_N04/161e1,0.084818,0.026256,secondary,Homerton Road,,london,1.0,5087.0,-0.020714,51.551375
12,EAST_N04/162c1,0.077,0.022373,secondary,Homerton Road,,london,1.0,5089.0,-0.020858,51.551195
14,EAST_N17/006c1,0.177435,0.112597,tertiary,Barking Road,,london,1.0,3325.0,0.02992,51.524791


In [9]:
unique_days = dataframe_London_UTD19['day'].unique()
unique_days = pd.to_datetime(unique_days)
unique_days

DatetimeIndex(['2015-05-16', '2015-05-15', '2015-05-17', '2015-05-18',
               '2015-05-19', '2015-05-20', '2015-05-21', '2015-05-22',
               '2015-05-23', '2015-10-01', '2015-09-25', '2015-09-26',
               '2015-09-27', '2015-09-28', '2015-09-29', '2015-09-30',
               '2016-05-16', '2016-05-17', '2016-05-18', '2016-05-19',
               '2016-05-20', '2016-05-21', '2016-05-22'],
              dtype='datetime64[ns]', freq=None)

In [10]:
unique_links = dataframe_detectors['linkid'].unique()
unique_links.size

5158

In [11]:
multiple_entities = dataframe_links.groupby('linkid').filter(lambda x: len(x) > 1)

# Display the result
print("Values in 'linkid' with more than one entity:")
multiple_entities.groupby('linkid').head()

Values in 'linkid' with more than one entity:


Unnamed: 0,long,lat,order,piece,linkid,group,citycode
0,-0.073746,51.509116,1,1,0,0.1,london
1,-0.073782,51.508952,2,1,0,0.1,london
2,-0.073879,51.508815,3,1,0,0.1,london
3,-0.073941,51.508690,4,1,0,0.1,london
4,-0.074041,51.508261,5,1,0,0.1,london
...,...,...,...,...,...,...,...
33255,-0.123317,51.486183,4,1,5222,5222.1,london
33256,-0.139346,51.474186,1,1,5223,5223.1,london
33257,-0.137544,51.473404,2,1,5223,5223.1,london
33258,-0.136600,51.472910,3,1,5223,5223.1,london


In [12]:
dataframe_london_merged = pd.merge(dataframe_London_UTD19, dataframe_detectors, on='detid', how='inner')
dataframe_london_merged.head()

Unnamed: 0,day,interval,detid,flow,occ,error,city,speed,length,pos,fclass,road,limit,citycode,lanes,linkid,long,lat
0,2015-05-16,0,EAST_N04/161x1,144.0,0.028333,0.0,london,,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929
1,2015-05-16,300,EAST_N04/161x1,204.0,0.04,0.0,london,,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929
2,2015-05-16,600,EAST_N04/161x1,216.0,0.039167,0.0,london,,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929
3,2015-05-16,900,EAST_N04/161x1,168.0,0.029167,0.0,london,,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929
4,2015-05-16,1200,EAST_N04/161x1,144.0,0.024167,0.0,london,,0.303585,0.261157,secondary,Homerton Road,,london,1.0,5082.0,-0.021497,51.550929


In [13]:
dataframe_london_merged = dataframe_london_merged.drop(columns=['city', 'fclass', 'road', 'limit', 'citycode', 'day', 'error', 'speed', 'long', 'lat', 'lanes'])
dataframe_london_merged.head()

Unnamed: 0,interval,detid,flow,occ,length,pos,linkid
0,0,EAST_N04/161x1,144.0,0.028333,0.303585,0.261157,5082.0
1,300,EAST_N04/161x1,204.0,0.04,0.303585,0.261157,5082.0
2,600,EAST_N04/161x1,216.0,0.039167,0.303585,0.261157,5082.0
3,900,EAST_N04/161x1,168.0,0.029167,0.303585,0.261157,5082.0
4,1200,EAST_N04/161x1,144.0,0.024167,0.303585,0.261157,5082.0


In [14]:
unique_detectors = dataframe_london_merged['detid'].unique()
random_detector = pd.Series(unique_detectors).sample(1).iloc[0]
print(f"Selected detector: {random_detector}")
dataframe_detector = dataframe_london_merged[dataframe_london_merged['detid'] == random_detector]
dataframe_detector = dataframe_detector.drop(columns=['detid'])
dataframe_detector.head()

Selected detector: CNTR_N01/160d1


Unnamed: 0,interval,flow,occ,length,pos,linkid
387586,0,420.0,0.064167,0.381844,0.099483,5104.0
387587,300,384.0,0.061667,0.381844,0.099483,5104.0
387588,600,348.0,0.056667,0.381844,0.099483,5104.0
387589,900,336.0,0.050833,0.381844,0.099483,5104.0
387590,1200,312.0,0.0475,0.381844,0.099483,5104.0


In [15]:
dataframe_london_merged['detid_numeric'], unique_detids = pd.factorize(dataframe_london_merged['detid'])

#dataframe_london_merged['detid'] = pd.DataFrame({'detid': unique_detids, 'detid_numeric': range(len(unique_detids))})
dataframe_london_merged = dataframe_london_merged.drop(columns=['detid'])
dataframe_london_merged.head()

Unnamed: 0,interval,flow,occ,length,pos,linkid,detid_numeric
0,0,144.0,0.028333,0.303585,0.261157,5082.0,0
1,300,204.0,0.04,0.303585,0.261157,5082.0,0
2,600,216.0,0.039167,0.303585,0.261157,5082.0,0
3,900,168.0,0.029167,0.303585,0.261157,5082.0,0
4,1200,144.0,0.024167,0.303585,0.261157,5082.0,0


In [16]:
dataframe_london_merged.corr()

Unnamed: 0,interval,flow,occ,length,pos,linkid,detid_numeric
interval,1.0,0.203812,0.078225,-0.000173,-6.1e-05,1.6e-05,-0.000337
flow,0.203812,1.0,-0.228887,0.126306,0.235641,-0.148815,0.158924
occ,0.078225,-0.228887,1.0,-0.148653,-0.173764,0.059235,-0.00694
length,-0.000173,0.126306,-0.148653,1.0,0.681104,-0.32123,0.276067
pos,-6.1e-05,0.235641,-0.173764,0.681104,1.0,-0.369981,0.282726
linkid,1.6e-05,-0.148815,0.059235,-0.32123,-0.369981,1.0,-0.162234
detid_numeric,-0.000337,0.158924,-0.00694,0.276067,0.282726,-0.162234,1.0


In [17]:
dataframe_weather = dlib.load_data(path=r"C:\Users\samue\OneDrive\AIML\HS2024\Data Sicence Projekt\Data\London_UTD19_weather.csv")
dataframe_weather.head()

Unnamed: 0,rain,cloud_cover,date_string,interval
0,0.0,87.3,2015-05-16,0
1,0.0,74.4,2015-05-16,3600
2,0.0,100.0,2015-05-16,7200
3,0.0,100.0,2015-05-16,10800
4,0.0,100.0,2015-05-16,14400


In [18]:
dataframe_london_merged = pd.merge(dataframe_london_merged, dataframe_weather, on='interval', how='inner')
dataframe_london_merged = dataframe_london_merged.drop(columns=['date_string'])
dataframe_london_merged.head()

Unnamed: 0,interval,flow,occ,length,pos,linkid,detid_numeric,rain,cloud_cover
0,0,144.0,0.028333,0.303585,0.261157,5082.0,0,0.0,87.3
1,0,144.0,0.028333,0.303585,0.261157,5082.0,0,0.0,100.0
2,0,144.0,0.028333,0.303585,0.261157,5082.0,0,0.0,33.9
3,0,144.0,0.028333,0.303585,0.261157,5082.0,0,0.0,88.5
4,0,144.0,0.028333,0.303585,0.261157,5082.0,0,0.0,3.3


In [19]:
dataframe_london_merged.corr()

Unnamed: 0,interval,flow,occ,length,pos,linkid,detid_numeric,rain,cloud_cover
interval,1.0,0.212492,0.085155,-0.0001433747,-5.1e-05,4.3e-05,-0.000338,0.037075,0.06964559
flow,0.212492,1.0,-0.227759,0.1258626,0.235109,-0.148698,0.15775,0.030631,0.04012952
occ,0.085155,-0.227759,1.0,-0.1485481,-0.173406,0.058839,-0.006975,0.018308,0.02488284
length,-0.000143,0.125863,-0.148548,1.0,0.681088,-0.321251,0.276034,-1.5e-05,-3.836981e-07
pos,-5.1e-05,0.235109,-0.173406,0.6810881,1.0,-0.369977,0.282736,-2e-06,4.279429e-06
linkid,4.3e-05,-0.148698,0.058839,-0.3212514,-0.369977,1.0,-0.162282,-2.3e-05,-5.663795e-06
detid_numeric,-0.000338,0.15775,-0.006975,0.276034,0.282736,-0.162282,1.0,3.4e-05,3.658774e-05
rain,0.037075,0.030631,0.018308,-1.48114e-05,-2e-06,-2.3e-05,3.4e-05,1.0,0.2149487
cloud_cover,0.069646,0.04013,0.024883,-3.836981e-07,4e-06,-6e-06,3.7e-05,0.214949,1.0


In [20]:
dataframe_anomalies = dlib.load_data(path=r"C:\Users\samue\OneDrive\AIML\HS2024\Data Sicence Projekt\Data\Anomalies.csv")
dataframe_anomalies.head()

Unnamed: 0,detid,IQR_out_of_bound,IQR_to_small,not_enough_data
0,CNTR_N00/005b1,False,False,True
1,CNTR_N00/005g1,False,False,True
2,CNTR_N00/005g2,False,False,True
3,CNTR_N00/005x1,False,False,True
4,CNTR_N00/005x2,False,False,True
