In [2]:
# Notebook adapted from linear regression notebook from the Python Data Science Handbook
# Modified by: Gábor Major
# Last Modified date: 2024-10-08

In [4]:
# Imports
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
import numpy as np
import pandas as pd

In [20]:
# Import raw data
# To use already cleaned data skip down 6 cells
# Data from: https://data.gov.ie/dataset/dcc-scats-detector-volume-jan-jun-2023
traffic_data_list = pd.read_csv('data/SCATSFebruary2023.csv', sep=',', header=0, usecols=[0, 1, 2, 4, 5])

In [21]:
print(traffic_data_list)
print(type(traffic_data_list))

               End_Time Region  Site  Sum_Volume  Avg_Volume
0        20230228060000  CCITY   782           0           0
1        20230228060000  CCITY   782           0           0
2        20230228060000  CCITY   782           0           0
3        20230228060000  CCITY   782           0           0
4        20230228060000  CCITY   782           0           0
...                 ...    ...   ...         ...         ...
9430335  20230228060000    IRE   745          59           4
9430336  20230228060000    IRE   745          18           1
9430337  20230228060000    IRE   745          20           1
9430338  20230228060000    IRE   745          26           2
9430339  20230228060000    IRE   745          41           3

[9430340 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>


In [22]:
# Function for processing raw file
def sum_site_number_vaues(site_number, volume_list):
    specific_site_data = volume_data.loc[volume_data['Site'] == site_number]
    
    start_row_index = 0
    previous_time = specific_site_data.iloc[0, 0]
    region_code = specific_site_data.iloc[0, 1]
    row_index = 0
    
    for _, row in specific_site_data.iterrows():
        if row['End_Time'] != previous_time:
            volume_list.append([
                int(previous_time),
                region_code,
                int(site_number),
                int(specific_site_data.iloc[start_row_index:row_index, 3:4].sum().iloc[0]),
                int(specific_site_data.iloc[start_row_index:row_index, 4:5].sum().iloc[0])
            ])
            
            start_row_index = row_index
            previous_time = row['End_Time']
        
        row_index += 1

In [6]:
from multiprocessing import Process, Manager
# Sum up data for each camera at each site
summed_traffic_volume_list = Manager().list()

site_numbers = traffic_data_list['Site'].unique()
process_list = []
counter = 0

for number in site_numbers:
    process = Process(target=sum_site_number_vaues, args=(number,summed_traffic_volume_list,))
    process_list.append(process)
    process.start()
    counter += 1
    if counter % 100 == 0:
        print(counter)

for process in process_list:
    process.join()

100
200
300
400
500
600
700
800
900


In [12]:
print(len(summed_traffic_volume_list))

611851


In [17]:
# Save cleaned file
import csv
columns_names = ['End_Time', 'Region', 'Site', 'Sum_Volume', 'Avg_Volume']
with open('data/summed_data.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(columns_names)
    writer.writerows(summed_traffic_volume_list)

In [5]:
# Import cleaned data
cleaned_data = pd.read_csv('data/summed_data.csv', sep=',', header=0)
print(cleaned_data)

              End_Time Region  Site  Sum_Volume  Avg_Volume
0       20230228060000  CCITY   782           0           0
1       20230228050000  CCITY   782          90           7
2       20230228040000  CCITY   782         194          15
3       20230228030000  CCITY   782         121           9
4       20230228060000  CCITY   796         266          18
...                ...    ...   ...         ...         ...
611846  20230228110000    IRE  6381          86           4
611847  20230228100000    IRE  6381         105           5
611848  20230228090000    IRE  6381         133           8
611849  20230228080000    IRE  6381          74           3
611850  20230228070000    IRE  6381          65           3

[611851 rows x 5 columns]


In [None]:
# Categorise data
time_and_volume = []
region_and_volume = []

In [None]:
# Convert to numpy data
x = input_data[0].to_numpy()
y = input_data[1].to_numpy()
plt.scatter(x, y)

In [None]:
# Create 7-th degree polynomial model
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
poly_model = make_pipeline(PolynomialFeatures(7),
                           LinearRegression())

In [None]:
# Fit data
poly_model.fit(x, y)

In [None]:
# Create test set
xfit = np.linspace(0, 10, 1000)
yfit = poly_model.predict(xfit[:, np.newaxis])

In [None]:
# Plot data
plt.scatter(x, y)
plt.plot(xfit, yfit);