## Baseline model development
This notebook was used to develop the baseline model (baseline_model.py). 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

In [4]:
# Used prepared data from 23.10.2024
train_data_path = '/Users/finneyer/Documents/HSLU/Semester 3/DSPRO1/Projektarbeit/Data/20241023/London_UTD19_train_Sniper_0.csv'
test_data_path = '/Users/finneyer/Documents/HSLU/Semester 3/DSPRO1/Projektarbeit/Data/20241023/London_UTD19_test_Sniper_0.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
train_data = train_data.drop('day', axis=1)
test_data = test_data.drop('day', axis=1)

In [13]:


def weekday_to_interval(df):
    weekday_to_num = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    seconds_per_day = 86400
    df['interval'] = df.apply(lambda row: row['interval'] + (weekday_to_num[row['weekday']] * seconds_per_day), axis=1)
    df = df.drop(columns=['weekday'])
    return df

def get_random_prediction(X_test, y_train):

    Q1 = y_train.quantile(0.25)
    Q3 = y_train.quantile(0.75)
    
    rand_lower_bound = int(np.round(Q1,0))
    rand_higher_bound = int(np.round(Q3,0))

    np.random.seed(2)
    random_ints = []
    for _ in range(X_test.shape[0]):
        if (rand_lower_bound == rand_higher_bound):
            random_ints.append(rand_lower_bound)
        else:
            random_ints.append(np.random.randint(rand_lower_bound, rand_higher_bound))
            
    
    return pd.DataFrame(random_ints, columns=['traffic'])

def random_traffic_prediction(train_data_set, test_data_set, sensors):
    
    train_detid_dfs = {detid: data for detid, data in train_data_set.groupby('detid')}
    test_detid_dfs = {detid: data for detid, data in test_data_set.groupby('detid')}
    
    sum_explained_variance = 0
    sum_mean_absolute_error = 0
    sum_mean_squared_error = 0
    sum_median_absolute_error = 0
    sum_r2 = 0
    counter = 0

    for sensor in sensors:

        train_data = train_detid_dfs[sensor]
        train_data = train_data.drop('detid', axis=1)
        train_data = weekday_to_interval(train_data)
        
        test_data = test_detid_dfs[sensor]
        test_data = test_data.drop('detid', axis=1)
        test_data = weekday_to_interval(test_data)

        # X_train = train_data.drop(['traffic'], axis=1)
        y_train = train_data['traffic']
        
        X_test = test_data.drop(['traffic'], axis=1)
        y_test = test_data['traffic']

        y_pred = get_random_prediction(X_test, y_train)

        # plt.figure(figsize=(20,6))
        # plt.scatter(X_test, y_test, alpha=0.5, color='red', label='Test data')
        # plt.scatter(X_test, y_pred, alpha=0.5, color='blue', label='Random Prediction')
        # plt.legend()
        # plt.show()

        sum_explained_variance += metrics.explained_variance_score(y_test, y_pred)
        sum_mean_absolute_error += metrics.mean_absolute_error(y_test, y_pred) 
        sum_mean_squared_error += metrics.mean_squared_error(y_test, y_pred) 
        sum_median_absolute_error += metrics.median_absolute_error(y_test, y_pred)
        sum_r2 += metrics.r2_score(y_test, y_pred)
        
        counter += 1
        print(counter , ' of ', len(sensors), ' sensors done')
    print ('Mean of explained variance: ' , sum_explained_variance / len(sensors))
    print ('Mean of mean absolute_error: ' , sum_mean_absolute_error / len(sensors))
    print ('Mean of mean squared error: ' , sum_mean_squared_error / len(sensors))
    print ('Mean of median absolute error: ' , sum_median_absolute_error / len(sensors))
    print ('Mean of r2: ' , sum_r2 / len(sensors))
    
    return sum_explained_variance / len(sensors), sum_mean_absolute_error / len(sensors), sum_mean_squared_error / len(sensors), sum_median_absolute_error / len(sensors), sum_r2 / len(sensors)


            
        

In [14]:
sensors = random_elements = np.random.choice(train_data['detid'], size=10, replace=False)
random_traffic_prediction(train_data, test_data, train_data['detid'].unique())



1  of  2179  sensors done
2  of  2179  sensors done
3  of  2179  sensors done
4  of  2179  sensors done
5  of  2179  sensors done
6  of  2179  sensors done
7  of  2179  sensors done
8  of  2179  sensors done
9  of  2179  sensors done
10  of  2179  sensors done
11  of  2179  sensors done
12  of  2179  sensors done
13  of  2179  sensors done
14  of  2179  sensors done
15  of  2179  sensors done
16  of  2179  sensors done
17  of  2179  sensors done
18  of  2179  sensors done
19  of  2179  sensors done
20  of  2179  sensors done
21  of  2179  sensors done
22  of  2179  sensors done
23  of  2179  sensors done
24  of  2179  sensors done
25  of  2179  sensors done
26  of  2179  sensors done
27  of  2179  sensors done
28  of  2179  sensors done
29  of  2179  sensors done
30  of  2179  sensors done
31  of  2179  sensors done
32  of  2179  sensors done
33  of  2179  sensors done
34  of  2179  sensors done
35  of  2179  sensors done
36  of  2179  sensors done
37  of  2179  sensors done
38  of  21

(-0.16045715248276896,
 5.551045666864425,
 69.87083396896507,
 4.744837081229922,
 -0.23990156085661596)