# LINEAR REGRESSION FOR TRAFFIC

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn import metrics
import pandas as pd
import numpy as np
import sys

### Generating new Traffic Density values for the test set

#### Importing checkpoint 5

In [2]:
# Load the checkpoint-5 test data (kaggle_test_traffic.csv, one directory up)
# into `checkpoint5` and preview the first five rows.
checkpoint5 = pd.read_csv('../kaggle_test_traffic.csv')
checkpoint5.head(5)

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist,traffic_density
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,1,0,0,0,0,2.32326,0.083333
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353,0.083333
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,5,1,0,0,0,0.618628,0.4
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033,0.4
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301,0.7


In [3]:
# Map boolean values to their binary equivalents (False -> 0, True -> 1).
# NOTE: the replacement is applied to every column of the frame, not only
# to `weekend`.
checkpoint5 = checkpoint5.replace({False: 0, True: 1})
checkpoint5

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist,traffic_density
0,2015-01-27 13:08:24.0000002,-73.973320,40.763805,-73.981430,40.743835,1,1,0,0,0,0,2.323260,0.083333
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353,0.083333
2,2011-10-08 11:53:44.0000002,-73.982524,40.751260,-73.979654,40.746139,1,5,1,0,0,0,0.618628,0.400000
3,2012-12-01 21:12:12.0000002,-73.981160,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033,0.400000
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301,0.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,-73.968124,40.796997,-73.955643,40.780388,6,6,1,0,0,0,2.124874,0.400000
9910,2015-01-12 17:05:51.0000001,-73.945511,40.803600,-73.960213,40.776371,6,0,0,0,1,0,3.270969,0.383333
9911,2015-04-19 20:44:15.0000001,-73.991600,40.726608,-73.789742,40.647011,6,6,1,0,0,0,19.183941,0.400000
9912,2015-01-31 01:05:19.0000005,-73.985573,40.735432,-73.939178,40.801731,6,5,1,0,0,0,8.343486,0.400000


#### Generating Traffic Density using the new weights

In [4]:
# Discard the traffic_density column that came with the CSV; it is recomputed below.
checkpoint5 = checkpoint5.drop(columns=['traffic_density'])

In [5]:
# Display the frame to confirm that traffic_density is no longer among the columns.
checkpoint5

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist
0,2015-01-27 13:08:24.0000002,-73.973320,40.763805,-73.981430,40.743835,1,1,0,0,0,0,2.323260
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353
2,2011-10-08 11:53:44.0000002,-73.982524,40.751260,-73.979654,40.746139,1,5,1,0,0,0,0.618628
3,2012-12-01 21:12:12.0000002,-73.981160,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301
...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,-73.968124,40.796997,-73.955643,40.780388,6,6,1,0,0,0,2.124874
9910,2015-01-12 17:05:51.0000001,-73.945511,40.803600,-73.960213,40.776371,6,0,0,0,1,0,3.270969
9911,2015-04-19 20:44:15.0000001,-73.991600,40.726608,-73.789742,40.647011,6,6,1,0,0,0,19.183941
9912,2015-01-31 01:05:19.0000005,-73.985573,40.735432,-73.939178,40.801731,6,5,1,0,0,0,8.343486


In [6]:
#function to calculate the equation
def calc_traffic_density(weekend, public_hol, peak_hr, hotspot, weights):
    """Compute the weighted traffic-density score for a single record.

    Three factors are derived from the indicators, each clamped to a
    minimum value, then multiplied by their weight and summed:

    * day   -- ``weekend or public_hol`` (first truthy value), floor 0.05
    * time  -- ``peak_hr``, floor 0.1
    * place -- ``hotspot``, floor 0.1

    Parameters
    ----------
    weekend, public_hol, peak_hr, hotspot : number
        Indicator values for the record (0/1 in the notebook's data).
    weights : sequence of numbers
        ``weights[1]``, ``weights[2]`` and ``weights[3]`` scale the day,
        time and place factors respectively. ``weights[0]`` is not read.

    Returns
    -------
    float
        Weighted sum of the three factors plus machine epsilon
        (``sys.float_info.epsilon``).
    """
    # Pull out the three coefficients up front; index 0 is intentionally skipped.
    coef_day, coef_time, coef_place = weights[1], weights[2], weights[3]
    tiny = sys.float_info.epsilon

    # Clamp every factor to its floor so that a zero indicator still contributes.
    day_factor = max((weekend or public_hol), 0.05)
    time_factor = max(peak_hr, 0.1)
    place_factor = max(hotspot, 0.1)

    day_term = coef_day * day_factor
    time_term = coef_time * time_factor
    place_term = coef_place * place_factor

    # Summation order (day, place, time, epsilon) is kept fixed so the
    # floating-point result is reproducible bit for bit.
    return day_term + place_term + time_term + tiny

In [7]:
#Scoring every row of the dataset
# Weight vector handed to calc_traffic_density, which reads elements 1-3
# (day, time, place) and ignores element 0.
weights = [8.32199613e-23, 3.33126264e-01, 3.33042499e-01,3.33102844e-01]

# Walk the four indicator columns in lockstep and collect one score per row,
# in row order. Building the list positionally (instead of list.insert with
# the DataFrame index label) keeps the result correct even if the frame's
# index is not a plain 0..n-1 range, and avoids constructing a Series per row
# as iterrows() does.
traffic_density = [
    calc_traffic_density(weekend=weekend, public_hol=holiday, peak_hr=peak_hour, hotspot=hotspot, weights=weights)
    for weekend, holiday, peak_hour, hotspot in zip(
        checkpoint5['weekend'],
        checkpoint5['holiday'],
        checkpoint5['peak_hour'],
        checkpoint5['hotspot'],
    )
]

In [8]:
# traffic_density

In [9]:
# Wrap the list of scores in a single-column DataFrame (the column gets the
# default label 0) and save it on its own as a CSV file.
traffic_density_df = pd.DataFrame(traffic_density)
traffic_density_df.to_csv('SGDR_traffic_density_test.csv')

In [10]:
# Attach the recomputed scores to the test frame; join() aligns the two
# frames on their index, and the new column keeps the label 0.
checkpoint5_sgdr_traffic = checkpoint5.join(traffic_density_df)
checkpoint5_sgdr_traffic

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist,0
0,2015-01-27 13:08:24.0000002,-73.973320,40.763805,-73.981430,40.743835,1,1,0,0,0,0,2.323260,0.083271
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353,0.083271
2,2011-10-08 11:53:44.0000002,-73.982524,40.751260,-73.979654,40.746139,1,5,1,0,0,0,0.618628,0.399741
3,2012-12-01 21:12:12.0000002,-73.981160,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033,0.399741
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301,0.699533
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,-73.968124,40.796997,-73.955643,40.780388,6,6,1,0,0,0,2.124874,0.399741
9910,2015-01-12 17:05:51.0000001,-73.945511,40.803600,-73.960213,40.776371,6,0,0,0,1,0,3.270969,0.383009
9911,2015-04-19 20:44:15.0000001,-73.991600,40.726608,-73.789742,40.647011,6,6,1,0,0,0,19.183941,0.399741
9912,2015-01-31 01:05:19.0000005,-73.985573,40.735432,-73.939178,40.801731,6,5,1,0,0,0,8.343486,0.399741


In [11]:
# Persist the test set with the recomputed traffic density, omitting the index column.
checkpoint5_sgdr_traffic.to_csv('testset_SGDR_traffic.csv', index=False)

#### Generating Traffic Density using the old weights

In [2]:
# Read the checkpoint-5 test data (kaggle_test_traffic.csv, one directory up)
# again into `checkpoint5`, so the frame once more holds the file's original
# columns, and preview the first five rows.
checkpoint5 = pd.read_csv('../kaggle_test_traffic.csv')
checkpoint5.head(5)

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist,traffic_density
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,1,0,0,0,0,2.32326,0.083333
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353,0.083333
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,5,1,0,0,0,0.618628,0.4
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033,0.4
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301,0.7


In [3]:
# Map boolean values to their binary equivalents (False -> 0, True -> 1).
# NOTE: the replacement is applied to every column of the frame, not only
# to `weekend`.
checkpoint5 = checkpoint5.replace({False: 0, True: 1})
checkpoint5

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist,traffic_density
0,2015-01-27 13:08:24.0000002,-73.973320,40.763805,-73.981430,40.743835,1,1,0,0,0,0,2.323260,0.083333
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353,0.083333
2,2011-10-08 11:53:44.0000002,-73.982524,40.751260,-73.979654,40.746139,1,5,1,0,0,0,0.618628,0.400000
3,2012-12-01 21:12:12.0000002,-73.981160,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033,0.400000
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301,0.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,-73.968124,40.796997,-73.955643,40.780388,6,6,1,0,0,0,2.124874,0.400000
9910,2015-01-12 17:05:51.0000001,-73.945511,40.803600,-73.960213,40.776371,6,0,0,0,1,0,3.270969,0.383333
9911,2015-04-19 20:44:15.0000001,-73.991600,40.726608,-73.789742,40.647011,6,6,1,0,0,0,19.183941,0.400000
9912,2015-01-31 01:05:19.0000005,-73.985573,40.735432,-73.939178,40.801731,6,5,1,0,0,0,8.343486,0.400000


In [4]:
#function to calculate the equation
def calc_traffic_density_v2(weekend, public_hol, peak_hr, hotspot, old_weights):
    """Compute the weighted traffic-density score for a single record.

    Three factors are derived from the indicators, each clamped to a
    minimum value, then multiplied by their weight and summed:

    * day   -- ``weekend or public_hol`` (first truthy value), floor 0.05
    * time  -- ``peak_hr``, floor 0.1
    * place -- ``hotspot``, floor 0.1

    Parameters
    ----------
    weekend, public_hol, peak_hr, hotspot : number
        Indicator values for the record (0/1 in the notebook's data).
    old_weights : sequence of numbers
        ``old_weights[1]``, ``old_weights[2]`` and ``old_weights[3]`` scale
        the day, time and place factors respectively. ``old_weights[0]`` is
        not read.

    Returns
    -------
    float
        Weighted sum of the three factors plus machine epsilon
        (``sys.float_info.epsilon``).
    """
    # Pull out the three coefficients up front; index 0 is intentionally skipped.
    coef_day, coef_time, coef_place = old_weights[1], old_weights[2], old_weights[3]
    tiny = sys.float_info.epsilon

    # Clamp every factor to its floor so that a zero indicator still contributes.
    day_factor = max((weekend or public_hol), 0.05)
    time_factor = max(peak_hr, 0.1)
    place_factor = max(hotspot, 0.1)

    day_term = coef_day * day_factor
    time_term = coef_time * time_factor
    place_term = coef_place * place_factor

    # Summation order (day, place, time, epsilon) is kept fixed so the
    # floating-point result is reproducible bit for bit.
    return day_term + place_term + time_term + tiny

In [5]:
# Previous ("old") weight vector. Four values are stored; the scoring function
# calc_traffic_density_v2 reads elements 1-3 and does not use element 0.
old_weights = [-7.60922359e-23,3.33125643e-01,3.33044434e-01,3.33102771e-01]

In [6]:
#Scoring every row of the dataset with the old weights
# Walk the four indicator columns in lockstep and collect one score per row,
# in row order. Building the list positionally (instead of list.insert with
# the DataFrame index label) keeps the result correct even if the frame's
# index is not a plain 0..n-1 range, and avoids constructing a Series per row
# as iterrows() does.
traffic_density_old = [
    calc_traffic_density_v2(weekend=weekend, public_hol=holiday, peak_hr=peak_hour, hotspot=hotspot, old_weights=old_weights)
    for weekend, holiday, peak_hour, hotspot in zip(
        checkpoint5['weekend'],
        checkpoint5['holiday'],
        checkpoint5['peak_hour'],
        checkpoint5['hotspot'],
    )
]

In [7]:
# traffic_density_old

In [8]:
# Wrap the old-weight scores in a single-column DataFrame (the column gets the
# default label 0) and save it on its own as a CSV file.
traffic_density_df_old = pd.DataFrame(traffic_density_old)
traffic_density_df_old.to_csv('SGDR_traffic_density_oldtest.csv')

In [9]:
# Attach the old-weight scores to the test frame; join() aligns the two frames
# on their index. The frame still carries the traffic_density column from the
# CSV at this point, so the result shows it side by side with the recomputed
# column labelled 0.
checkpoint5_sgdr_traffic_v2 = checkpoint5.join(traffic_density_df_old)
checkpoint5_sgdr_traffic_v2

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist,traffic_density,0
0,2015-01-27 13:08:24.0000002,-73.973320,40.763805,-73.981430,40.743835,1,1,0,0,0,0,2.323260,0.083333,0.083271
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,0,0,0,0,2.425353,0.083333,0.083271
2,2011-10-08 11:53:44.0000002,-73.982524,40.751260,-73.979654,40.746139,1,5,1,0,0,0,0.618628,0.400000,0.399740
3,2012-12-01 21:12:12.0000002,-73.981160,40.767807,-73.990448,40.751635,1,5,1,0,0,0,1.961033,0.400000,0.399740
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5,1,0,0,1,5.387301,0.700000,0.699533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,-73.968124,40.796997,-73.955643,40.780388,6,6,1,0,0,0,2.124874,0.400000,0.399740
9910,2015-01-12 17:05:51.0000001,-73.945511,40.803600,-73.960213,40.776371,6,0,0,0,1,0,3.270969,0.383333,0.383011
9911,2015-04-19 20:44:15.0000001,-73.991600,40.726608,-73.789742,40.647011,6,6,1,0,0,0,19.183941,0.400000,0.399740
9912,2015-01-31 01:05:19.0000005,-73.985573,40.735432,-73.939178,40.801731,6,5,1,0,0,0,8.343486,0.400000,0.399740


In [10]:
# Remove the traffic_density column that came with the CSV, leaving only the recomputed scores.
checkpoint5_sgdr_traffic_v2 = checkpoint5_sgdr_traffic_v2.drop(columns=['traffic_density'])

In [11]:
# Persist the test set with the old-weight traffic density, omitting the index column.
checkpoint5_sgdr_traffic_v2.to_csv('testset_SGDR_traffic_old.csv', index=False)