In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry.polygon import Polygon
from h3 import h3 
from shapely.geometry import shape
from shapely.geometry import Point
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [5]:
# Provide the list of columns you want to read
columns_to_read = ['start_time','start_census','start_latitude','start_longitude','date','weekday','hour','day','month']


# Parse the 'start_time' column as dates for better handling
# Select predefined specific columns to read from the CSV
df = pd.read_csv('taxi_2017_preprocessed_with_census.csv', parse_dates=['start_time'], usecols=columns_to_read)

In [None]:
# This effectively removes the time information and retains only the date
df['date'] = df['date'].apply(lambda d: d[:10])

In [None]:
# Define the conversion function using NumPy's vectorize
# This function converts latitude and longitude to H3 hexagons
get_h3_hex_id = np.vectorize(h3.geo_to_h3)

# Iterate over a range of H3 resolutions from 3 to 9
for i in range(3, 10):
    # Create a column name for the H3 hexagon values based on the resolution
    name = 'h3_with_reso:' + str(i)
    
    # Apply the 'get_h3_hex_id' function to the latitude and longitude columns
    # Using the current H3 resolution 'i', and store the result in a new column
    df[name] = get_h3_hex_id(df["start_latitude"].values, df["start_longitude"].values, i)

#optional, safe this dataframe
#df.to_csv('svm_preprocessed')

In [170]:
#start from here if you saved the above df
df=pd.read_csv('svm_preprocessed.csv')

#show current df
df

Unnamed: 0.1,Unnamed: 0,start_time,start_census,start_latitude,start_longitude,date,weekday,hour,day,month,h3_with_reso:5,h3_with_reso:6,h3_with_reso:7,h3_with_reso:8,h3_with_reso:9,h3_with_reso:10,h3_with_reso:3,h3_with_reso:4
0,0,2017-01-01 00:00:00,17031081800,41.893216,-87.637844,2017-01-01,6,0,1,1,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1e5fffff,892664c1e4fffff,8a2664c1e4effff,832664fffffffff,842664dffffffff
1,1,2017-01-01 00:00:00,17031020602,42.001571,-87.695013,2017-01-01,6,0,1,1,852664dbfffffff,862664d8fffffff,872664d8effffff,882664d8e1fffff,892664d8e1bffff,8a2664d8e18ffff,832664fffffffff,842664dffffffff
2,2,2017-01-01 00:00:00,17031160900,41.953582,-87.723452,2017-01-01,6,0,1,1,852664cbfffffff,862664ca7ffffff,872664ca4ffffff,882664ca6bfffff,892664ca6b7ffff,8a2664ca6b4ffff,832664fffffffff,842664dffffffff
3,3,2017-01-01 00:00:00,17031839100,41.880994,-87.632746,2017-01-01,6,0,1,1,852664c3fffffff,862664c1fffffff,872664c1affffff,882664c1a9fffff,892664c1a8bffff,8a2664c1a887fff,832664fffffffff,842664dffffffff
4,4,2017-01-01 00:00:00,17031281900,41.879255,-87.642649,2017-01-01,6,0,1,1,852664c3fffffff,862664c1fffffff,872664c1affffff,882664c1adfffff,892664c1acfffff,8a2664c1acd7fff,832664fffffffff,842664dffffffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21276739,21276739,2017-12-31 23:45:00,17031062200,41.944227,-87.655998,2017-12-31,6,23,31,12,852664c3fffffff,862664c17ffffff,872664c16ffffff,882664c163fffff,892664c1607ffff,8a2664c1604ffff,832664fffffffff,842664dffffffff
21276740,21276740,2017-12-31 23:45:00,17031081402,41.891972,-87.612945,2017-12-31,6,23,31,12,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1ebfffff,892664c1ea7ffff,8a2664c1ea57fff,832664fffffffff,842664dffffffff
21276741,21276741,2017-12-31 23:45:00,17031241400,41.906026,-87.675312,2017-12-31,6,23,31,12,852664cbfffffff,862664cafffffff,872664cacffffff,882664cac1fffff,892664cac03ffff,8a2664cac077fff,832664fffffffff,842664dffffffff
21276742,21276742,2017-12-31 23:45:00,17031081000,41.899602,-87.633308,2017-12-31,6,23,31,12,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1edfffff,892664c1ecfffff,8a2664c1ecdffff,832664fffffffff,842664dffffffff


In [3]:
#generate preprocessed dataset for the regression task
def createdataset(data, timebucket=1, useh3=False, h3reso=4, coordinateenc=True):
    # Check if the requested H3 resolution is too high when using H3 hexagons
    if h3reso > 9 and useh3:
        print('Resolution is too high, no need, just use census tract!')
        return
    
    # Create a copy of the input data
    df_raw = data.copy()
    
    #part 1, timebucket:
    # Create a new column 'new_hour' by categorizing the 'hour' column into time buckets
    df_raw['new_hour'] = pd.cut(df_raw['hour'], bins=range(-1, 25, timebucket), labels=False)
    
    #part 2 spatial unit
    # Decide whether to use H3 values or the 'start_census' attribute for spatial grouping
    if useh3:
        spatial = 'h3_with_reso:' + str(h3reso)
    else:
        spatial = 'start_census'
    
    #part 3: group by spatial-temporal resolution = calculate demand
    # Group the data by date, new_hour, spatial attribute,( weekday, day, and month are sublclasses of date)
    df_done = df_raw.groupby(['date', 'new_hour', spatial, 'weekday', 'day', 'month']).size().reset_index(name='demand_count')
    
    #part 4: encode the spatial information to coordinates (different approach for h3/census_tract)
    if useh3 and coordinateenc:
        # Calculate centroids using apply
        df_done['centroids'] = df_done[spatial].apply(lambda h: h3.h3_to_geo(h))
        df_done['lat'] = df_done['centroids'].apply(lambda centroid: centroid[0])
        df_done['lng'] = df_done['centroids'].apply(lambda centroid: centroid[1])
        df_done.drop(['centroids', spatial], axis=1, inplace=True)
    elif coordinateenc:
        # Load census tract data
        #reasoning: because in original df each census tract had two different centroid coordinates
        gdf = gpd.read_file('extra_dataframes/Census Tracts - 2010.geojson')
        gdf = gdf.drop(['statefp10', 'name10', 'commarea_n', 'namelsad10', 'commarea', 'tractce10', 'countyfp10'], axis=1)
        gdf['midpoint'] = gdf['geometry'].centroid
        gdf['lat'] = gdf['midpoint'].y
        gdf['lng'] = gdf['midpoint'].x
        gdf['geoid10'] = gdf['geoid10'].astype('int')
        missing = df_raw[df_raw['start_census'] == 17031770700]
        missing = missing.drop_duplicates(subset='start_census')
        # Merge census tract data and missing data
        df_done = df_done.merge(gdf[['geoid10', 'lat', 'lng']],
                               right_on='geoid10', left_on='start_census', how='left').drop(['geoid10'], axis=1)
        df_done = df_done.merge(missing[['start_census', 'start_latitude', 'start_longitude']], on='start_census', how='left')
        df_done['lat'].fillna(df_done['start_latitude'], inplace=True)
        df_done['lng'].fillna(df_done['start_longitude'], inplace=True)
        df_done.drop(['start_latitude', 'start_longitude', spatial], axis=1, inplace=True)
    
    # Return the resulting grouped data with demand counts
    return df_done



In [113]:
#test the function
createdataset(df, timebucket=2, useh3=True, h3reso=4, coordinateenc=True)

Unnamed: 0,date,new_hour,weekday,day,month,demand_count,lat,lng
0,2017-01-01,0,6,1,1,6,41.608438,-87.884807
1,2017-01-01,0,6,1,1,9570,41.924549,-87.558586
2,2017-01-01,0,6,1,1,55,41.974574,-88.119476
3,2017-01-01,1,6,1,1,5,41.608438,-87.884807
4,2017-01-01,1,6,1,1,9754,41.924549,-87.558586
...,...,...,...,...,...,...,...,...
13911,2017-12-31,9,6,31,12,203,41.974574,-88.119476
13912,2017-12-31,10,6,31,12,3367,41.924549,-87.558586
13913,2017-12-31,10,6,31,12,120,41.974574,-88.119476
13914,2017-12-31,11,6,31,12,2960,41.924549,-87.558586


In [4]:
#function for training data with different options
def trainsvmmodel(X, y, maxiter=1000, splitratio=0.2, kernel='linear'):
    #part 1: splitting and scaling data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=splitratio)
    
    #scaling input data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    #part 2: applying grid search to find best model parameters
    if kernel=='linear':
        param_grid = {'C': [0.1, 1, 10, 100, 1000], 'epsilon': [0.0001, 0.001, 0.1, 0.2, 0.5, 0.9]}
    else:
        param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma':('auto', 'scale'), 'epsilon': [0.0001, 0.001, 0.1, 0.2, 0.5, 0.9]}
    
    svr = SVR(kernel=kernel, max_iter=maxiter)
    grid_search = GridSearchCV(svr, param_grid, cv=5)
    grid_search.fit(X_train_scaled, y_train)
    
    best_params =grid_search.best_params_
    
    #part 3: create model with best parameters, then train the model
    if kernel=='linear':
        best_model = SVR(kernel=kernel,epsilon=best_params['epsilon'], C=best_params['C'], max_iter=maxiter)
    else:
        best_model = SVR(kernel=kernel,gamma=best_params['gamma'],epsilon=best_params['epsilon'], C=best_params['C'], max_iter=maxiter)
     
    # Train SVR with the best parameter
    best_model.fit(X_train_scaled, y_train)
    
    #part 4: evaluating Model
    y_pred = best_model.predict(X_test_scaled)
    mse = np.mean((y_pred - y_test)**2) #this is the MSE
    print(f'Final Loss: {mse:.4f}')
    r2 = r2_score(y_test, y_pred)
    print(f'R-squared: {r2:.4f}')
    mae=mean_absolute_error(y_test, y_pred)
    
    #returning y_pred and y_test to look at results, and r2 with final loss to evaluate the model
    return y_pred, y_test, r2, mse, mae



In [173]:
iterations=1000

In [174]:
pred = {}
truth = {}
r2_res={}
mse_res={}
mae_res={}

for time_bucket in [24,6,2,1]:
    
    columnname = f'h:{time_bucket}(_rbf)'
    print(columnname)  
    taxi_data = createdataset(df, timebucket=time_bucket, useh3=False, h3reso=resolution, coordinateenc=True)
    y = taxi_data['demand_count']
    X = taxi_data.drop(['demand_count', 'date'], axis=1)
        
    y_p, y_t,rtwo, mse, mae = trainsvmmodel(X, y, maxiter=iterations, splitratio=0.2, kernel='rbf')
        
    pred[columnname] = y_p
    truth[columnname] = y_t
    r2_res[columnname]=rtwo
    mse_res[columnname]=mse
    mae_res[columnname]=mae
    for resolution in [3,5,7]:
        columnname = f'h:{time_bucket}_res:{resolution}(_rbf)'
        print(columnname)
        taxi_data = createdataset(df, timebucket=time_bucket, useh3=True, h3reso=resolution, coordinateenc=True)
        y = taxi_data['demand_count']
        X = taxi_data.drop(['demand_count', 'date'], axis=1)
        
        y_p, y_t ,rtwo, mse,mae= trainsvmmodel(X, y, maxiter=iterations, splitratio=0.2, kernel='rbf')
        
        pred[columnname] = y_p
        truth[columnname] = y_t
        r2_res[columnname]=rtwo
        mse_res[columnname]=mse
        mae_res[columnname]=mae



h:24(_rbf)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 1453302.7647
R-squared: -0.7401
h:24_res:3(_rbf)
Final Loss: 42498322.0884
R-squared: 0.9421
h:24_res:5(_rbf)






















Final Loss: 81404181.1565
R-squared: 0.6266
h:24_res:7(_rbf)






















Final Loss: 8085429.0175
R-squared: 0.0864
h:6(_rbf)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 200962.2692
R-squared: -0.8937
h:6_res:3(_rbf)






















Final Loss: 6239722.4181
R-squared: 0.9066
h:6_res:5(_rbf)






















Final Loss: 1991301.1515
R-squared: 0.8865
h:6_res:7(_rbf)






















Final Loss: 3545299.1651
R-squared: -2.7340
h:2(_rbf)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 122870.2456
R-squared: -5.8347
h:2_res:3(_rbf)






















Final Loss: 1895206.2358
R-squared: 0.7488
h:2_res:5(_rbf)






















Final Loss: 597919.2931
R-squared: 0.7572
h:2_res:7(_rbf)






















Final Loss: 758933.8979
R-squared: -4.1275
h:1(_rbf)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 65472.0066
R-squared: -11.5092
h:1_res:3(_rbf)






















Final Loss: 568014.1780
R-squared: 0.7169
h:1_res:5(_rbf)






















Final Loss: 373551.0231
R-squared: 0.3843
h:1_res:7(_rbf)






















Final Loss: 193970.4917
R-squared: -3.5728


In [175]:
predlin = {}
truthlin = {}
r2_reslin={}
mse_reslin={}
mae_reslin={}

for time_bucket in [24,6,2,1]:
    
    columnname = f'h:{time_bucket}(_lin)'
    print(columnname)  
    taxi_data = createdataset(df, timebucket=time_bucket, useh3=False, h3reso=resolution, coordinateenc=True)
    y = taxi_data['demand_count']
    X = taxi_data.drop(['demand_count', 'date'], axis=1)
        
    y_p, y_t,rtwo, mse,mae = trainsvmmodel(X, y, maxiter=iterations, splitratio=0.2, kernel='linear')
        
    predlin[columnname] = y_p
    truthlin[columnname] = y_t
    r2_reslin[columnname]=rtwo
    mse_reslin[columnname]=mse
    mae_reslin[columnname]=mae
    for resolution in [3,5,7]:
        columnname = f'h:{time_bucket}_res:{resolution}(_lin)'
        print(columnname)
        taxi_data = createdataset(df, timebucket=time_bucket, useh3=True, h3reso=resolution, coordinateenc=True)
        y = taxi_data['demand_count']
        X = taxi_data.drop(['demand_count', 'date'], axis=1)
        
        y_p, y_t ,rtwo, mse,mae= trainsvmmodel(X, y, maxiter=iterations, splitratio=0.2, kernel='linear')
        
        predlin[columnname] = y_p
        truthlin[columnname] = y_t
        r2_reslin[columnname]=rtwo
        mse_reslin[columnname]=mse
        mae_reslin[columnname]=mae




h:24(_lin)



  gdf['midpoint'] = gdf['geometry'].centroid










Final Loss: 1490939.0153
R-squared: -0.6500
h:24_res:3(_lin)




Final Loss: 59807361.2167
R-squared: 0.9151
h:24_res:5(_lin)












Final Loss: 210770865.1143
R-squared: -0.0389
h:24_res:7(_lin)












Final Loss: 10190405.2202
R-squared: -0.0445
h:6(_lin)



  gdf['midpoint'] = gdf['geometry'].centroid










Final Loss: 359231.1849
R-squared: -2.2983
h:6_res:3(_lin)












Final Loss: 20088165.3099
R-squared: 0.6801
h:6_res:5(_lin)












Final Loss: 16243497.0516
R-squared: 0.1413
h:6_res:7(_lin)












Final Loss: 15739792.8067
R-squared: -14.8210
h:2(_lin)



  gdf['midpoint'] = gdf['geometry'].centroid










Final Loss: 115532.9109
R-squared: -5.2710
h:2_res:3(_lin)












Final Loss: 7629091.8753
R-squared: 0.0332
h:2_res:5(_lin)












Final Loss: 2354373.1786
R-squared: 0.0186
h:2_res:7(_lin)












Final Loss: 4338816.4052
R-squared: -28.6263
h:1(_lin)



  gdf['midpoint'] = gdf['geometry'].centroid










Final Loss: 54038.7788
R-squared: -9.0382
h:1_res:3(_lin)












Final Loss: 1921046.6336
R-squared: 0.0663
h:1_res:5(_lin)












Final Loss: 585395.2567
R-squared: 0.1094
h:1_res:7(_lin)












Final Loss: 374655.3288
R-squared: -7.5801


In [176]:
predpoly = {}
truthpoly = {}
r2_respoly={}
mse_respoly={}
mae_respoly={}

for time_bucket in [24,6,2,1]:
    
    columnname = f'h:{time_bucket}(_poly)'
    print(columnname)  
    taxi_data = createdataset(df, timebucket=time_bucket, useh3=False, h3reso=resolution, coordinateenc=True)
    y = taxi_data['demand_count']
    X = taxi_data.drop(['demand_count', 'date'], axis=1)
        
    y_p, y_t,rtwo, mae,mse = trainsvmmodel(X, y, maxiter=iterations, splitratio=0.2, kernel='poly')
        
    predpoly[columnname] = y_p
    truthpoly[columnname] = y_t
    r2_respoly[columnname]=rtwo
    mse_respoly[columnname]=mse
    mae_respoly[columnname]=mae

    for resolution in [3,5,7]:
        columnname = f'h:{time_bucket}_res:{resolution}(_poly)'
        print(columnname)
        taxi_data = createdataset(df, timebucket=time_bucket, useh3=True, h3reso=resolution, coordinateenc=True)
        y = taxi_data['demand_count']
        X = taxi_data.drop(['demand_count', 'date'], axis=1)
        
        y_p, y_t ,rtwo, mse,mae= trainsvmmodel(X, y, maxiter=iterations, splitratio=0.2, kernel='poly')
        
        predpoly[columnname] = y_p
        truthpoly[columnname] = y_t
        r2_respoly[columnname]=rtwo
        mse_respoly[columnname]=mse
        mae_respoly[columnname]=mae







h:24(_poly)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 873074.4377
R-squared: -0.0337
h:24_res:3(_poly)




Final Loss: 97295956.1697
R-squared: 0.8540
h:24_res:5(_poly)






















Final Loss: 223098773.3749
R-squared: -0.0346
h:24_res:7(_poly)






















Final Loss: 10634293.5876
R-squared: 0.0014
h:6(_poly)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 230967.2413
R-squared: -0.9584
h:6_res:3(_poly)






















Final Loss: 18028350.6390
R-squared: 0.7250
h:6_res:5(_poly)






















Final Loss: 9402870.6520
R-squared: 0.4986
h:6_res:7(_poly)






















Final Loss: 3632045.5809
R-squared: -2.9506
h:2(_poly)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 31121.7525
R-squared: -0.7031
h:2_res:3(_poly)






















Final Loss: 5625187.4438
R-squared: 0.2914
h:2_res:5(_poly)






















Final Loss: 1460186.9942
R-squared: 0.3483
h:2_res:7(_poly)






















Final Loss: 2186462.8746
R-squared: -13.8499
h:1(_poly)



  gdf['midpoint'] = gdf['geometry'].centroid




















Final Loss: 6025.3164
R-squared: -0.1284
h:1_res:3(_poly)






















Final Loss: 1013645.7578
R-squared: 0.4919
h:1_res:5(_poly)






















Final Loss: 521208.1596
R-squared: 0.1946
h:1_res:7(_poly)






















Final Loss: 122126.5770
R-squared: -1.8172


In [177]:

# Initialize a list to store data for the DataFrame
data = []

kernels = ['rbf', 'lin', 'poly']

# Loop through the kernels
for kernel in kernels:
    # Loop through time buckets
    for time_bucket in [24, 6, 2, 1]:
        for resolution in [3, 5, 7]:
            columnname = f'h:{time_bucket}_res:{resolution}(_{kernel})'
            if kernel=='lin':
                r2_sco = r2_reslin[columnname]
                mse = mse_reslin[columnname]
                mae=mae_reslin[columnname]
                test_mean = np.mean(predlin[columnname])  # Calculate the mean
                truth_mean = np.mean(truthlin[columnname])
                test_var = np.var(predlin[columnname])  # Calculate the mean
                truth_var = np.var(truthlin[columnname])
            elif kernel=='rbf':
                r2_sco = r2_res[columnname]
                mse = mse_res[columnname]
                mae=mae_res[columnname]
                test_mean = np.mean(pred[columnname])  # Calculate the mean
                truth_mean = np.mean(truth[columnname])
                test_var = np.var(pred[columnname])  # Calculate the mean
                truth_var = np.var(truth[columnname])
            elif kernel=='poly':
                r2_sco = r2_respoly[columnname]
                mse = mse_respoly[columnname]
                mae=mae_respoly[columnname]
                test_mean = np.mean(predpoly[columnname])  # Calculate the mean
                truth_mean = np.mean(truthpoly[columnname])
                test_var = np.var(predpoly[columnname])  # Calculate the mean
                truth_var = np.var(truthpoly[columnname])
            
            data.append({
                'Kernel': kernel,
                'Spatial-resolution': f'H3-{resolution}',
                'Timebucket': time_bucket,
                'R2-score': r2_sco,
                'MSE': mse,
                'MAE':mae,
                'Test_mean': test_mean,
                'Truth_mean': truth_mean,
                'Test_variance':test_var,
                'Truth_var':truth_var
            })

# Create the DataFrame
df_svmresults = pd.DataFrame(data)


# Display the DataFrame
df_svmresults



Unnamed: 0,Kernel,Spatial-resolution,Timebucket,R2-score,MSE,MAE,Test_mean,Truth_mean,Test_variance,Truth_var
0,rbf,H3-3,24,0.942104,42498320.0,4267.008376,30595.955005,31805.972603,597345100.0,734049700.0
1,rbf,H3-5,24,0.626626,81404180.0,3416.48076,4353.393576,7009.432247,42838980.0,218023300.0
2,rbf,H3-7,24,0.086404,8085429.0,929.799038,226.809359,683.744965,543371.2,8850119.0
3,rbf,H3-3,6,0.906583,6239722.0,1418.202894,7293.279699,7489.469178,59063530.0,66794340.0
4,rbf,H3-5,6,0.886515,1991301.0,842.088007,1636.272774,1853.714876,11740200.0,17546750.0
5,rbf,H3-7,6,-2.734008,3545299.0,1525.806462,-531.384465,221.21155,2557166.0,949462.1
6,rbf,H3-3,2,0.748783,1895206.0,1114.207162,2214.930681,2382.212329,3440726.0,7544110.0
7,rbf,H3-5,2,0.757213,597919.3,575.816652,525.248773,734.404571,1845379.0,2462728.0
8,rbf,H3-7,2,-4.127527,758933.9,736.892524,-90.423267,96.142104,704518.2,148011.7
9,rbf,H3-3,1,0.716875,568014.2,639.126966,1267.305676,1202.57645,860655.4,2006231.0


In [None]:
import pickle
#safe current variavle

# Create a dictionary to store the variables
variables = {"df_svmresults": df_svmresults, "predlin": predlin, "pred": pred, "predpoly": predpoly, "r2_res": r2_res, "r2_reslin": r2_reslin, "r2_respoly": r2_respoly, "var3": var3, "var3": var3, "var3": var3}

# Save the variables to a file using pickle
with open('saved_variables.pkl', 'wb') as f:
    pickle.dump(variables, f)


In [2]:
import pickle

# Load variables from the saved file
with open('saved_variables.pkl', 'rb') as f:
    loaded_variables = pickle.load(f)

# Now you can access the loaded variables
#loaded_var1 = loaded_variables["var1"]
#loaded_var2 = loaded_variables["var2"]
#loaded_var3 = loaded_variables["var3"]




In [36]:
# best R2-score
results = loaded_variables["df_svmresults"]
results[results['R2-score']==results['R2-score'].max()]

Unnamed: 0,Kernel,Spatial-resolution,Timebucket,R2-score,MSE,MAE,Test_mean,Truth_mean,Test_variance,Truth_var
0,rbf,H3-3,24,0.942104,42498320.0,4267.008376,30595.955005,31805.972603,597345100.0,734049700.0


In [38]:
# best MSE
print(min(loaded_variables["df_svmresults"].MSE))
results[results['MSE']==results['MSE'].min()]

122126.57696412962


Unnamed: 0,Kernel,Spatial-resolution,Timebucket,R2-score,MSE,MAE,Test_mean,Truth_mean,Test_variance,Truth_var
35,poly,H3-7,1,-1.81722,122126.576964,305.006829,310.449534,56.66791,13828.165565,43350.032839


In [37]:
# best MAE
print(min(loaded_variables["df_svmresults"].MAE))
results[results['MAE']==results['MAE'].min()]

305.0068288307094


Unnamed: 0,Kernel,Spatial-resolution,Timebucket,R2-score,MSE,MAE,Test_mean,Truth_mean,Test_variance,Truth_var
35,poly,H3-7,1,-1.81722,122126.576964,305.006829,310.449534,56.66791,13828.165565,43350.032839


In [4]:
# prediction values with linear kernel
loaded_variables["predlin"]

{'h:24(_lin)': array([1022.29734377, 1017.33651719,  958.14370323, ...,  854.19529576,
         843.95630652,  906.54151468]),
 'h:24_res:3(_lin)': array([ 4740.27469365,  3723.68436734,  3761.2672019 , 55183.38017392,
        56574.24932966,  4678.54372944,  3727.33968426,  4413.81770578,
         4527.38930033,  3202.96672543, 54992.52231078, 56478.93900252,
         3474.98806577, 56501.91588658, 56868.18725433, 55521.8480951 ,
        56435.46930573,  4204.58762938, 56630.09343164, 55902.37725864,
        55293.29645156,  4047.54633803,  4084.06887286,  3746.6612514 ,
         4293.0617404 , 55249.82675477, 54970.4947806 ,  3022.94855003,
         5033.97540945,  3936.56976066, 55749.10223006, 55554.47810416,
        55939.9600932 ,  3522.11307947,  3393.93553442, 56420.86335523,
         4340.18675411, 55811.78254815, 55249.93770062, 55722.35908324,
        56897.05100062, 56783.47940607, 55808.12723123,  3500.19649515,
         4545.65056774, 54665.95437706, 55793.52128074,  2729

In [5]:
# prediction values with rbf kernel
loaded_variables["pred"]

{'h:24(_rbf)': array([ 1289.79964863,  1017.3894627 ,  -334.40873995, ...,
        -1004.83611749,  1242.12148121,  1437.84455748]),
 'h:24_res:3(_rbf)': array([53594.49813376, 63578.79759202, 51835.34832461, 39322.21932809,
         3222.30348826,  3878.04466583, 52348.27709553, 52746.17405776,
        56881.44534815,  4080.40626176,  3951.37792992,  3857.44175383,
        45349.19696795, 46097.32064988, 64885.71331148,  5599.09784939,
         4332.4086437 , 36853.21770342, 43075.6845077 ,  3545.66721724,
         4254.85447899, 47672.01224087,  3976.66227683, 58376.74160329,
         4377.89689123, 56099.54798015,  4438.28335676, 35688.5945164 ,
        58408.24447788, 53452.32950758,  3818.41773077,  4382.93436767,
        51905.52391529, 44319.92833669, 59150.16840837,  3533.7082354 ,
        52680.12009691,  4538.93876697, 54787.70721139, 51607.82860078,
        60940.07924358,  4056.5298793 , 51631.86925485, 59565.28854351,
         5143.81551053, 51878.7088589 , 52251.78808974,

In [6]:
# prediction values with polynomial kernel
loaded_variables["predpoly"]

{'h:24(_poly)': array([493.48930922, 480.2942972 , 525.96229581, ..., 698.67468692,
        514.70679701, 512.42017271]),
 'h:24_res:3(_poly)': array([56197.41085749,  2646.13860734,  7204.54219905, 47549.24856975,
        48738.15250326,  9783.30545147, 46795.0812068 , 46395.77849029,
         1329.04241489, 50330.38607305, 52066.75953452, 51273.70047002,
        53498.61312549, 48289.95009893, 46824.1386337 ,  6676.00660523,
        45782.12456884, 51901.15149352, 52096.24197305, 45902.10859237,
         5218.24306556, 51719.57577047,  3550.47936714,  4453.13813037,
         8438.90005772, 51944.93002976, 52196.41102256, 48927.14234632,
        44109.25677501, -1362.05075973,  6650.79451387, 51704.0377637 ,
         6659.65703475,  7633.93341285,  5306.7470433 ,  3485.20104283,
        61730.06651224, 49604.62928753, 11653.04575724, 46138.24023558,
        47495.5375185 ,  4659.2821541 ,  7189.06619714, 43133.79167065,
        55204.56373728,  8291.78504516,  7008.82633052, 51947.833

In [17]:
# mean of R2 scores with linear kernel
sum(loaded_variables["r2_reslin"].values())/len(loaded_variables["r2_reslin"])

-4.150261188188284

In [16]:
# mean of R2 scores with rbf kernel
sum(loaded_variables["r2_res"].values())/len(loaded_variables["r2_res"])

-1.4597922458940158

In [18]:
# mean of R2 scores with polynomial kernel
sum(loaded_variables["r2_respoly"].values())/len(loaded_variables["r2_respoly"])

-1.0669321217216403