In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Read the CSV file
df = pd.read_csv('dataformodeltraining.csv')

In [3]:
# df.columns

In [4]:
df

Unnamed: 0,LocationID,busyness,temp,precip,snow,vis,wind_spd,uv,day_of_week,is_holiday,day,month,hour,timeslot
0,4,16.5,10.6,0.0,0.0,13,1.6,0.0,5,False,1,1,0,Late Night
1,12,1.0,10.6,0.0,0.0,13,1.6,0.0,5,False,1,1,0,Late Night
2,13,17.5,10.6,0.0,0.0,13,1.6,0.0,5,False,1,1,0,Late Night
3,24,12.5,10.6,0.0,0.0,13,1.6,0.0,5,False,1,1,0,Late Night
4,41,13.5,10.6,0.0,0.0,13,1.6,0.0,5,False,1,1,0,Late Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495376,163,0.5,2.2,0.0,0.0,16,2.1,0.0,3,False,1,12,22,Late Night
495377,50,0.5,2.2,0.0,0.0,11,1.5,0.0,3,False,1,12,23,Late Night
495378,68,0.5,2.2,0.0,0.0,11,1.5,0.0,3,False,1,12,23,Late Night
495379,79,0.5,2.2,0.0,0.0,11,1.5,0.0,3,False,1,12,23,Late Night


In [5]:
df['vis'].unique()

array([13,  9, 16,  1, 12,  7,  6, 14, 11,  4, 10,  5, 15,  2,  8,  3,  0])

In [6]:
# Set the maximum value for 'vis' to be 10
df['vis'] = df['vis'].clip(upper=10)

In [7]:
df['vis'].unique()

array([10,  9,  1,  7,  6,  4,  5,  2,  8,  3,  0])

In [8]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values

LocationID     0
busyness       0
temp           0
precip         0
snow           0
vis            0
wind_spd       0
uv             0
day_of_week    0
is_holiday     0
day            0
month          0
hour           0
timeslot       0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# List of features to drop
features_to_drop = ['uv', 'snow', 'is_holiday']

# Drop the specified features from the DataFrame
df = df.drop(columns=features_to_drop)

# Separate the target feature
y = df['busyness']
X = df.drop('busyness', axis=1)

# Apply one-hot encoding to the 'timeslot' feature
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
timeslot_encoded = one_hot_encoder.fit_transform(X[['timeslot']])
timeslot_encoded_df = pd.DataFrame(timeslot_encoded, columns=one_hot_encoder.get_feature_names_out(['timeslot']))

# Drop the original 'timeslot' feature and add the encoded features
X = pd.concat([X.drop('timeslot', axis=1), timeslot_encoded_df], axis=1)

# # Convert boolean feature 'is_holiday' to int
# X['is_holiday'] = X['is_holiday'].astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()



Unnamed: 0,LocationID,temp,precip,vis,wind_spd,day_of_week,day,month,hour,timeslot_Afternoon,timeslot_Evening,timeslot_Late Night,timeslot_Morning
41569,107,22.8,0.0,10,2.6,5,18,6,2,0.0,0.0,1.0,0.0
236257,137,28.3,0.0,10,2.1,3,18,8,15,1.0,0.0,0.0,0.0
63156,4,19.4,0.0,10,1.5,5,13,8,4,0.0,0.0,1.0,0.0
156693,263,25.6,0.0,3,2.8,0,18,7,11,0.0,0.0,0.0,1.0
108056,166,14.4,0.0,10,2.1,6,23,10,8,0.0,0.0,0.0,1.0


### Now we start the training

In [12]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [13]:
# Instantiate the XGBRegressor model
model = XGBRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


In [14]:
# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Calculate R^2
r2 = r2_score(y_test, y_pred)

mae, mse, rmse, r2

(10.343983072463185, 249.1549561262777, 15.784643047160671, 0.9520860371797626)

### Use the Grid Search method to try different combinations of parameters within a given parameter range and evaluate the performance of the model on the validation set

In [15]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100],
    'min_child_weight': [1, 3],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # Use 3 folds in cross-validation
                           scoring='neg_root_mean_squared_error',  # Use RMSE as the scoring metric
                           verbose=2,  # Output messages
                           n_jobs=-1)  # Use all CPU cores

# Fit the grid search model
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [16]:
best_params = grid_search.best_params_
best_params

{'colsample_bytree': 0.8,
 'learning_rate': 0.15,
 'max_depth': 7,
 'min_child_weight': 1,
 'n_estimators': 100,
 'subsample': 0.6}

In [17]:
# Instantiate the XGBRegressor model with the best parameters
best_model = XGBRegressor(**best_params, random_state=42)

# Train the model on the full training data
best_model.fit(X_train, y_train)

In [18]:
# Make predictions
y_test_pred = best_model.predict(X_test)

# Make predictions on the training set
y_train_pred = best_model.predict(X_train)

# Calculate metrics on the training set
mae_train = mean_absolute_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)

print(f"Training set metrics:")
print(f"MAE: {mae_train}")
print(f"MSE: {mse_train}")
print(f"RMSE: {rmse_train}")
print(f"R^2: {r2_train}")

# Calculate metrics on the testing set (we've done this before)
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

print(f"\nTesting set metrics:")
print(f"MAE: {mae_test}")
print(f"MSE: {mse_test}")
print(f"RMSE: {rmse_test}")
print(f"R^2: {r2_test}")

Training set metrics:
MAE: 10.389295558686518
MSE: 260.2560888323172
RMSE: 16.13245451976596
R^2: 0.9499658608685109

Testing set metrics:
MAE: 10.343983072463185
MSE: 249.1549561262777
RMSE: 15.784643047160671
R^2: 0.9520860371797626


### Apply Cross-validation to see if any overfitting

In [19]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Instantiate the XGBRegressor model with the best parameters
best_model = XGBRegressor(**best_params, random_state=42)

# Perform 5-fold cross validation and calculate RMSE
rmse_scores = -cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')

# Perform 5-fold cross validation and calculate MAE
mae_scores = -cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Perform 5-fold cross validation and calculate MSE
mse_scores = -cross_val_score(best_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Perform 5-fold cross validation and calculate R^2
r2_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')

# Print the results
print(f"Cross-validated RMSE scores: {rmse_scores}")
print(f"Mean RMSE: {rmse_scores.mean()}")
print(f"Standard deviation of RMSE: {rmse_scores.std()}")

print(f"\nCross-validated MAE scores: {mae_scores}")
print(f"Mean MAE: {mae_scores.mean()}")
print(f"Standard deviation of MAE: {mae_scores.std()}")

print(f"\nCross-validated MSE scores: {mse_scores}")
print(f"Mean MSE: {mse_scores.mean()}")
print(f"Standard deviation of MSE: {mse_scores.std()}")

print(f"\nCross-validated R^2 scores: {r2_scores}")
print(f"Mean R^2: {r2_scores.mean()}")
print(f"Standard deviation of R^2: {r2_scores.std()}")

Cross-validated RMSE scores: [16.63837124 16.81156518 16.52871964 16.71517887 16.5443109 ]
Mean RMSE: 16.647629164651768
Standard deviation of RMSE: 0.10614879002330245

Cross-validated MAE scores: [10.87851887 10.92558069 10.75226126 10.80751651 10.58169842]
Mean MAE: 10.789115148598993
Standard deviation of MAE: 0.11944403309110434

Cross-validated MSE scores: [276.83539758 282.62872373 273.19857284 279.39720461 273.71422308]
Mean MSE: 277.15482436938754
Standard deviation of MSE: 3.5378959913303945

Cross-validated R^2 scores: [0.94628622 0.94601386 0.94726891 0.9467403  0.94727729]
Mean R^2: 0.9467173156400192
Standard deviation of R^2: 0.0005097128459908256


In [20]:
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml

# Fit the best model on the training data
best_model.fit(X_train, y_train)

# Make a PMML pipeline with the best model
pmml_pipeline = make_pmml_pipeline(best_model)

# Export the PMML pipeline to a .pmml file
sklearn2pmml(pmml_pipeline, "best_model_xgb.pmml")

### models for each location id

In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Create a one-hot encoder
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a preprocessor that applies the one-hot encoder to the 'timeslot' column
preprocessor = ColumnTransformer(transformers=[('one_hot', one_hot_encoder, ['timeslot'])], remainder='passthrough')

# Get a list of all unique location ids
location_ids = df['LocationID'].unique()

# Create a dictionary to store the models for each location id
models = {}

for id in location_ids:
    # Filter the data for the current location id
    df_location = df[df['LocationID'] == id]
    
    # Separate the target feature
    y = df_location['busyness']
    X = df_location.drop(['busyness', 'LocationID'], axis=1)

    # Create a pipeline that preprocesses the data and then trains a model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', XGBRegressor(random_state=42))])
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model on the preprocessed data
    pipeline.fit(X_train, y_train)
    
    # Store the model in the dictionary
    models[id] = pipeline
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate and print metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Location ID: {id}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R^2: {r2}")
    print("--------------------")




Location ID: 4
MAE: 2.2910122158664996
MSE: 10.082505399223065
RMSE: 3.175296112053656
R^2: 0.8693777395500083
--------------------




Location ID: 12
MAE: 1.0742507107773036
MSE: 2.349317161515694
RMSE: 1.5327482381381796
R^2: 0.6478785701480163
--------------------




Location ID: 13
MAE: 4.2212134480082995
MSE: 34.2890018858757
RMSE: 5.855681163270051
R^2: 0.9165149905205361
--------------------




Location ID: 24
MAE: 2.565743714663011
MSE: 11.661645925196208
RMSE: 3.4149152149352417
R^2: 0.8532341184720584
--------------------




Location ID: 41
MAE: 3.2023659981272403
MSE: 18.447415027850106
RMSE: 4.295045404631959
R^2: 0.8577243658070294
--------------------




Location ID: 42
MAE: 2.372913897266971
MSE: 9.724390228254387
RMSE: 3.118395457323267
R^2: 0.7236085115191377
--------------------




Location ID: 43
MAE: 8.916865219023043
MSE: 161.75878747380574
RMSE: 12.71844280852832
R^2: 0.9356474742256511
--------------------




Location ID: 45
MAE: 1.9729615184742386
MSE: 6.958121973042801
RMSE: 2.6378252355004115
R^2: 0.7488209285791279
--------------------
Location ID: 48
MAE: 11.346633047666902
MSE: 251.71844395814614
RMSE: 15.865637206180725
R^2: 0.9407610186740984
--------------------




Location ID: 50
MAE: 5.374441708558156
MSE: 51.793463969889494
RMSE: 7.196767605660856
R^2: 0.8987652043812158
--------------------




Location ID: 68
MAE: 9.467202450388145
MSE: 180.89209427850523
RMSE: 13.44961316464177
R^2: 0.9511292597246329
--------------------




Location ID: 74
MAE: 3.4038361918956115
MSE: 19.990495636552527
RMSE: 4.471073208587903
R^2: 0.8477883263280739
--------------------




Location ID: 75
MAE: 4.930152024062017
MSE: 46.46481954019429
RMSE: 6.816510803937326
R^2: 0.915435235247269
--------------------




Location ID: 79
MAE: 9.717981225259612
MSE: 232.56412773464288
RMSE: 15.250053368255564
R^2: 0.9631243233492537
--------------------




Location ID: 87
MAE: 4.013685148940794
MSE: 31.4563046724945
RMSE: 5.608592040119739
R^2: 0.9026556920893167
--------------------




Location ID: 88
MAE: 2.6878178785624476
MSE: 16.33623450414347
RMSE: 4.0418107951935935
R^2: 0.8114573911281705
--------------------




Location ID: 90
MAE: 6.644228325814046
MSE: 85.09849034819835
RMSE: 9.224884299989803
R^2: 0.9445642575688853
--------------------




Location ID: 100
MAE: 7.187818196691918
MSE: 93.89088292225385
RMSE: 9.689730797202461
R^2: 0.9331693431451937
--------------------




Location ID: 107
MAE: 9.4948583492605
MSE: 244.72307169456838
RMSE: 15.643627191114227
R^2: 0.9213137073025647
--------------------




Location ID: 113
MAE: 5.838559708142348
MSE: 65.91417831465547
RMSE: 8.118754726844227
R^2: 0.9484512490796824
--------------------




Location ID: 114
MAE: 5.971997408478761
MSE: 70.16775569042906
RMSE: 8.376619586111635
R^2: 0.9359936353697148
--------------------




Location ID: 116
MAE: 1.8929253562816284
MSE: 6.1564475511420405
RMSE: 2.481218964771558
R^2: 0.6360243636894614
--------------------




Location ID: 125
MAE: 3.5327685324553806
MSE: 22.110936066798708
RMSE: 4.702226713674992
R^2: 0.8653324217846907
--------------------




Location ID: 127
MAE: 0.763249639885036
MSE: 1.0467498876264667
RMSE: 1.023107955020616
R^2: 0.45936484417094225
--------------------




Location ID: 137
MAE: 5.953632302260354
MSE: 64.28342018481551
RMSE: 8.017694193770145
R^2: 0.9268611841588487
--------------------




Location ID: 140
MAE: 7.900875748099576
MSE: 133.2826012270339
RMSE: 11.544808410148429
R^2: 0.9643924515599754
--------------------




Location ID: 141
MAE: 10.422927146971002
MSE: 210.99132559517332
RMSE: 14.5255404579373
R^2: 0.9574316898514607
--------------------




Location ID: 142
MAE: 12.543284105391093
MSE: 324.8029132225369
RMSE: 18.02228934465699
R^2: 0.9603225133009279
--------------------




Location ID: 143
MAE: 6.291881314024554
MSE: 75.31250751652237
RMSE: 8.67827791192022
R^2: 0.9518832610182433
--------------------




Location ID: 144
MAE: 5.147071757371504
MSE: 53.45773761459876
RMSE: 7.311479851206509
R^2: 0.9226090162380454
--------------------




Location ID: 148
MAE: 5.5672443542552505
MSE: 70.7355096892779
RMSE: 8.410440516957355
R^2: 0.9529371719538473
--------------------




Location ID: 151
MAE: 4.711697659806969
MSE: 40.47082520073293
RMSE: 6.3616684290155305
R^2: 0.938802596383281
--------------------




Location ID: 152
MAE: 1.5587671234663778
MSE: 4.176895222031764
RMSE: 2.0437453907059373
R^2: 0.49436214609292095
--------------------




Location ID: 158
MAE: 4.917211770000706
MSE: 45.32005345337376
RMSE: 6.732017041969945
R^2: 0.9297495275899608
--------------------




Location ID: 161
MAE: 16.383533214161254
MSE: 736.4689421366776
RMSE: 27.137961274507663
R^2: 0.9520453531337572
--------------------




Location ID: 162
MAE: 11.828562728669004
MSE: 286.02999477156123
RMSE: 16.912421316049375
R^2: 0.9617931237587677
--------------------




Location ID: 163
MAE: 10.941767772281073
MSE: 282.49725764220756
RMSE: 16.80765473354946
R^2: 0.9518175668392692
--------------------




Location ID: 164
MAE: 9.668675248085371
MSE: 216.32382580795235
RMSE: 14.707951108429492
R^2: 0.9321138027703204
--------------------




Location ID: 166
MAE: 4.153493809153198
MSE: 37.96507926704882
RMSE: 6.161580906475936
R^2: 0.8838351271424871
--------------------




Location ID: 170
MAE: 11.639434569874654
MSE: 336.4392107728629
RMSE: 18.34227932327013
R^2: 0.9490502676684819
--------------------




Location ID: 186
MAE: 12.339530975559029
MSE: 300.8808753813529
RMSE: 17.345918118720405
R^2: 0.9339598337138235
--------------------




Location ID: 209
MAE: 2.211356889896927
MSE: 9.487749054799018
RMSE: 3.0802189946169443
R^2: 0.7912127797855055
--------------------




Location ID: 211
MAE: 4.7107695774980325
MSE: 42.268063923749
RMSE: 6.5013893841046775
R^2: 0.9220370648426975
--------------------




Location ID: 224
MAE: 2.386257396757574
MSE: 10.331009087707614
RMSE: 3.2141887137670704
R^2: 0.8390546017829003
--------------------




Location ID: 229
MAE: 8.31424000745497
MSE: 142.63464628424134
RMSE: 11.942974766959919
R^2: 0.9557914497077082
--------------------




Location ID: 230
MAE: 13.990713860855914
MSE: 456.06229777897363
RMSE: 21.355615134642544
R^2: 0.92942798463248
--------------------




Location ID: 231
MAE: 6.95520665283614
MSE: 90.7329776627047
RMSE: 9.525385958726538
R^2: 0.9368671976906809
--------------------




Location ID: 232
MAE: 2.600530882568566
MSE: 12.28891193274268
RMSE: 3.5055544401339254
R^2: 0.8216092345411468
--------------------




Location ID: 233
MAE: 7.096442876678746
MSE: 99.21553270638415
RMSE: 9.960699408494573
R^2: 0.9283220361922995
--------------------




Location ID: 234
MAE: 9.27754872272219
MSE: 178.58987225669023
RMSE: 13.363752177314955
R^2: 0.9640757531585862
--------------------




Location ID: 236
MAE: 15.881620576195816
MSE: 539.6516597064713
RMSE: 23.230403778377838
R^2: 0.9730697919118486
--------------------




Location ID: 237
MAE: 15.832198557431095
MSE: 585.8526331145155
RMSE: 24.20439284746708
R^2: 0.9734569669875079
--------------------




Location ID: 238
MAE: 8.810594484023516
MSE: 149.89297441656427
RMSE: 12.243078633112027
R^2: 0.96074861668479
--------------------




Location ID: 239
MAE: 10.841493106828315
MSE: 246.35753313018193
RMSE: 15.695780742931584
R^2: 0.9635251077780649
--------------------




Location ID: 243
MAE: 1.3356514275560019
MSE: 3.1973073219181463
RMSE: 1.788101597202504
R^2: 0.6988525644827823
--------------------




Location ID: 244
MAE: 2.0814440610250884
MSE: 7.706419536585017
RMSE: 2.7760438643121286
R^2: 0.6028018886999993
--------------------




Location ID: 246
MAE: 9.192932255902237
MSE: 180.65773577847264
RMSE: 13.440897878433296
R^2: 0.9274729290472763
--------------------




Location ID: 249
MAE: 8.668322748675923
MSE: 169.87554486614067
RMSE: 13.03363130006909
R^2: 0.9439837190865532
--------------------




Location ID: 261
MAE: 3.206787446805142
MSE: 18.80985902860137
RMSE: 4.337033436417268
R^2: 0.8973335298891325
--------------------




Location ID: 262
MAE: 6.536940699433301
MSE: 81.97233974872094
RMSE: 9.053857727439775
R^2: 0.9434490590053721
--------------------




Location ID: 263
MAE: 8.535721549664034
MSE: 137.60234643810276
RMSE: 11.730402654559764
R^2: 0.9503532655495145
--------------------
Location ID: 120
MAE: 0.16793928918291312
MSE: 0.07600964755176078
RMSE: 0.2756984721607299
R^2: -0.051419697175099754
--------------------




Location ID: 202
MAE: 0.7029438651560015
MSE: 0.8821331732442709
RMSE: 0.9392194489278163
R^2: 0.16678997338280233
--------------------




Location ID: 194
MAE: 0.49298059620822854
MSE: 0.5996234800791376
RMSE: 0.7743535885363595
R^2: 0.18366392772601703
--------------------
Location ID: 153
MAE: 0.14993702891646632
MSE: 0.05930385462819503
RMSE: 0.24352382763950436
R^2: -0.2569715913339299
--------------------




Location ID: 128
MAE: 0.19562849051812115
MSE: 0.08549318720352268
RMSE: 0.2923921804760221
R^2: -0.2146719200893792
--------------------


In [22]:
import os
from sklearn2pmml import sklearn2pmml, make_pmml_pipeline

# Create a new folder named 'xgboost_models'
os.makedirs('xgboost_models', exist_ok=True)

# Iterate over the models
for location_id, model in models.items():
    # Create a pipeline for the model
    pipeline = make_pmml_pipeline(model)

    # Convert the model to PMML
    sklearn2pmml(pipeline, f'xgboost_models/model_{location_id}.pmml')


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.6; total time=   8.6s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   6.3s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.6; total time=  14.1s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   9.5s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=50, subsample=0.8; total time=   9.2s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.6; total time=  19.3s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100, subsample=0.6; total time=   8.3s
[CV] END colsample_bytree=0.6, learning_rate=0.1, ma

[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   8.1s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=100, subsample=0.6; total time=   8.7s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=5, min_child_weight=3, n_estimators=50, subsample=0.8; total time=   6.6s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.8; total time=  11.0s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=100, subsample=0.6; total time=  19.8s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   4.3s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.6; total time=   8.2s
[CV] END colsample_bytree=0.6, learning_rate=0.1, ma

[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   4.2s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=3, n_estimators=50, subsample=0.6; total time=   4.3s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=50, subsample=0.6; total time=   6.9s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=  12.9s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=1, n_estimators=50, subsample=0.6; total time=  10.5s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=50, subsample=0.8; total time=   9.2s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.8; total time=  17.7s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max

### assigning grid search for the model and exported as pmml files

In [45]:
# Import necessary libraries
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml

# Create a one-hot encoder
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Create a preprocessor that applies the one-hot encoder to the 'timeslot' column
preprocessor = ColumnTransformer(transformers=[('one_hot', one_hot_encoder, ['timeslot'])], remainder='passthrough')

# Define the parameter grid
param_grid = {
    'model__learning_rate': [0.05, 0.1, 0.15],
    'model__max_depth': [3, 5, 7],
    'model__n_estimators': [50, 100],
    'model__min_child_weight': [1, 3],
    'model__subsample': [0.6, 0.8],
    'model__colsample_bytree': [0.6, 0.8]
}

# Get a list of all unique location ids
location_ids = df['LocationID'].unique()

# Create a dictionary to store the models for each location id
models = {}

# Define the folder to store the PMML files
export_folder = "grid_searchModel"

# Create the folder if it doesn't exist
if not os.path.exists(export_folder):
    os.makedirs(export_folder)

# Define a function to export the model as PMML
def export_model_as_pmml(model, filename):
    try:
        # Convert Scikit-Learn model to PMML pipeline
        pmml_pipeline = make_pmml_pipeline(model)
        # Export the PMML pipeline
        sklearn2pmml(pmml_pipeline, filename)
        print(f"Model exported as PMML: {filename}")
    except Exception as e:
        print(f"Error exporting the model: {e}")

for id in location_ids:
    # Filter the data for the current location id
    df_location = df[df['LocationID'] == id]
    
    # Separate the target feature
    y = df_location['busyness']
    X = df_location.drop(['busyness', 'LocationID'], axis=1)

    # Create a pipeline that preprocesses the data and then trains a model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', XGBRegressor(random_state=42))])
    
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=0, n_jobs=-1)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Fit the grid search model
    grid_search.fit(X_train, y_train)
    
    # Store the model in the dictionary
    models[id] = grid_search
    
    # Export the model as PMML
    pmml_filename = os.path.join(export_folder, f"model_{id}.pmml")
    export_model_as_pmml(grid_search.best_estimator_, pmml_filename)
    
    # Make predictions
    y_pred = grid_search.predict(X_test)
    
    # Calculate and print metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Location ID: {id}")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R^2: {r2}")
    print("--------------------")



Model exported as PMML: grid_searchModel/model_4.pmml
Location ID: 4
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.15, 'model__max_depth': 5, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample': 0.8}
MAE: 2.242282551840395
MSE: 9.944266763777485
RMSE: 3.153453149133103
R^2: 0.8711686677299045
--------------------
Model exported as PMML: grid_searchModel/model_12.pmml
Location ID: 12
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 7, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample': 0.8}
MAE: 1.0287385174511794
MSE: 2.1666341073743522
RMSE: 1.4719490845047434
R^2: 0.6752595552647621
--------------------
Model exported as PMML: grid_searchModel/model_13.pmml
Location ID: 13
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample': 0.8}

Model exported as PMML: grid_searchModel/model_125.pmml
Location ID: 125
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_child_weight': 1, 'model__n_estimators': 100, 'model__subsample': 0.8}
MAE: 3.429499077351873
MSE: 21.007282073995913
RMSE: 4.583370165500045
R^2: 0.8720542724539432
--------------------
Model exported as PMML: grid_searchModel/model_127.pmml
Location ID: 127
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_child_weight': 1, 'model__n_estimators': 100, 'model__subsample': 0.8}
MAE: 0.7434631791564076
MSE: 0.971409221166632
RMSE: 0.9856009441790485
R^2: 0.4982774950661224
--------------------
Model exported as PMML: grid_searchModel/model_137.pmml
Location ID: 137
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample'

Model exported as PMML: grid_searchModel/model_229.pmml
Location ID: 229
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.15, 'model__max_depth': 7, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample': 0.6}
MAE: 8.18298607037982
MSE: 145.55213178299795
RMSE: 12.064498820216194
R^2: 0.9548871967245881
--------------------
Model exported as PMML: grid_searchModel/model_230.pmml
Location ID: 230
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.15, 'model__max_depth': 7, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample': 0.8}
MAE: 13.898213948493007
MSE: 460.186980919524
RMSE: 21.45196916181645
R^2: 0.9287897227033561
--------------------
Model exported as PMML: grid_searchModel/model_231.pmml
Location ID: 231
Best parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.15, 'model__max_depth': 7, 'model__min_child_weight': 3, 'model__n_estimators': 100, 'model__subsample

### print out the location ids with low metrics

In [25]:
# Calculate the total number of rows in the dataset
total_rows = df.shape[0]

location_ids = [120, 202, 194, 153, 128]

# Create a dictionary to store the number of rows for each location id
location_counts = {}

for id in location_ids:
    # Filter the data for the current location iddf_location = df[df['LocationID'] == id]
    df_location = df[df['LocationID'] == id]
    
    # Count the number of rows for the current location id
    location_counts[id] = df_location.shape[0]

# Create a dictionary to store the proportion of rows for each location id
location_proportions = {}

for id, count in location_counts.items():
    # Calculate the proportion of rows for the current location id
    location_proportions[id] = count / total_rows * 100

location_proportions

{120: 0.3078438615933998,
 202: 1.2576178739192663,
 194: 0.6556569589871231,
 153: 0.30441215952973566,
 128: 0.37728536217577985}

### print out the proportions for all location id

In [26]:
# Calculate the total number of rows in the dataset
total_rows = df.shape[0]

# Get a list of all unique location ids
location_ids_total = df['LocationID'].unique().tolist()

# Create a dictionary to store the number of rows for each location id
location_counts_all = {}

for id in location_ids_total:
    # Filter the data for the current location id
    df_location = df[df['LocationID'] == id]
    
    # Count the number of rows for the current location id
    location_counts_all[id] = df_location.shape[0]

# Create a dictionary to store the proportion of rows for each location id
location_proportions_all = {}

for id, count in location_counts_all.items():
    # Calculate the proportion of rows for the current location id
    location_proportions_all[id] = count / total_rows * 100

location_proportions_all

{4: 1.571113950676348,
 12: 1.1253964120545599,
 13: 1.586657542376474,
 24: 1.5931172168492533,
 41: 1.6060365657948124,
 42: 1.6056328361402639,
 43: 1.599576891322033,
 45: 1.5485050900216197,
 48: 1.613505564403964,
 50: 1.6120925106130433,
 68: 1.6139092940585125,
 74: 1.6106794568221228,
 75: 1.612496240267592,
 79: 1.6126981050948663,
 87: 1.603008593385697,
 88: 1.5565796831125942,
 90: 1.612496240267592,
 100: 1.612899969922141,
 107: 1.6133036995766896,
 113: 1.6070458899311844,
 114: 1.6100738623402995,
 116: 1.604017917522069,
 125: 1.6009899451129534,
 127: 1.413861250229621,
 137: 1.611285051303946,
 140: 1.611890645785769,
 141: 1.612899969922141,
 142: 1.6120925106130433,
 143: 1.607853349240282,
 144: 1.5963470540856433,
 148: 1.6116887809584943,
 151: 1.6088626733766536,
 152: 1.5870612720310227,
 158: 1.5991731616674842,
 161: 1.6133036995766896,
 162: 1.612899969922141,
 163: 1.6133036995766896,
 164: 1.6133036995766896,
 166: 1.6003843506311302,
 170: 1.61330369957

### implement ensemble models with metrics printed out

In [42]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import os
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Create a one-hot encoder
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a preprocessor that applies the one-hot encoder to the 'timeslot' column
preprocessor = ColumnTransformer(transformers=[('one_hot', one_hot_encoder, ['timeslot'])], remainder='passthrough')

# Define the base models
base_models = [("XGB_model", XGBRegressor(random_state=42)),
               ("RF_model", RandomForestRegressor(random_state=42))]

# Define the final model
final_model = LinearRegression()

# Define the stacking regressor
stacked_model = StackingRegressor(estimators=base_models, final_estimator=final_model)

# Create a pipeline that preprocesses the data and then trains a model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', stacked_model)])

# Get a list of all unique location ids
location_ids = df['LocationID'].unique()

# Create a dictionary to store the models for each location id
models = {}

# Create a directory to store the PMML files
os.makedirs("ensembled_model", exist_ok=True)

for id in location_ids:
    # Filter the data for the current location id
    df_location = df[df['LocationID'] == id]
    
    # Separate the target feature
    y = df_location['busyness']
    X = df_location.drop(['busyness', 'LocationID'], axis=1)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model on the preprocessed data
    pipeline.fit(X_train, y_train)
    
    # Store the model in the dictionary
    models[id] = pipeline
    
    # Export the trained model to a PMML file
    pmml_pipeline = make_pmml_pipeline(pipeline, active_fields=X_train.columns)
    sklearn2pmml(pmml_pipeline, f"ensembled_model/model_{id}.pmml")
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate and print metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Location ID: {id}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R^2: {r2}")
    print("--------------------")


Location ID: 4
MAE: 2.248169451010268
MSE: 9.990016732007298
RMSE: 3.160698772741132
R^2: 0.8705759614501585
--------------------
Location ID: 12
MAE: 1.0260297291160492
MSE: 2.1306795348774243
RMSE: 1.4596847381806197
R^2: 0.680648514952592
--------------------
Location ID: 13
MAE: 4.114659663775586
MSE: 33.19905543703136
RMSE: 5.761862150123982
R^2: 0.9191687332546271
--------------------
Location ID: 24
MAE: 2.479583447926964
MSE: 11.160755686669445
RMSE: 3.340771720227146
R^2: 0.8595379968334557
--------------------
Location ID: 41
MAE: 3.1080527365643356
MSE: 17.47772100179404
RMSE: 4.180636435017286
R^2: 0.8652031281334571
--------------------
Location ID: 42
MAE: 2.2988769287176813
MSE: 9.102270125412364
RMSE: 3.016996871959327
R^2: 0.7412907206039578
--------------------
Location ID: 43
MAE: 8.671705529218853
MSE: 158.0971478356541
RMSE: 12.573668829568167
R^2: 0.9371041849420855
--------------------
Location ID: 45
MAE: 1.912294599808721
MSE: 6.6187575564175205
RMSE: 2.5726946

Location ID: 202
MAE: 0.6634257608696755
MSE: 0.7824716394258736
RMSE: 0.8845742701581781
R^2: 0.2609242739217338
--------------------
Location ID: 194
MAE: 0.4530876043424717
MSE: 0.5004941126087781
RMSE: 0.7074560852864141
R^2: 0.31862008133941044
--------------------
Location ID: 153
MAE: 0.13332619540476726
MSE: 0.04746996712469981
RMSE: 0.21787603614142564
R^2: -0.006147079163635105
--------------------
Location ID: 128
MAE: 0.17928706494797067
MSE: 0.07059907807986054
RMSE: 0.2657048702599569
R^2: -0.003059080294420813
--------------------


In [46]:
X_train.columns

Index(['temp', 'precip', 'vis', 'wind_spd', 'day_of_week', 'day', 'month',
       'hour', 'timeslot'],
      dtype='object')

In [28]:
df

Unnamed: 0,LocationID,busyness,temp,precip,vis,wind_spd,day_of_week,day,month,hour,timeslot
0,4,16.5,10.6,0.0,10,1.6,5,1,1,0,Late Night
1,12,1.0,10.6,0.0,10,1.6,5,1,1,0,Late Night
2,13,17.5,10.6,0.0,10,1.6,5,1,1,0,Late Night
3,24,12.5,10.6,0.0,10,1.6,5,1,1,0,Late Night
4,41,13.5,10.6,0.0,10,1.6,5,1,1,0,Late Night
...,...,...,...,...,...,...,...,...,...,...,...
495376,163,0.5,2.2,0.0,10,2.1,3,1,12,22,Late Night
495377,50,0.5,2.2,0.0,10,1.5,3,1,12,23,Late Night
495378,68,0.5,2.2,0.0,10,1.5,3,1,12,23,Late Night
495379,79,0.5,2.2,0.0,10,1.5,3,1,12,23,Late Night
