### Group 1: Year-on-Year Comparison (2023 vs 2024)
Focus: Perform a year-on-year comparison between the 2023 and 2024 food drives, analyzing donations, and route completion times.
ML Task: Comparative Analysis
Objective: Build a model that predicts how donation predictions from 2023 align with 2024.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
#load the 2024 dataset as the test dataset
data_2024 = pd.read_csv('cleaned_data_2024.csv', encoding='latin1')

In [3]:
#load the 2023 dataset as the test dataset
data_2023 = pd.read_csv('cleaned_data_2023.csv', encoding='latin1')

In [4]:
data_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446 entries, 0 to 445
Data columns (total 14 columns):
 #   Column                                                Non-Null Count  Dtype 
---  ------                                                --------------  ----- 
 0   date                                                  446 non-null    object
 1   drop_off_location                                     446 non-null    object
 2   stake                                                 446 non-null    object
 3   route_number/name                                     446 non-null    object
 4   time_spent_collecting_donations                       446 non-null    object
 5   #_of_adult_volunteers_who_participated_in_this_route  446 non-null    int64 
 6   #_of_youth_volunteers_who_participated_in_this_route  446 non-null    int64 
 7   #_of_doors_in_route                                   446 non-null    int64 
 8   #_of_donation_bags_collected                          446 non-null    

In [5]:
data_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454 entries, 0 to 453
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Date                           454 non-null    object 
 1   Location                       454 non-null    object 
 2   Stake                          454 non-null    object 
 3   # of Adult Volunteers          454 non-null    int64  
 4   # of Youth Volunteers          454 non-null    int64  
 5   Donation Bags Collected        454 non-null    int64  
 6   Time to Complete (min)         454 non-null    float64
 7   Completed More Than One Route  454 non-null    int64  
 8   Ward/Branch                    454 non-null    object 
 9   Routes Completed               454 non-null    int64  
 10  Doors in Route                 454 non-null    int64  
 11  Time Spent                     454 non-null    float64
dtypes: float64(2), int64(6), object(4)
memory usage: 4

Feature Engineering


In [6]:
#drop the time spent in 2023 data

data_2023 = data_2023.drop(columns = ['Time Spent', 'Date'])

In [7]:
data_2023.columns

Index(['Location', 'Stake', '# of Adult Volunteers', '# of Youth Volunteers',
       'Donation Bags Collected', 'Time to Complete (min)',
       'Completed More Than One Route', 'Ward/Branch', 'Routes Completed',
       'Doors in Route'],
      dtype='object')

In [8]:
data_2024.columns

Index(['date', 'drop_off_location', 'stake', 'route_number/name',
       'time_spent_collecting_donations',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_doors_in_route', '#_of_donation_bags_collected',
       'did_you_complete_more_than_1_route?', 'Number of routes completed',
       'ward', 'Form Completion Time', 'Total Volunteers'],
      dtype='object')

In [9]:
# Renaming columns in data_2023 to match data_2024
data_2023 = data_2023.rename(columns={

    'Location': 'drop_off_location',
    'Stake': 'stake',
    'Time to Complete (min)': 'time_spent_collecting_donations',
    'Number of routes completed': 'Number of routes completed',
    'Doors in Route': '#_of_doors_in_route',
    'Donation Bags Collected': '#_of_donation_bags_collected',
    'Routes Completed': 'Number of routes completed',
    'Ward/Branch': 'ward',
    '# of Adult Volunteers': '#_of_adult_volunteers_who_participated_in_this_route',
    '# of Youth Volunteers': '#_of_youth_volunteers_who_participated_in_this_route',
    'Total Volunteers': 'Total Volunteers',


})



In [10]:
data_2023.columns

Index(['drop_off_location', 'stake',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_donation_bags_collected', 'time_spent_collecting_donations',
       'Completed More Than One Route', 'ward', 'Number of routes completed',
       '#_of_doors_in_route'],
      dtype='object')

In [11]:
data_2024 = data_2024.rename(columns={
    'did_you_complete_more_than_1_route?': 'Completed More Than One Route'})

In [12]:
data_2024 = data_2024.drop(columns = ['Form Completion Time', 'date'])

In [13]:
# Print the updated column names in 2023 for verification
data_2023.columns


Index(['drop_off_location', 'stake',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_donation_bags_collected', 'time_spent_collecting_donations',
       'Completed More Than One Route', 'ward', 'Number of routes completed',
       '#_of_doors_in_route'],
      dtype='object')

In [14]:
data_2024 = data_2024.drop(columns = ['route_number/name'])

In [15]:
data_2024.columns

Index(['drop_off_location', 'stake', 'time_spent_collecting_donations',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_doors_in_route', '#_of_donation_bags_collected',
       'Completed More Than One Route', 'Number of routes completed', 'ward',
       'Total Volunteers'],
      dtype='object')

In [16]:
data_2024 = data_2024.drop(columns = ['Total Volunteers'])

In [17]:
data_2024.head()

Unnamed: 0,drop_off_location,stake,time_spent_collecting_donations,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_doors_in_route,#_of_donation_bags_collected,Completed More Than One Route,Number of routes completed,ward
0,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,1,0,1,1,No,1,Clareview Ward
1,Gateway Stake Centre,Gateway Stake,0 - 30 Minutes,2,2,20,20,No,1,Lee Ridge Ward
2,Bonnie Doon Stake Centre,Bonnie Doon Stake,0 - 30 Minutes,2,2,20,15,No,1,Forest Heights Ward
3,Bearspaw Chapel,Gateway Stake,30 - 60 Minutes,2,3,144,25,Yes,2,Lee Ridge Ward
4,Gateway Stake Centre,Gateway Stake,30 - 60 Minutes,1,0,230,21,No,1,Silver Berry Ward


In [18]:
data_2023.head()

Unnamed: 0,drop_off_location,stake,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_donation_bags_collected,time_spent_collecting_donations,Completed More Than One Route,ward,Number of routes completed,#_of_doors_in_route
0,Londonderry Chapel,Bonnie Doon Stake,1,3,14,25.0,0,Clareview Ward,2,14
1,Gateway Stake Centre,Gateway Stake,3,1,18,25.0,1,Crawford Plains Ward,2,144
2,Gateway Stake Centre,Gateway Stake,2,0,20,30.0,0,Silver Berry Ward,2,186
3,Gateway Stake Centre,Gateway Stake,2,0,20,25.0,1,Crawford Plains Ward,1,194
4,Londonderry Chapel,Bonnie Doon Stake,1,0,2,5.0,0,Londonderry Ward,2,1


In [19]:
data_2024['time_spent_collecting_donations'].unique()

array(['0 - 30 Minutes', '30 - 60 Minutes', '60 - 90 Minutes',
       '90 - 120 Minutes'], dtype=object)

In [20]:
import pandas as pd

# Convert `time_spent_collecting_donations` to numerical values
# Define bins and labels
bins = [0, 30, 60, 90, float('inf')]
labels = ['0 - 30 Minutes', '30 - 60 Minutes', '60 - 90 Minutes',
       '90 - 120 Minutes']

# Ensure 'time_spent_collecting_donations' is of type string before applying str methods
data_2023['time_spent_collecting_donations'] = data_2023['time_spent_collecting_donations'].astype(str)

# Apply the binning
data_2023['time_spent_collecting_donations'] = pd.cut(
    data_2023['time_spent_collecting_donations'].str.extract(r'(\d+)', expand=False).astype(float),  # Extract numbers
    bins=bins,
    labels=labels,
    right=True  # This includes the rightmost edge in each bin except the last
)

# View the resulting DataFrame
print(data_2023[['time_spent_collecting_donations', 'time_spent_collecting_donations']]) # Changed df to data_2023

    time_spent_collecting_donations time_spent_collecting_donations
0                    0 - 30 Minutes                  0 - 30 Minutes
1                    0 - 30 Minutes                  0 - 30 Minutes
2                    0 - 30 Minutes                  0 - 30 Minutes
3                    0 - 30 Minutes                  0 - 30 Minutes
4                    0 - 30 Minutes                  0 - 30 Minutes
..                              ...                             ...
449                  0 - 30 Minutes                  0 - 30 Minutes
450                  0 - 30 Minutes                  0 - 30 Minutes
451                 30 - 60 Minutes                 30 - 60 Minutes
452                90 - 120 Minutes                90 - 120 Minutes
453                  0 - 30 Minutes                  0 - 30 Minutes

[454 rows x 2 columns]


In [21]:
data_2024['time_spent_collecting_donations'].unique()

array(['0 - 30 Minutes', '30 - 60 Minutes', '60 - 90 Minutes',
       '90 - 120 Minutes'], dtype=object)

In [22]:
data_2023['time_spent_collecting_donations'].unique()

['0 - 30 Minutes', '30 - 60 Minutes', '60 - 90 Minutes', '90 - 120 Minutes']
Categories (4, object): ['0 - 30 Minutes' < '30 - 60 Minutes' < '60 - 90 Minutes' < '90 - 120 Minutes']

In [23]:
data_2024.head()

Unnamed: 0,drop_off_location,stake,time_spent_collecting_donations,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_doors_in_route,#_of_donation_bags_collected,Completed More Than One Route,Number of routes completed,ward
0,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,1,0,1,1,No,1,Clareview Ward
1,Gateway Stake Centre,Gateway Stake,0 - 30 Minutes,2,2,20,20,No,1,Lee Ridge Ward
2,Bonnie Doon Stake Centre,Bonnie Doon Stake,0 - 30 Minutes,2,2,20,15,No,1,Forest Heights Ward
3,Bearspaw Chapel,Gateway Stake,30 - 60 Minutes,2,3,144,25,Yes,2,Lee Ridge Ward
4,Gateway Stake Centre,Gateway Stake,30 - 60 Minutes,1,0,230,21,No,1,Silver Berry Ward


**Preparing dataset for prediction models**

Using 2023 as train and 2024 as test

In [24]:
#copy of 2023
data_2023_copy = data_2023.copy()

In [25]:
#copy of 2024
data_2024_copy = data_2024.copy()

In [26]:
# Get unique wards in both datasets
data_2023_wards = set(data_2023_copy['ward'].unique())
data_2024_wards = set(data_2024_copy['ward'].unique())

# Find wards that appear in both datasets
common_wards = data_2023_wards.intersection(data_2024_wards)

# Filter data_2023_copy and data_2024_copy to keep only rows with common wards
data_2023_filtered = data_2023_copy[data_2023_copy['ward'].isin(common_wards)].reset_index(drop=True)
data_2024_filtered = data_2024_copy[data_2024_copy['ward'].isin(common_wards)].reset_index(drop=True)

# Display filtered data for verification
print("Filtered Data 2023:")
print(data_2023_filtered.head())
print("\nFiltered Data 2024:")
print(data_2024_filtered.head())


Filtered Data 2023:
      drop_off_location              stake  \
0    Londonderry Chapel  Bonnie Doon Stake   
1  Gateway Stake Centre      Gateway Stake   
2  Gateway Stake Centre      Gateway Stake   
3  Gateway Stake Centre      Gateway Stake   
4    Londonderry Chapel  Bonnie Doon Stake   

   #_of_adult_volunteers_who_participated_in_this_route  \
0                                                  1      
1                                                  3      
2                                                  2      
3                                                  2      
4                                                  1      

   #_of_youth_volunteers_who_participated_in_this_route  \
0                                                  3      
1                                                  1      
2                                                  0      
3                                                  0      
4                                                  0 

In [27]:
data_2024_filtered['ward'].unique()

array(['Clareview Ward', 'Lee Ridge Ward', 'Forest Heights Ward',
       'Silver Berry Ward', 'Crawford Plains Ward', 'Londonderry Ward',
       'Woodbend Ward', 'Blackmud Creek Ward', 'Connors Hill Ward',
       'Griesbach Ward', 'Rutherford Ward', 'Rabbit Hill Ward',
       'Namao Ward', 'Ellerslie Ward', 'Greenfield Ward',
       'Southgate Ward', 'Terwillegar Park Ward', 'Wild Rose Ward',
       'Rio Vista Ward', 'Beaumont Ward', 'Wainwright Branch'],
      dtype=object)

In [28]:
data_2023_filtered['ward'].nunique()

21

In [29]:
#get the count of each ward
ward_counts = data_2023_filtered['ward'].value_counts()
ward_counts

Unnamed: 0_level_0,count
ward,Unnamed: 1_level_1
Crawford Plains Ward,51
Silver Berry Ward,41
Lee Ridge Ward,41
Griesbach Ward,32
Ellerslie Ward,31
Londonderry Ward,29
Blackmud Creek Ward,29
Clareview Ward,28
Forest Heights Ward,20
Southgate Ward,19


## **Predictions for number of donation bags collected per route**

In [30]:
# Use only 2023 as the train model
X_2023 = data_2023_filtered[['#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', 'ward', 'time_spent_collecting_donations',  'Number of routes completed', '#_of_doors_in_route']]
y_2023 = data_2023_filtered['#_of_donation_bags_collected']

In [31]:
X_2024 = data_2024_filtered[['#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', 'time_spent_collecting_donations', 'ward', 'Number of routes completed', '#_of_doors_in_route']]
y_2024 = data_2024_filtered['#_of_donation_bags_collected']

In [32]:
# Download the number of donation bags used for model testing
import pandas as pd
df = data_2024_filtered[['#_of_donation_bags_collected', 'ward']]

# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(df)

# Save the DataFrame to a CSV file
df.to_csv('Actual Donation Bags 2024 .csv', index=False)


In [33]:
X_train = X_2023
y_train = y_2023
X_test = X_2024
y_test = y_2024

In [34]:
X_train.isnull().sum()

Unnamed: 0,0
#_of_adult_volunteers_who_participated_in_this_route,0
#_of_youth_volunteers_who_participated_in_this_route,0
ward,0
time_spent_collecting_donations,0
Number of routes completed,0
#_of_doors_in_route,0


In [35]:
y_train.isnull().sum()

0

In [36]:
X_test.isnull().sum()

Unnamed: 0,0
#_of_adult_volunteers_who_participated_in_this_route,0
#_of_youth_volunteers_who_participated_in_this_route,0
time_spent_collecting_donations,0
ward,0
Number of routes completed,0
#_of_doors_in_route,0


In [37]:
y_test.isnull().sum()

0

In [38]:
X_train.head()

Unnamed: 0,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,ward,time_spent_collecting_donations,Number of routes completed,#_of_doors_in_route
0,1,3,Clareview Ward,0 - 30 Minutes,2,14
1,3,1,Crawford Plains Ward,0 - 30 Minutes,2,144
2,2,0,Silver Berry Ward,0 - 30 Minutes,2,186
3,2,0,Crawford Plains Ward,0 - 30 Minutes,1,194
4,1,0,Londonderry Ward,0 - 30 Minutes,2,1


In [39]:
X_test.head()

Unnamed: 0,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,time_spent_collecting_donations,ward,Number of routes completed,#_of_doors_in_route
0,1,0,0 - 30 Minutes,Clareview Ward,1,1
1,2,2,0 - 30 Minutes,Lee Ridge Ward,1,20
2,2,2,0 - 30 Minutes,Forest Heights Ward,1,20
3,2,3,30 - 60 Minutes,Lee Ridge Ward,2,144
4,1,0,30 - 60 Minutes,Silver Berry Ward,1,230


In [40]:
X_train.columns


Index(['#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route', 'ward',
       'time_spent_collecting_donations', 'Number of routes completed',
       '#_of_doors_in_route'],
      dtype='object')

In [41]:
X_test.columns

Index(['#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       'time_spent_collecting_donations', 'ward', 'Number of routes completed',
       '#_of_doors_in_route'],
      dtype='object')

Encoding the 'ward' and 'time spent collecting donations' column

In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

#encoding the time spent collecting donation
# Create a label encoder object
label_encoder = LabelEncoder()

# Fit and transform the 'time' column
X_train['time_spent_collecting_donations'] = label_encoder.fit_transform(X_train['time_spent_collecting_donations'])
X_test['time_spent_collecting_donations'] = label_encoder.transform(X_test['time_spent_collecting_donations'])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['time_spent_collecting_donations'] = label_encoder.fit_transform(X_train['time_spent_collecting_donations'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['time_spent_collecting_donations'] = label_encoder.transform(X_test['time_spent_collecting_donations'])


In [43]:
X_train['time_spent_collecting_donations'].unique()

#show the label each bin represents
for i, label in enumerate(label_encoder.classes_):
    print(f"{label}: {i}")



0 - 30 Minutes: 0
30 - 60 Minutes: 1
60 - 90 Minutes: 2
90 - 120 Minutes: 3


In [44]:
X_test['time_spent_collecting_donations'].unique()

#show the labeleach class represents
for i, label in enumerate(label_encoder.classes_):
    print(f"{label}: {i}")

0 - 30 Minutes: 0
30 - 60 Minutes: 1
60 - 90 Minutes: 2
90 - 120 Minutes: 3


In [45]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Initialize OneHotEncoder with handle_unknown='ignore' to avoid errors for missing categories in test data
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on the training data, then transform on the test data
encoded_ward_train = ohe.fit_transform(X_train[['ward']])
encoded_ward_test = ohe.transform(X_test[['ward']])

# Convert encoded arrays to DataFrames with column names
encoded_ward_train_df = pd.DataFrame(encoded_ward_train, columns=ohe.get_feature_names_out(['ward']), dtype=int)
encoded_ward_test_df = pd.DataFrame(encoded_ward_test, columns=ohe.get_feature_names_out(['ward']), dtype=int)

# Align columns in both train and test sets to ensure they match
encoded_ward_train_df, encoded_ward_test_df = encoded_ward_train_df.align(encoded_ward_test_df, fill_value=0, axis=1)

# Drop the original 'ward' column from X_train and X_test
X_train = X_train.drop(columns=['ward']).reset_index(drop=True)
X_test = X_test.drop(columns=['ward']).reset_index(drop=True)

# Concatenate the original DataFrames with the encoded columns
X_train_encoded = pd.concat([X_train, encoded_ward_train_df], axis=1)
X_test_encoded = pd.concat([X_test, encoded_ward_test_df], axis=1)

# Display the final encoded data
print("Encoded X_train for Linear Regression:")
print(X_train_encoded.head())
print("\nEncoded X_test for Linear Regression:")
print(X_test_encoded.head())


Encoded X_train for Linear Regression:
   #_of_adult_volunteers_who_participated_in_this_route  \
0                                                  1      
1                                                  3      
2                                                  2      
3                                                  2      
4                                                  1      

   #_of_youth_volunteers_who_participated_in_this_route  \
0                                                  3      
1                                                  1      
2                                                  0      
3                                                  0      
4                                                  0      

   time_spent_collecting_donations  Number of routes completed  \
0                                0                           2   
1                                0                           2   
2                                0                   

In [46]:
X_train = X_train_encoded
X_train.head()

Unnamed: 0,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,time_spent_collecting_donations,Number of routes completed,#_of_doors_in_route,ward_Beaumont Ward,ward_Blackmud Creek Ward,ward_Clareview Ward,ward_Connors Hill Ward,ward_Crawford Plains Ward,...,ward_Namao Ward,ward_Rabbit Hill Ward,ward_Rio Vista Ward,ward_Rutherford Ward,ward_Silver Berry Ward,ward_Southgate Ward,ward_Terwillegar Park Ward,ward_Wainwright Branch,ward_Wild Rose Ward,ward_Woodbend Ward
0,1,3,0,2,14,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,0,2,144,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,2,186,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2,0,0,1,194,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
X_test = X_test_encoded
X_test.head()

Unnamed: 0,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,time_spent_collecting_donations,Number of routes completed,#_of_doors_in_route,ward_Beaumont Ward,ward_Blackmud Creek Ward,ward_Clareview Ward,ward_Connors Hill Ward,ward_Crawford Plains Ward,...,ward_Namao Ward,ward_Rabbit Hill Ward,ward_Rio Vista Ward,ward_Rutherford Ward,ward_Silver Berry Ward,ward_Southgate Ward,ward_Terwillegar Park Ward,ward_Wainwright Branch,ward_Wild Rose Ward,ward_Woodbend Ward
0,1,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2,0,1,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,0,1,20,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,3,1,2,144,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,230,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [48]:
X_train.shape

(423, 26)

In [49]:
y_train.shape

(423,)

In [50]:
# Fit a regression model to predict the number of donation bags
model = LinearRegression()
model.fit(X_train, y_train)

In [51]:
# Continue with model training and prediction

y_pred = model.predict(X_test)

In [52]:
# Evaluate the model using root mean squared error
from sklearn.metrics import root_mean_squared_error

rmse = root_mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 35.77113100761932


**Using other models on our dataset**

In [53]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [54]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler # Import StandardScaler from sklearn.preprocessing
from sklearn.ensemble import GradientBoostingRegressor # Import GradientBoostingRegressor


# Standardize features (required for KNN)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [55]:
# Implement Decision Tree Regression
decision_tree = DecisionTreeRegressor(max_depth=5)  # You can adjust the maximum depth
decision_tree.fit(X_train, y_train)
y_pred_decision_tree = decision_tree.predict(X_test)

# Implement K-Nearest Neighbors (KNN) Regression
knn = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Implement Random Forest Regression
random_forest = RandomForestRegressor(n_estimators=100)  # You can adjust the number of trees (n_estimators)
random_forest.fit(X_train, y_train)
y_pred_random_forest = random_forest.predict(X_test)

# Implement Gradient Boosting Regression
gradient_boosting = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)  # You can adjust hyperparameters
gradient_boosting.fit(X_train, y_train)
y_pred_gradient_boosting = gradient_boosting.predict(X_test)

In [56]:
# Evaluate the models
from sklearn.metrics import root_mean_squared_error
def evaluate_model(y_true, y_pred, model_name):
    rmse = root_mean_squared_error(y_true, y_pred)

    print(f"{model_name} - Root Mean Squared Error: {rmse}")




In [57]:
evaluate_model(y_test, y_pred_decision_tree, "Decision Tree")
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors (KNN)")
evaluate_model(y_test, y_pred_random_forest, "Random Forest")
evaluate_model(y_test, y_pred_gradient_boosting, "Gradient Boosting")

Decision Tree - Root Mean Squared Error: 40.336059900804344
K-Nearest Neighbors (KNN) - Root Mean Squared Error: 33.20421113636355
Random Forest - Root Mean Squared Error: 31.238743645942428
Gradient Boosting - Root Mean Squared Error: 31.81706963467191


Random Forest has the least Root Mean Squared Error. It is the best performing model

### **We will optimize the model using cross validation**

### **Cross Validation**




In [58]:
from sklearn.model_selection import KFold, cross_val_score

In [59]:
models = []
models.append(('KNN', KNeighborsRegressor()))
models.append(('RFC', RandomForestRegressor(n_estimators=100)))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('XGB', XGBRegressor()))

In [60]:
# Define the models to evaluate
models = [
    ("Decision Tree", DecisionTreeRegressor()),
    ("K-Nearest Neighbors (KNN)", KNeighborsRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor())
]

# Define the number of folds for k-fold cross-validation
num_folds = 5

# Initialize empty lists to store results and names
results = []
names = []

# Iterate through the models
for name, model in models:
    # Create a KFold object
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Perform cross-validation using a regression metric (e.g., 'neg_mean_squared_error')
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_root_mean_squared_error')

    cv_results = np.abs(cv_results)

    # Store the results
    results.append(cv_results)
    names.append(name)

    # Print the mean and standard deviation of the scores
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")

Decision Tree: 35.5217 (19.1642)
K-Nearest Neighbors (KNN): 26.4989 (16.7310)
Random Forest: 27.1209 (16.1065)
Gradient Boosting: 29.8472 (16.9440)


K-Nearest Neighbors performs best in cross validation.

---



Hyperparameter Tuning of the models

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Define a dictionary of models and their respective parameter grids
models = {
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'knn__n_neighbors': [3, 5, 7, 9, 11],
            'knn__weights': ['uniform', 'distance'],
            'knn__metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(),
        'params': {
            'randomforest__n_estimators': [100, 150, 200],
            'randomforest__max_depth': [None, 10, 20],
            'randomforest__min_samples_split': [2, 5],
            'randomforest__min_samples_leaf': [1, 2],
            'randomforest__max_features': ['sqrt', 'log2'],
            'randomforest__bootstrap': [True, False],
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'gradientboosting__n_estimators': [100, 150, 200],
            'gradientboosting__learning_rate': [0.1, 0.05, 0.01],
            'gradientboosting__max_depth': [3, 5, 7],
            'gradientboosting__min_samples_split': [2, 5],
            'gradientboosting__min_samples_leaf': [1, 2],
            'gradientboosting__max_features': ['sqrt', 'log2']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'decisiontree__max_depth': [None, 10, 20, 30, 40],
            'decisiontree__min_samples_split': [2, 5, 10],
            'decisiontree__min_samples_leaf': [1, 2, 4],
            'decisiontree__max_features': ['auto', 'sqrt', 'log2'],
            'decisiontree__splitter': ['best', 'random']
        }
    }
}

# Store results for each model
best_models = {}
results = {}

# Iterate over each model and perform GridSearchCV
for model_name, config in models.items():
    # Set up the pipeline with a scaler and the model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        (model_name.lower(), config['model'])  # Add model to pipeline with a lowercase name
    ])

    # Initialize GridSearchCV with model's parameter grid
    grid_search = GridSearchCV(
        pipeline,
        config['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )

    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train, y_train)

    # Store the best model and its performance metrics
    best_models[model_name] = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_rmse = np.sqrt(-grid_search.best_score_)  # Convert negative MSE to RMSE

    # Store results for easy access and printing
    results[model_name] = {
        'best_params': best_params,
        'best_rmse': best_rmse
    }


# Make predictions# Find the best model name based on lowest RMSE
best_model_name = min(results, key=lambda k: results[k]['best_rmse'])

# Make predictions using the best model
y_pred = best_models[best_model_name].predict(X_test)

# Print the results for each model
for model_name, result in results.items():
    print(f"{model_name} Best RMSE: {result['best_rmse']:.4f}")

KNN Best RMSE: 31.0321
RandomForest Best RMSE: 29.1358
GradientBoosting Best RMSE: 29.0189
DecisionTree Best RMSE: 29.4979


450 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
254 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  Fil

Random Forest performed best after hyperparameter tuning

In [62]:

import pandas as pd
#downloading th prediction of the donation bags using random forest to csv
# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(y_pred_random_forest)

# Save the DataFrame to a CSV file
df.to_csv('y_pred_random_forest.csv', index=False)

## Implementing the route completion time model

Log transform time completion time and print it based on log information


In [63]:
new_data_2023 = data_2023_filtered.copy()
new_data_2024 = data_2024_filtered.copy()

In [64]:
#encode time colum
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
new_data_2023['time_spent_collecting_donations'] = label_encoder.fit_transform(new_data_2023['time_spent_collecting_donations'])
new_data_2024['time_spent_collecting_donations'] = label_encoder.transform(new_data_2024['time_spent_collecting_donations'])

In [65]:
log_transformed_2023_time = np.log1p(new_data_2023['time_spent_collecting_donations'])

# Display the result
log_transformed_2023_time

Unnamed: 0,time_spent_collecting_donations
0,0.000000
1,0.000000
2,0.000000
3,0.000000
4,0.000000
...,...
418,0.693147
419,1.386294
420,0.000000
421,0.000000


In [66]:
log_transformed_2024_time = np.log1p(new_data_2024['time_spent_collecting_donations'])

# Display the result
log_transformed_2024_time

Unnamed: 0,time_spent_collecting_donations
0,0.000000
1,0.000000
2,0.000000
3,0.693147
4,0.693147
...,...
436,0.693147
437,0.000000
438,0.693147
439,1.098612


In [67]:
#save and download log transformed time as csv
import pandas as pd
df = pd.DataFrame(log_transformed_2023_time)
df.to_csv('Actual time completion 2024.csv', index=False)

### Prediction for Route Completion Time

In [68]:
#copy of data_2023_filtered
data_2023_route = data_2023_filtered.copy()
data_2024_route = data_2024_filtered.copy()

In [69]:
# Use only 2023 as the train model
X_2023_route = data_2023_route[['#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', 'ward', '#_of_donation_bags_collected', 'Number of routes completed', '#_of_doors_in_route', ]]
y_2023_route = data_2023_route['time_spent_collecting_donations']

In [70]:
# Use only 2024 as the train model
X_2024_route = data_2024_route[['#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', '#_of_donation_bags_collected', 'ward', 'Number of routes completed', '#_of_doors_in_route']]
y_2024_route = data_2024_route['time_spent_collecting_donations']

In [71]:
X_train_route = X_2023_route
y_train_route = y_2023_route
X_test_route = X_2024_route
y_test_route = y_2024_route

In [72]:
y_test_route.unique()

array(['0 - 30 Minutes', '30 - 60 Minutes', '60 - 90 Minutes',
       '90 - 120 Minutes'], dtype=object)

In [73]:
y_train_route.unique()

['0 - 30 Minutes', '30 - 60 Minutes', '90 - 120 Minutes', '60 - 90 Minutes']
Categories (4, object): ['0 - 30 Minutes' < '30 - 60 Minutes' < '60 - 90 Minutes' < '90 - 120 Minutes']

In [74]:
#encoding the time spent collecting donation
# Create a label encoder object
label_encoder = LabelEncoder()

# Fit and transform the 'time' column
y_train_route = label_encoder.fit_transform(y_train_route)
y_test_route = label_encoder.transform(y_test_route)

In [75]:


log_transformed_y_train = np.log1p(y_train_route)

# Display the result
log_transformed_y_train

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.69314718, 0.        , 0.69314718,
       0.        , 0.69314718, 0.69314718, 0.        , 0.69314718,
       0.        , 0.69314718, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.69314718, 1.38629436, 0.        ,
       0.        , 0.        , 0.69314718, 0.        , 0.        ,
       0.69314718, 0.69314718, 0.69314718, 0.69314718, 0.        ,
       0.69314718, 0.        , 0.        , 0.69314718, 0.        ,
       0.69314718, 0.69314718, 0.69314718, 0.69314718, 0.69314718,
       0.69314718, 0.        , 0.        , 1.38629436, 0.        ,
       0.        , 0.69314718, 0.        , 1.09861229, 0.69314718,
       0.69314718, 0.69314718, 0.        , 0.69314718, 0.69314718,
       0.69314718, 0.        , 0.69314718, 0.69314718, 0.69314718,
       1.38629436, 0.69314718, 1.09861229, 0.        , 0.69314718,
       0.69314718, 0.69314718, 0.        , 0.69314718, 0.     

In [76]:
y_train_route = log_transformed_y_train

In [77]:
log_transformed_y_test = np.log1p(y_test_route)

# Display the result
log_transformed_y_test

array([0.        , 0.        , 0.        , 0.69314718, 0.69314718,
       0.        , 0.69314718, 0.69314718, 1.09861229, 0.69314718,
       0.        , 0.69314718, 0.        , 0.        , 0.69314718,
       0.69314718, 0.        , 0.69314718, 0.        , 0.69314718,
       0.69314718, 1.09861229, 0.69314718, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.69314718, 0.        ,
       0.69314718, 0.        , 1.38629436, 0.        , 0.69314718,
       0.        , 0.69314718, 0.        , 0.69314718, 0.        ,
       0.        , 0.69314718, 0.69314718, 0.69314718, 0.69314718,
       0.        , 0.69314718, 0.        , 1.09861229, 0.        ,
       0.        , 1.09861229, 1.09861229, 0.69314718, 0.69314718,
       0.        , 1.09861229, 0.69314718, 0.69314718, 0.69314718,
       0.        , 0.69314718, 0.        , 0.        , 1.09861229,
       0.69314718, 1.09861229, 1.09861229, 0.        , 0.69314718,
       0.69314718, 0.        , 0.69314718, 0.        , 1.38629

In [78]:
y_test_route = log_transformed_y_test

In [79]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Initialize OneHotEncoder with handle_unknown='ignore' to avoid errors for missing categories in test data
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on the training data, then transform on the test data
encoded_ward_train_route = ohe.fit_transform(X_train_route[['ward']])
encoded_ward_test_route = ohe.transform(X_test_route[['ward']])

# Convert encoded arrays to DataFrames with column names
encoded_ward_train_route_df = pd.DataFrame(encoded_ward_train_route, columns=ohe.get_feature_names_out(['ward']), dtype=int)
encoded_ward_test_route_df = pd.DataFrame(encoded_ward_test_route, columns=ohe.get_feature_names_out(['ward']), dtype=int)

# Align columns in both train and test sets to ensure they match
encoded_ward_train_route_df, encoded_ward_testc_df = encoded_ward_train_route_df.align(encoded_ward_test_route_df, fill_value=0, axis=1)

# Drop the original 'ward' column from X_train and X_test
X_train_route = X_train_route.drop(columns=['ward']).reset_index(drop=True)
X_test_route = X_test_route.drop(columns=['ward']).reset_index(drop=True)

# Concatenate the original DataFrames with the encoded columns
X_train_route_encoded = pd.concat([X_train_route, encoded_ward_train_route_df], axis=1)
X_test_route_encoded = pd.concat([X_test_route, encoded_ward_test_route_df], axis=1)

# Display the final encoded data
print("Encoded X_train for Linear Regression:")
print(X_train_route_encoded.head())
print("\nEncoded X_test for Linear Regression:")
print(X_test_route_encoded.head())

Encoded X_train for Linear Regression:
   #_of_adult_volunteers_who_participated_in_this_route  \
0                                                  1      
1                                                  3      
2                                                  2      
3                                                  2      
4                                                  1      

   #_of_youth_volunteers_who_participated_in_this_route  \
0                                                  3      
1                                                  1      
2                                                  0      
3                                                  0      
4                                                  0      

   #_of_donation_bags_collected  Number of routes completed  \
0                            14                           2   
1                            18                           2   
2                            20                           2   

In [80]:
model_route = LinearRegression()
model_route.fit(X_train_route, y_train_route)

In [81]:

# Continue with model training and prediction

y_pred_route= model_route.predict(X_test_route)


In [82]:
# Evaluate the model
rmse_route = root_mean_squared_error(y_test_route, y_pred_route)
print(f"Root_Mean Squared Error: {rmse_route}")

Root_Mean Squared Error: 0.42938520101598265


In [83]:
# Standardize features (required for KNN)
scaler = StandardScaler()
X_train_route= scaler.fit_transform(X_train_route)
X_test_route = scaler.transform(X_test_route)

In [84]:
# Implement Decision Tree Regression
decision_tree_route = DecisionTreeRegressor(max_depth=5)  # You can adjust the maximum depth
decision_tree_route.fit(X_train_route, y_train_route)
y_pred_decision_tree_route = decision_tree_route.predict(X_test_route)

# Implement K-Nearest Neighbors (KNN) Regression
knn_route = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors
knn_route.fit(X_train_route, y_train_route)
y_pred_knn_route = knn_route.predict(X_test_route)

# Implement Random Forest Regression
random_forest_route = RandomForestRegressor(n_estimators=100)  # You can adjust the number of trees (n_estimators)
random_forest_route.fit(X_train_route, y_train_route)
y_pred_random_forest_route = random_forest_route.predict(X_test_route)

# Implement Gradient Boosting Regression
gradient_boosting_route = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)  # You can adjust hyperparameters
gradient_boosting_route.fit(X_train_route, y_train_route)
y_pred_gradient_boosting_route = gradient_boosting_route.predict(X_test_route)

In [85]:
# Evaluate the models
def evaluate_model(y_true, y_pred_route, model_name):
    rmse_route = root_mean_squared_error(y_true, y_pred_route)

    print(f"{model_name} - Root Mean Squared Error: {rmse_route}")


In [86]:
evaluate_model(y_test_route, y_pred_decision_tree_route, "Decision Tree")
evaluate_model(y_test_route, y_pred_knn_route, "K-Nearest Neighbors (KNN)")
evaluate_model(y_test_route, y_pred_random_forest_route, "Random Forest")
evaluate_model(y_test_route, y_pred_gradient_boosting_route, "Gradient Boosting")

Decision Tree - Root Mean Squared Error: 0.41976413532449586
K-Nearest Neighbors (KNN) - Root Mean Squared Error: 0.4319874805368724
Random Forest - Root Mean Squared Error: 0.43185385159437195
Gradient Boosting - Root Mean Squared Error: 0.42560099481607505


**Performing cross validation**

In [87]:
models_route = []
models_route.append(('KNN', KNeighborsRegressor()))
models_route.append(('RFC', RandomForestRegressor(n_estimators=100)))
models_route.append(('DTR', DecisionTreeRegressor()))
models_route.append(('XGB', XGBRegressor()))

In [88]:
# Define the models to evaluate
models_route = [
    ("Decision Tree", DecisionTreeRegressor()),
    ("K-Nearest Neighbors (KNN)", KNeighborsRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor())
]

# Define the number of folds for k-fold cross-validation
num_folds = 5

# Initialize empty lists to store results and names
results_route = []
names_route = []

# Iterate through the models
for name, model in models_route:
    # Create a KFold object
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state = 42)

    # Perform cross-validation using a regression metric (e.g., 'neg_mean_squared_error')
    cv_results_route = cross_val_score(model, X_train_route, y_train_route, cv=kfold, scoring='neg_root_mean_squared_error')

    cv_results_route = np.abs(cv_results_route)

    # Store the results
    results_route.append(cv_results_route)
    names_route.append(name)

    # Print the mean and standard deviation of the scores
    print(f"{name}: {cv_results_route.mean():.4f} ({cv_results_route.std():.4f})")

Decision Tree: 0.4857 (0.0364)
K-Nearest Neighbors (KNN): 0.3974 (0.0391)
Random Forest: 0.3881 (0.0238)
Gradient Boosting: 0.3773 (0.0224)


Hyperparameter tuning on the models





In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Define a dictionary of models and their respective parameter grids
models = {
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'knn__n_neighbors': [3, 5, 7, 9, 11],
            'knn__weights': ['uniform', 'distance'],
            'knn__metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(),
        'params': {
            'randomforest__n_estimators': [100, 150, 200],
            'randomforest__max_depth': [None, 10, 20],
            'randomforest__min_samples_split': [2, 5],
            'randomforest__min_samples_leaf': [1, 2],
            'randomforest__max_features': ['sqrt', 'log2'],
            'randomforest__bootstrap': [True, False],
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'gradientboosting__n_estimators': [100, 150, 200],
            'gradientboosting__learning_rate': [0.1, 0.05, 0.01],
            'gradientboosting__max_depth': [3, 5, 7],
            'gradientboosting__min_samples_split': [2, 5],
            'gradientboosting__min_samples_leaf': [1, 2],
            'gradientboosting__max_features': ['sqrt', 'log2']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'decisiontree__max_depth': [None, 10, 20, 30, 40],
            'decisiontree__min_samples_split': [2, 5, 10],
            'decisiontree__min_samples_leaf': [1, 2, 4],
            'decisiontree__max_features': ['auto', 'sqrt', 'log2'],
            'decisiontree__splitter': ['best', 'random']
        }
    }
}

# Store results for each model
best_models_route = {}
results_route = {}

# Iterate over each model and perform GridSearchCV
for model_name, config in models.items():
    # Set up the pipeline with a scaler and the model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        (model_name.lower(), config['model'])  # Add model to pipeline with a lowercase name
    ])

    # Initialize GridSearchCV with model's parameter grid
    grid_search = GridSearchCV(
        pipeline,
        config['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )


    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train_route, y_train_route)  # Added fitting step

    # Store the best model and its performance metrics
    best_models_route[model_name] = grid_search.best_estimator_
    best_params_route = grid_search.best_params_
    best_rmse_route = np.sqrt(-grid_search.best_score_)  # Convert negative MSE to RMSE

    # Store results for easy access and printing
    results[model_name] = {
        'best_params_route': best_params_route,
        'best_rmse_route': best_rmse_route
    }


# Make predictions# Find the best model name based on lowest RMSE
best_model_name_route = min(results, key=lambda k: results[k]['best_rmse_route'])

# Make predictions using the best model
y_pred_route = best_models_route[best_model_name_route].predict(X_test_route)

# Print the results for each model
for model_name, result in results.items():
    print(f"{model_name} Best RMSE: {result['best_rmse_route']:.4f}")

  _data = np.array(data, dtype=dtype, copy=copy,


KNN Best RMSE: 0.3729
RandomForest Best RMSE: 0.3660
GradientBoosting Best RMSE: 0.3598
DecisionTree Best RMSE: 0.3868


450 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
158 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  Fil

Gradient Boosting has the best performance at the cross validation and hyperparameter tuning.

In [90]:

import pandas as pd
#downloading th prediction of the donation bags using random forest to csv
# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(y_pred_random_forest_route)

# Save the DataFrame to a CSV file
df.to_csv('Predicted completion time 2024.csv', index=False)

In [91]:

df = pd.DataFrame(y_test_route)

# Save the DataFrame to a CSV file
df.to_csv('y_test_route', index=False)

### Developing a model that predicts the route completion time based on specific ward information



Train on 2023

Test on 2024

Drop Londonderry ward and use the model to predict the route completion time

In [213]:
# make a copy of the original dataset
data_2023_time = data_2023_filtered.copy()
data_2024_time = data_2024_filtered.copy()

In [214]:
#make a copy of this
data_2023_time_copy = data_2023_time.copy()
data_2024_time_copy = data_2024_time.copy()

In [215]:
# Output only 'time_spent_collecting_donations' for rows where 'ward' is 'Londonderry Ward'
londonderry_donations_time = data_2024_time_copy.loc[data_2024_time_copy['ward'] == 'Londonderry Ward', 'time_spent_collecting_donations']
print(londonderry_donations_time)

7       30 - 60 Minutes
12       0 - 30 Minutes
13       0 - 30 Minutes
31       0 - 30 Minutes
32     90 - 120 Minutes
33       0 - 30 Minutes
38      30 - 60 Minutes
41      30 - 60 Minutes
65      30 - 60 Minutes
67      60 - 90 Minutes
82      30 - 60 Minutes
109     30 - 60 Minutes
111     30 - 60 Minutes
114     30 - 60 Minutes
117     30 - 60 Minutes
126     30 - 60 Minutes
129     30 - 60 Minutes
139     30 - 60 Minutes
144      0 - 30 Minutes
145     30 - 60 Minutes
151     30 - 60 Minutes
159     30 - 60 Minutes
200      0 - 30 Minutes
236     30 - 60 Minutes
237     60 - 90 Minutes
242     60 - 90 Minutes
252     30 - 60 Minutes
293      0 - 30 Minutes
340     30 - 60 Minutes
358     30 - 60 Minutes
368    90 - 120 Minutes
371      0 - 30 Minutes
379      0 - 30 Minutes
414     60 - 90 Minutes
Name: time_spent_collecting_donations, dtype: object


In [216]:
#reset index
londonderry_donations_time.reset_index(drop=True, inplace=True)

In [217]:
londonderry_donations_time

Unnamed: 0,time_spent_collecting_donations
0,30 - 60 Minutes
1,0 - 30 Minutes
2,0 - 30 Minutes
3,0 - 30 Minutes
4,90 - 120 Minutes
5,0 - 30 Minutes
6,30 - 60 Minutes
7,30 - 60 Minutes
8,30 - 60 Minutes
9,60 - 90 Minutes


In [218]:
# label encode
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
londonderry_donations_time = label_encoder.fit_transform(londonderry_donations_time)

In [219]:
londonderry_donations_time

array([1, 0, 0, 0, 3, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 2, 2, 1, 0, 1, 1, 3, 0, 0, 2])

In [220]:
#log transformation

log_transformed_londonderry = np.log1p(londonderry_donations_time)

# Display the result
print(log_transformed_londonderry)

[0.69314718 0.         0.         0.         1.38629436 0.
 0.69314718 0.69314718 0.69314718 1.09861229 0.69314718 0.69314718
 0.69314718 0.69314718 0.69314718 0.69314718 0.69314718 0.69314718
 0.         0.69314718 0.69314718 0.69314718 0.         0.69314718
 1.09861229 1.09861229 0.69314718 0.         0.69314718 0.69314718
 1.38629436 0.         0.         1.09861229]


In [None]:
#save and download log_transformed_londonderry
import pandas as pd
df = pd.DataFrame(log_transformed_londonderry)
df.to_csv('Londonderry actual completion time.csv', index=False)

In [221]:
data_2024_time['ward'].unique()

array(['Clareview Ward', 'Lee Ridge Ward', 'Forest Heights Ward',
       'Silver Berry Ward', 'Crawford Plains Ward', 'Londonderry Ward',
       'Woodbend Ward', 'Blackmud Creek Ward', 'Connors Hill Ward',
       'Griesbach Ward', 'Rutherford Ward', 'Rabbit Hill Ward',
       'Namao Ward', 'Ellerslie Ward', 'Greenfield Ward',
       'Southgate Ward', 'Terwillegar Park Ward', 'Wild Rose Ward',
       'Rio Vista Ward', 'Beaumont Ward', 'Wainwright Branch'],
      dtype=object)

In [222]:
test_ward = data_2024_time[data_2024_time['ward'] == 'Londonderry Ward']

In [224]:
#reset test ward
test_ward.reset_index(drop=True, inplace=True)

In [226]:
test_ward

Unnamed: 0,drop_off_location,stake,time_spent_collecting_donations,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_doors_in_route,#_of_donation_bags_collected,Completed More Than One Route,Number of routes completed,ward
0,Londonberry Chapel,Bonnie Doon Stake,30 - 60 Minutes,2,3,21,23,No,1,Londonderry Ward
1,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,2,0,18,18,No,1,Londonderry Ward
2,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,2,1,10,12,No,1,Londonderry Ward
3,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,1,3,7,10,No,1,Londonderry Ward
4,Londonberry Chapel,Bonnie Doon Stake,90 - 120 Minutes,2,0,40,15,No,1,Londonderry Ward
5,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,1,0,60,17,No,1,Londonderry Ward
6,Londonberry Chapel,Bonnie Doon Stake,30 - 60 Minutes,2,0,13,14,No,1,Londonderry Ward
7,Londonberry Chapel,Bonnie Doon Stake,30 - 60 Minutes,2,3,15,18,No,1,Londonderry Ward
8,Londonberry Chapel,Bonnie Doon Stake,30 - 60 Minutes,0,2,177,19,No,1,Londonderry Ward
9,Londonberry Chapel,Bonnie Doon Stake,60 - 90 Minutes,1,1,0,51,No,1,Londonderry Ward


In [227]:
#drop Londonderry Ward from dataframe
data_2024_time = data_2024_time[data_2024_time['ward'] != 'Londonderry Ward']

In [228]:
#reset index in data_2024_time
data_2024_time.reset_index(drop=True, inplace=True)

In [229]:
data_2024_time

Unnamed: 0,drop_off_location,stake,time_spent_collecting_donations,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_doors_in_route,#_of_donation_bags_collected,Completed More Than One Route,Number of routes completed,ward
0,Londonberry Chapel,Bonnie Doon Stake,0 - 30 Minutes,1,0,1,1,No,1,Clareview Ward
1,Gateway Stake Centre,Gateway Stake,0 - 30 Minutes,2,2,20,20,No,1,Lee Ridge Ward
2,Bonnie Doon Stake Centre,Bonnie Doon Stake,0 - 30 Minutes,2,2,20,15,No,1,Forest Heights Ward
3,Bearspaw Chapel,Gateway Stake,30 - 60 Minutes,2,3,144,25,Yes,2,Lee Ridge Ward
4,Gateway Stake Centre,Gateway Stake,30 - 60 Minutes,1,0,230,21,No,1,Silver Berry Ward
...,...,...,...,...,...,...,...,...,...,...
402,Riverbend Stake Centre,Riverbend Stake,30 - 60 Minutes,2,0,211,47,No,1,Greenfield Ward
403,Riverbend Stake Centre,Riverbend Stake,0 - 30 Minutes,2,1,58,6,No,1,Rabbit Hill Ward
404,Bonnie Doon Stake Centre,Bonnie Doon Stake,30 - 60 Minutes,2,0,150,23,No,1,Connors Hill Ward
405,Riverbend Stake Centre,Riverbend Stake,60 - 90 Minutes,1,0,140,33,No,1,Greenfield Ward


In [230]:
#drop Londonderry Ward from dataframe
data_2023_time = data_2023_time[data_2023_time['ward'] != 'Londonderry Ward']

In [231]:
# reset index in data_2023_time
data_2023_time.reset_index(drop=True, inplace=True)

In [232]:
data_2023_time

Unnamed: 0,drop_off_location,stake,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_donation_bags_collected,time_spent_collecting_donations,Completed More Than One Route,ward,Number of routes completed,#_of_doors_in_route
0,Londonderry Chapel,Bonnie Doon Stake,1,3,14,0 - 30 Minutes,0,Clareview Ward,2,14
1,Gateway Stake Centre,Gateway Stake,3,1,18,0 - 30 Minutes,1,Crawford Plains Ward,2,144
2,Gateway Stake Centre,Gateway Stake,2,0,20,0 - 30 Minutes,0,Silver Berry Ward,2,186
3,Gateway Stake Centre,Gateway Stake,2,0,20,0 - 30 Minutes,1,Crawford Plains Ward,1,194
4,Bearspaw Chapel,Gateway Stake,2,0,20,0 - 30 Minutes,0,Blackmud Creek Ward,2,179
...,...,...,...,...,...,...,...,...,...,...
389,Bonnie Doon Stake Centre,Bonnie Doon Stake,2,2,36,30 - 60 Minutes,1,Forest Heights Ward,2,150
390,Morinville,Edmonton North Stake,1,3,80,90 - 120 Minutes,1,Namao Ward,2,420
391,North Stake Centre,Edmonton North Stake,2,0,20,0 - 30 Minutes,0,Namao Ward,2,150
392,North Stake Centre,Edmonton North Stake,2,0,20,0 - 30 Minutes,0,Namao Ward,2,150


In [233]:
#label encode time_spent_collecting_donations
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data_2023_time['time_spent_collecting_donations'] = label_encoder.fit_transform(data_2023_time['time_spent_collecting_donations'])

In [234]:
log_transformed_2023_time = np.log1p(data_2023_time['time_spent_collecting_donations'])

# Display the result
log_transformed_2023_time

Unnamed: 0,time_spent_collecting_donations
0,0.000000
1,0.000000
2,0.000000
3,0.000000
4,0.000000
...,...
389,0.693147
390,1.386294
391,0.000000
392,0.000000


In [235]:
#insert the log_transformed_2023_time into data_2023_time for column 'time spent collecting donations" where index is true
data_2023_time['time_spent_collecting_donations'] = log_transformed_2023_time

In [236]:
data_2023_time

Unnamed: 0,drop_off_location,stake,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_donation_bags_collected,time_spent_collecting_donations,Completed More Than One Route,ward,Number of routes completed,#_of_doors_in_route
0,Londonderry Chapel,Bonnie Doon Stake,1,3,14,0.000000,0,Clareview Ward,2,14
1,Gateway Stake Centre,Gateway Stake,3,1,18,0.000000,1,Crawford Plains Ward,2,144
2,Gateway Stake Centre,Gateway Stake,2,0,20,0.000000,0,Silver Berry Ward,2,186
3,Gateway Stake Centre,Gateway Stake,2,0,20,0.000000,1,Crawford Plains Ward,1,194
4,Bearspaw Chapel,Gateway Stake,2,0,20,0.000000,0,Blackmud Creek Ward,2,179
...,...,...,...,...,...,...,...,...,...,...
389,Bonnie Doon Stake Centre,Bonnie Doon Stake,2,2,36,0.693147,1,Forest Heights Ward,2,150
390,Morinville,Edmonton North Stake,1,3,80,1.386294,1,Namao Ward,2,420
391,North Stake Centre,Edmonton North Stake,2,0,20,0.000000,0,Namao Ward,2,150
392,North Stake Centre,Edmonton North Stake,2,0,20,0.000000,0,Namao Ward,2,150


In [237]:
#label encode completion time in 2024 data
data_2024_time['time_spent_collecting_donations'] = label_encoder.transform(data_2024_time['time_spent_collecting_donations'])

In [238]:
#log transform 2024 time spent collecting donations
log_transformed_2024_time = np.log1p(data_2024_time['time_spent_collecting_donations'])

# Display the result
log_transformed_2024_time

Unnamed: 0,time_spent_collecting_donations
0,0.000000
1,0.000000
2,0.000000
3,0.693147
4,0.693147
...,...
402,0.693147
403,0.000000
404,0.693147
405,1.098612


In [239]:
#insert the log transformed 2024 time into the data_2024_tim3e column
data_2024_time['time_spent_collecting_donations'] = log_transformed_2024_time

In [240]:
data_2024_time

Unnamed: 0,drop_off_location,stake,time_spent_collecting_donations,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_doors_in_route,#_of_donation_bags_collected,Completed More Than One Route,Number of routes completed,ward
0,Londonberry Chapel,Bonnie Doon Stake,0.000000,1,0,1,1,No,1,Clareview Ward
1,Gateway Stake Centre,Gateway Stake,0.000000,2,2,20,20,No,1,Lee Ridge Ward
2,Bonnie Doon Stake Centre,Bonnie Doon Stake,0.000000,2,2,20,15,No,1,Forest Heights Ward
3,Bearspaw Chapel,Gateway Stake,0.693147,2,3,144,25,Yes,2,Lee Ridge Ward
4,Gateway Stake Centre,Gateway Stake,0.693147,1,0,230,21,No,1,Silver Berry Ward
...,...,...,...,...,...,...,...,...,...,...
402,Riverbend Stake Centre,Riverbend Stake,0.693147,2,0,211,47,No,1,Greenfield Ward
403,Riverbend Stake Centre,Riverbend Stake,0.000000,2,1,58,6,No,1,Rabbit Hill Ward
404,Bonnie Doon Stake Centre,Bonnie Doon Stake,0.693147,2,0,150,23,No,1,Connors Hill Ward
405,Riverbend Stake Centre,Riverbend Stake,1.098612,1,0,140,33,No,1,Greenfield Ward


In [241]:
#showing only information for Londonderry Ward
test_ward.columns

Index(['drop_off_location', 'stake', 'time_spent_collecting_donations',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_doors_in_route', '#_of_donation_bags_collected',
       'Completed More Than One Route', 'Number of routes completed', 'ward'],
      dtype='object')

In [244]:
data_2024_time.columns

Index(['drop_off_location', 'stake', 'time_spent_collecting_donations',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_doors_in_route', '#_of_donation_bags_collected',
       'Completed More Than One Route', 'Number of routes completed', 'ward'],
      dtype='object')

In [245]:
data_2023_time.columns

Index(['drop_off_location', 'stake',
       '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_donation_bags_collected', 'time_spent_collecting_donations',
       'Completed More Than One Route', 'ward', 'Number of routes completed',
       '#_of_doors_in_route'],
      dtype='object')

In [246]:
# Use only 2023 as the train model
#features column
X_2023_time = data_2023_time[['stake', '#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', '#_of_donation_bags_collected', 'Completed More Than One Route',  'ward',  'Number of routes completed', '#_of_doors_in_route' ]]

#target column
y_2023_time = data_2023_time['time_spent_collecting_donations']

In [247]:
y_2023_time

Unnamed: 0,time_spent_collecting_donations
0,0.000000
1,0.000000
2,0.000000
3,0.000000
4,0.000000
...,...
389,0.693147
390,1.386294
391,0.000000
392,0.000000


In [248]:
# 2024 as test model

X_2024_time = data_2024_time[['stake', '#_of_adult_volunteers_who_participated_in_this_route', '#_of_youth_volunteers_who_participated_in_this_route', '#_of_donation_bags_collected', 'Completed More Than One Route',  'ward',  'Number of routes completed', '#_of_doors_in_route' ]]

y_2024_time = data_2024_time['time_spent_collecting_donations']

In [249]:
y_2024_time

Unnamed: 0,time_spent_collecting_donations
0,0.000000
1,0.000000
2,0.000000
3,0.693147
4,0.693147
...,...
402,0.693147
403,0.000000
404,0.693147
405,1.098612


In [None]:

import pandas as pd
#downloading the time_spent_collecting_donations in Londonderry ward
# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(X_2024_time)

# Save the DataFrame to a CSV file
df.to_csv('y_2024_time.csv', index=False)

In [250]:
X_train_time = X_2023_time
y_train_time = y_2023_time
X_test_time = X_2024_time
y_test_time = y_2024_time

In [None]:
import pandas as pd
#downloading the time_spent_collecting_donations in Londonderry ward
# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(y_test_time)

# Save the DataFrame to a CSV file
df.to_csv('Londonderry 2024 Actual.csv', index=False)


In [251]:

X_train_time.head()

Unnamed: 0,stake,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_donation_bags_collected,Completed More Than One Route,ward,Number of routes completed,#_of_doors_in_route
0,Bonnie Doon Stake,1,3,14,0,Clareview Ward,2,14
1,Gateway Stake,3,1,18,1,Crawford Plains Ward,2,144
2,Gateway Stake,2,0,20,0,Silver Berry Ward,2,186
3,Gateway Stake,2,0,20,1,Crawford Plains Ward,1,194
4,Gateway Stake,2,0,20,0,Blackmud Creek Ward,2,179


In [253]:
# Encode categorical variables
le = LabelEncoder()
X_train_time['stake'] = le.fit_transform(X_train_time['stake'])
X_train_time['ward'] = le.fit_transform(X_train_time['ward'])
X_train_time['Completed More Than One Route'] = le.fit_transform(X_train_time['Completed More Than One Route'])
X_test_time['stake'] = le.fit_transform(X_test_time['stake'])
X_test_time['ward'] = le.fit_transform(X_test_time['ward'])
X_test_time['Completed More Than One Route'] = le.fit_transform(X_test_time['Completed More Than One Route'])

y_train_time = le.fit_transform(y_train_time)
y_test_time = le.fit_transform(y_test_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_time['stake'] = le.fit_transform(X_train_time['stake'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_time['ward'] = le.fit_transform(X_train_time['ward'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_time['Completed More Than One Route'] = le.fit_transform(X_train_time['

In [None]:
X_train_time.head()

Unnamed: 0,stake,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_donation_bags_collected,Completed More Than One Route,ward,Number of routes completed,#_of_doors_in_route
0,0,1,3,14,0,2,2,14
1,2,3,1,18,1,4,2,144
2,2,2,0,20,0,14,2,186
3,2,2,0,20,1,4,1,194
5,2,2,0,20,0,1,2,179


In [None]:
X_test_time.head()

Unnamed: 0,stake,#_of_adult_volunteers_who_participated_in_this_route,#_of_youth_volunteers_who_participated_in_this_route,#_of_donation_bags_collected,Completed More Than One Route,ward,Number of routes completed,#_of_doors_in_route
0,0,1,0,1,0,2,1,1
1,2,2,2,20,0,9,1,20
2,0,2,2,15,0,6,1,20
3,2,2,3,25,1,9,2,144
4,2,1,0,21,0,14,1,230


In [None]:
y_test_time

array([0, 0, 0, 1, 1, 0, 1, 2, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 2, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 2, 0, 0, 2,
       2, 1, 1, 0, 2, 1, 1, 1, 0, 1, 0, 0, 2, 2, 0, 1, 1, 0, 1, 0, 3, 0,
       1, 0, 1, 2, 0, 2, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 2,
       2, 1, 1, 1, 3, 0, 1, 2, 1, 2, 1, 1, 1, 2, 0, 0, 1, 2, 1, 1, 1, 2,
       1, 2, 0, 1, 1, 3, 2, 0, 1, 2, 2, 1, 1, 2, 1, 1, 1, 0, 1, 1, 2, 0,
       2, 0, 1, 0, 2, 1, 0, 0, 3, 2, 1, 2, 2, 1, 2, 1, 1, 0, 2, 0, 0, 3,
       2, 2, 2, 2, 3, 2, 2, 1, 1, 1, 2, 0, 1, 0, 1, 3, 2, 2, 1, 2, 0, 0,
       0, 1, 1, 1, 0, 1, 2, 0, 1, 2, 2, 1, 0, 2, 1, 1, 3, 2, 3, 1, 2, 0,
       2, 1, 2, 2, 0, 1, 0, 2, 1, 0, 3, 1, 2, 2, 2, 2, 2, 1, 3, 1, 0, 3,
       3, 2, 0, 1, 2, 1, 2, 2, 1, 2, 0, 1, 0, 1, 2, 2, 1, 2, 3, 1, 2, 1,
       2, 2, 2, 2, 1, 2, 1, 3, 1, 0, 2, 2, 2, 1, 0, 2, 1, 3, 1, 0, 3, 1,
       1, 3, 1, 2, 3, 2, 1, 2, 2, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 2, 3, 1,
       3, 2, 2, 2, 0, 3, 0, 2, 1, 1, 1, 0, 3, 1, 0,

In [254]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Ensure X_train_time and X_test_time have the same columns and order
X_test_time = X_test_time[X_train_time.columns] # Reorder columns in X_test_time

# Scale the features
X_train_time_scaled = scaler.fit_transform(X_train_time)
X_test_time_scaled = scaler.transform(X_test_time)

In [255]:
#Instantiate the model
# Initialize LogisticRegression with increased max_iter
model_time = LogisticRegression(max_iter=1000) # Increased max_iter to 1000

# Fit the model using the scaled data
model_time.fit(X_train_time_scaled, y_train_time)

In [256]:
#model training and prediction
y_pred_time = model_time.predict(X_test_time_scaled)


In [257]:
# Evaluate the model
from sklearn.metrics import root_mean_squared_error  # Import the function

rmse_time = root_mean_squared_error(y_test_time, y_pred_time)
print(f"Root Mean Squared Error: {rmse_time}")

Root Mean Squared Error: 0.9233525072147974


Try other models

In [258]:

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Implement Decision Tree Regression
decision_tree_time = DecisionTreeRegressor(max_depth=5)
decision_tree_time.fit(X_train_time_scaled, y_train_time)
y_pred_decision_tree_time = decision_tree_time.predict(X_test_time_scaled)

# Implement K-Nearest Neighbors (KNN) Regression
knn_time = KNeighborsRegressor(n_neighbors=5)
knn_time.fit(X_train_time_scaled, y_train_time)
y_pred_knn_time = knn_time.predict(X_test_time_scaled)

# Implement Random Forest Regression
random_forest_time = RandomForestRegressor(n_estimators=100)
random_forest_time.fit(X_train_time_scaled, y_train_time)
y_pred_random_forest_time = random_forest_time.predict(X_test_time_scaled)

# Implement Gradient Boosting Regression
gradient_boosting_time = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
gradient_boosting_time.fit(X_train_time_scaled, y_train_time)
y_pred_gradient_boosting_time = gradient_boosting_time.predict(X_test_time_scaled)

In [259]:
# Evaluate the models

def evaluate_model(y_true, y_pred_time, model_name_time):
    rmse_time = root_mean_squared_error(y_true, y_pred_time)

    print(f"{model_name_time} - Root Mean Squared Error: {rmse_time}")

In [260]:
#evaluate models
evaluate_model(y_test_time, y_pred_decision_tree_time, "Decision Tree")
evaluate_model(y_test_time, y_pred_knn_time, "K-Nearest Neighbors (KNN)")
evaluate_model(y_test_time, y_pred_random_forest_time, "Random Forest")
evaluate_model(y_test_time, y_pred_gradient_boosting_time, "Gradient Boosting")



Decision Tree - Root Mean Squared Error: 0.9061505239162914
K-Nearest Neighbors (KNN) - Root Mean Squared Error: 0.877958319517966
Random Forest - Root Mean Squared Error: 0.8493052515552993
Gradient Boosting - Root Mean Squared Error: 0.8610738840083917


Performing Cross Validation

In [261]:
# Define the models to evaluate
models_time = [
    ("LogisticRegression(max_iter=1000)", LogisticRegression(max_iter=1000)),  # Changed this line
    ("Decision Tree", DecisionTreeRegressor()),
    ("K-Nearest Neighbors (KNN)", KNeighborsRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor())
]

# Define the number of folds for k-fold cross-validation
num_folds = 5

# Initialize empty lists to store results and names
results_time = []
names_time = []

# Iterate through the models
for name, model in models_time:
    # Create a KFold object
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state = 42)

    # Perform cross-validation using a regression metric (e.g., 'neg_mean_squared_error')
    cv_results_time = cross_val_score(model, X_train_time_scaled, y_train_time, cv=kfold, scoring='neg_root_mean_squared_error')

    cv_results_time = np.abs(cv_results_time)

    # Store the results
    results_time.append(cv_results_time)
    names_time.append(name)

    # Print the mean and standard deviation of the scores
    print(f"{name}: {cv_results_time.mean():.4f} ({cv_results_time.std():.4f})")

LogisticRegression(max_iter=1000): 0.7474 (0.0860)
Decision Tree: 0.9627 (0.0863)
K-Nearest Neighbors (KNN): 0.7287 (0.0572)
Random Forest: 0.6942 (0.0730)
Gradient Boosting: 0.7077 (0.0803)


Hyperparameter tuning on all the models

In [262]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Define a dictionary of models and their respective parameter grids
models = {
    'KNN': {
        'model': KNeighborsRegressor(),
        'params': {
            'knn__n_neighbors': [3, 5, 7, 9, 11],
            'knn__weights': ['uniform', 'distance'],
            'knn__metric': ['euclidean', 'manhattan']
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(),
        'params': {
            'randomforest__n_estimators': [100, 150, 200],
            'randomforest__max_depth': [None, 10, 20],
            'randomforest__min_samples_split': [2, 5],
            'randomforest__min_samples_leaf': [1, 2],
            'randomforest__max_features': ['sqrt', 'log2'],
            'randomforest__bootstrap': [True, False],
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'gradientboosting__n_estimators': [100, 150, 200],
            'gradientboosting__learning_rate': [0.1, 0.05, 0.01],
            'gradientboosting__max_depth': [3, 5, 7],
            'gradientboosting__min_samples_split': [2, 5],
            'gradientboosting__min_samples_leaf': [1, 2],
            'gradientboosting__max_features': ['sqrt', 'log2']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'decisiontree__max_depth': [None, 10, 20, 30, 40],
            'decisiontree__min_samples_split': [2, 5, 10],
            'decisiontree__min_samples_leaf': [1, 2, 4],
            'decisiontree__max_features': ['auto', 'sqrt', 'log2'],
            'decisiontree__splitter': ['best', 'random']
        }
    }
}

# Store results for each model
best_models_time = {}
results_time = {}

# Iterate over each model and perform GridSearchCV
for model_name, config in models.items():
    # Set up the pipeline with a scaler and the model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        (model_name.lower(), config['model'])  # Add model to pipeline with a lowercase name
    ])

    # Initialize GridSearchCV with model's parameter grid
    grid_search = GridSearchCV(
        pipeline,
        config['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )


    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train_time_scaled, y_train_time)  # Added fitting step

    # Store the best model and its performance metrics
    best_models_time[model_name] = grid_search.best_estimator_
    best_params_time = grid_search.best_params_
    best_rmse_time = np.sqrt(-grid_search.best_score_)  # Convert negative MSE to RMSE

    # Store results for easy access and printing
    results[model_name] = {
        'best_params_time': best_params_time,
        'best_rmse_time': best_rmse_time
    }


# Make predictions# Find the best model name based on lowest RMSE
best_model_name_time = min(results, key=lambda k: results[k]['best_rmse_time'])

# Make predictions using the best model
y_pred_time = best_models_time[best_model_name_time].predict(X_test_time)

# Print the results for each model
for model_name, result in results.items():
    print(f"{model_name} Best RMSE: {result['best_rmse_time']:.4f}")

  _data = np.array(data, dtype=dtype, copy=copy,


KNN Best RMSE: 0.7191
RandomForest Best RMSE: 0.6863
GradientBoosting Best RMSE: 0.6778
DecisionTree Best RMSE: 0.7427


450 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  Fil

In [268]:
test_ward.columns

Index(['stake', '#_of_adult_volunteers_who_participated_in_this_route',
       '#_of_youth_volunteers_who_participated_in_this_route',
       '#_of_donation_bags_collected', 'Completed More Than One Route', 'ward',
       'Number of routes completed', '#_of_doors_in_route',
       'time_spent_collecting_donations'],
      dtype='object')

In [264]:
test_ward.isnull().sum()

Unnamed: 0,0
drop_off_location,0
stake,0
time_spent_collecting_donations,0
#_of_adult_volunteers_who_participated_in_this_route,0
#_of_youth_volunteers_who_participated_in_this_route,0
#_of_doors_in_route,0
#_of_donation_bags_collected,0
Completed More Than One Route,0
Number of routes completed,0
ward,0


In [269]:

#Encode categorical features in test ward

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Fit and transform the 'time' column

test_ward['stake'] = le.fit_transform(test_ward['stake'])
test_ward['ward'] = le.fit_transform(test_ward['ward'])
test_ward['Completed More Than One Route'] = le.fit_transform(test_ward['Completed More Than One Route'])

test_ward['time_spent_collecting_donations'] = le.fit_transform(test_ward['time_spent_collecting_donations'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ward['stake'] = le.fit_transform(test_ward['stake'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ward['ward'] = le.fit_transform(test_ward['ward'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ward['Completed More Than One Route'] = le.fit_transform(test_ward['Completed More Tha

In [270]:
#log transform time spent collecting donations in test ward
log_transformed_test_ward = np.log1p(test_ward['time_spent_collecting_donations'])

# Display the result
log_transformed_test_ward

Unnamed: 0,time_spent_collecting_donations
0,0.693147
1,0.0
2,0.0
3,0.0
4,1.386294
5,0.0
6,0.693147
7,0.693147
8,0.693147
9,1.098612


In [271]:
# Standardize numerical features using the same StandardScaler used for training data
X_test_ward_scaled = scaler.transform(test_ward.drop(['time_spent_collecting_donations'], axis=1))

In [272]:
X_test_ward = test_ward.drop(['time_spent_collecting_donations'], axis=1)

In [276]:
# Make predictions on the test data
predictions = random_forest_time.predict(X_test_ward_scaled)

In [277]:
#evaluate  model performance

rmse_prediction = root_mean_squared_error(test_ward['time_spent_collecting_donations'], predictions)
print(f"Mean Squared Error: {rmse_prediction}")

Mean Squared Error: 0.7400099363402852


In [None]:
import pandas as pd
#downloading th prediction of the donation bags using random forest to csv
# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(predictions)

# Save the DataFrame to a CSV file
df.to_csv('predictions.csv', index=False)

In [278]:
predictions

array([0.58, 0.8 , 0.36, 0.34, 0.45, 0.84, 0.34, 1.13, 1.05, 1.57, 1.02,
       1.96, 1.05, 0.4 , 0.46, 0.37, 0.52, 0.81, 0.5 , 0.63, 0.71, 1.07,
       0.43, 1.48, 1.33, 1.29, 0.75, 0.78, 0.38, 0.11, 2.  , 0.51, 0.34,
       0.45])