In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the raw order records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/dummy_data.csv")

In [3]:
# Preview the first five rows to see the column layout and value formats.
df.head(n=5)

Unnamed: 0,Order Id,Time Of Order,Item Ordered,Quantity,Amount,Location,Delivery Time
0,1,9/5/2023,E,14,280,East-Legon,56
1,2,31/10/2023,B,7,140,Kaneshie,495
2,3,17/05/2023,B,1,20,Kaneshie,139
3,4,6/4/2023,A,4,80,Accra,316
4,5,19/11/2023,B,15,300,Kaneshie,293


In [4]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order Id       600 non-null    int64 
 1   Time Of Order  600 non-null    object
 2   Item Ordered   600 non-null    object
 3   Quantity       600 non-null    int64 
 4   Amount         600 non-null    int64 
 5   Location       600 non-null    object
 6   Delivery Time  600 non-null    int64 
dtypes: int64(4), object(3)
memory usage: 32.9+ KB


In [5]:
# Summary statistics for every column; include="all" adds the non-numeric
# columns (unique / top / freq) next to the numeric ones (mean / std / quartiles).
df.describe(include="all")

Unnamed: 0,Order Id,Time Of Order,Item Ordered,Quantity,Amount,Location,Delivery Time
count,600.0,600,600,600.0,600.0,600,600.0
unique,,293,5,,,5,
top,,26/11/2023,D,,,Malllam,
freq,,6,134,,,130,
mean,300.5,,,10.508333,210.166667,,276.851667
std,173.349358,,,5.828369,116.567372,,136.5584
min,1.0,,,1.0,20.0,,30.0
25%,150.75,,,6.0,120.0,,155.0
50%,300.5,,,10.5,210.0,,291.0
75%,450.25,,,16.0,320.0,,386.0


In [6]:
# Count missing values per column.
df.isnull().sum()

Order Id         0
Time Of Order    0
Item Ordered     0
Quantity         0
Amount           0
Location         0
Delivery Time    0
dtype: int64

In [7]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

0

In [8]:
# Show the column labels of the loaded frame.
df.columns

Index(['Order Id', 'Time Of Order', 'Item Ordered', 'Quantity', 'Amount',
       'Location', 'Delivery Time'],
      dtype='object')

In [9]:
# Remove the 'Order Id' column before modeling.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises a KeyError.
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
df = df.drop(columns="Order Id", errors="ignore")

In [10]:
# Parse the day-first date strings (e.g. "31/10/2023") into datetimes.
df['Time Of Order'] = pd.to_datetime(df['Time Of Order'], format='%d/%m/%Y')

# Extract month and day features from 'Time Of Order'
df['Month'] = df['Time Of Order'].dt.month
df['Day'] = df['Time Of Order'].dt.day

# Encode each categorical column with its OWN LabelEncoder and keep every
# fitted encoder. Re-fitting one shared encoder would overwrite the
# 'Item Ordered' mapping, making it impossible to decode the integer codes
# or to encode new data consistently later on.
encoders = {}
for column in ['Item Ordered', 'Location']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# After the loop `le` is the encoder fitted on 'Location', the same state the
# name had when a single encoder was reused for both columns.

# Select relevant columns for modeling
# NOTE(review): in the rows shown by df.head() and in the describe() statistics,
# 'Amount' is exactly 20 x 'Quantity'. With 'Quantity' among the features the
# target is trivially recoverable, so a near-zero test error is expected and
# says little about the model - TODO confirm and reconsider the feature set.
features = ['Item Ordered', 'Quantity', "Location", 'Delivery Time', 'Month', 'Day']
target = 'Amount'
# .copy() makes X independent of df so later column assignments on the
# train/test splits do not raise chained-assignment warnings.
X = df[features].copy()
y = df[target]

In [11]:
# Summary statistics of the model features (categoricals are integer-encoded at this point).
X.describe()

Unnamed: 0,Item Ordered,Quantity,Location,Delivery Time,Month,Day
count,600.0,600.0,600.0,600.0,600.0,600.0
mean,1.95,10.508333,2.001667,276.851667,6.59,16.488333
std,1.383482,5.828369,1.447465,136.5584,3.448921,8.747613
min,0.0,1.0,0.0,30.0,1.0,1.0
25%,1.0,6.0,1.0,155.0,4.0,9.0
50%,2.0,10.5,2.0,291.0,7.0,17.0
75%,3.0,16.0,3.0,386.0,10.0,24.0
max,4.0,20.0,4.0,500.0,12.0,31.0


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies: the frames returned by train_test_split are derived
# from X, and assigning columns on them directly triggers pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale the numerical features using StandardScaler
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
numerical_columns = ['Quantity', 'Delivery Time']
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [13]:
# model = RandomForestRegressor(random_state=42)
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Calculate evaluation metrics
# mse = mean_squared_error(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)

# # Print the evaluation metrics
# print("Mean Squared Error (MSE):", mse)
# print("Mean Absolute Error (MAE):", mae)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


# Define the parameter grid for the grid search
# (3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold CV).
param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [None, 5, 10, 20],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]         # Minimum number of samples required to be at a leaf node
}

# Create the RandomForestRegressor model
# A fixed random_state makes the bootstrap sampling - and therefore the CV
# scores and the selected parameters - reproducible across runs.
rf_model = RandomForestRegressor(random_state=42)

# Perform grid search with cross-validation
# n_jobs=-1 evaluates the candidate fits in parallel on all available cores;
# it changes the run time only, not the result.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator from the grid search
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Print the best parameters
print("Best Parameters:", best_params)

# # Make predictions using the best estimator
# y_pred = best_estimator.predict(X_test)

# # Calculate the root mean squared error (RMSE)
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# print("Root Mean Squared Error (RMSE):", rmse)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [19]:
# Refit a forest with the hyperparameters reported by the grid search.
# random_state pins the bootstrap sampling so the fitted model (and the file
# it is pickled to later) is reproducible.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics
# The `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly gives
# the same root-mean-squared error on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# `mse` has always held the ROOT mean squared error in this notebook; the name
# is kept so cells that display `mse` keep working. Prefer `rmse`.
mse = rmse
mae = mean_absolute_error(y_test, y_pred)

In [20]:
# Display the test-set error. Despite its name, `mse` holds the ROOT mean
# squared error (it is computed as the non-squared variant of the metric).
mse

0.0

In [21]:
 # Display the true 'Amount' values of the held-out test rows.
 y_test

110    180
419    360
565    140
77      80
181    260
      ... 
399    160
340    220
148    340
494    140
439    340
Name: Amount, Length: 120, dtype: int64

In [22]:
# Display the model's predictions for the test rows, for comparison with y_test.
y_pred

array([180., 360., 140.,  80., 260., 180., 380., 220., 340., 340., 300.,
       120., 180., 360., 200., 180., 380.,  60., 260., 120.,  20., 360.,
       240., 300., 220., 200.,  60.,  80., 120., 360., 400., 340., 240.,
       320., 140.,  60.,  20., 180., 360., 120., 280., 320.,  20., 300.,
       180., 360., 380., 140., 160.,  40., 200., 320., 360., 380., 320.,
       220., 180.,  60., 380., 400., 220., 320., 220.,  60., 240., 360.,
       260., 400., 280., 340., 140., 240.,  80., 300., 400.,  60.,  20.,
       200., 320., 360.,  60., 340., 200., 360., 140., 280.,  20., 320.,
       300., 300.,  60., 240., 360., 280., 340., 320.,  40.,  60., 200.,
        40., 200., 320., 260., 140., 240., 220., 340., 100., 280.,  20.,
       360., 140., 220., 260., 320., 160., 220., 340., 140., 340.])

In [23]:
import pickle

# Persist the fitted random forest so it can be loaded without retraining.
with open("rf_model.pkl", "wb") as file:
    pickle.dump(rf_model, file)

# The forest was trained on standardized 'Quantity' / 'Delivery Time' values,
# so whoever loads the model needs the same fitted scaler to prepare inputs.
# It is written to a separate file; the model file itself is unchanged.
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)