In [2]:
#importing all necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [3]:
# Step 1: Load the dataset
# Read the pricing CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell refers to as `dataset`.
dataset = pd.read_csv(filepath_or_buffer="ai_price_estimator_dataset.csv")

# Preview the first five rows under a heading to sanity-check the parsed columns.
print("First 5 Rows of Dataset:", dataset.head(n=5), sep="\n")

First 5 Rows of Dataset:
   weight_ton  distance_km delivery_type load_category truck_body_type  \
0          24       112.23        single          bulk         trailer   
1          17       349.08         multi       fragile      heavy-duty   
2          28      1984.02        single    perishable         trailer   
3          21      1696.67        single    perishable         flatbed   
4          28       680.49        single        others      heavy-duty   

  truck_size_category urgency_level  delivery_timeline_days  estimated_price  
0          heavy-duty           low                     1.0          4785.30  
1               small        medium                     0.9          7304.40  
2               small          high                     4.7         24981.53  
3          heavy-duty          high                     3.9         26210.21  
4         extra-large        medium                     1.6         12209.03  


In [4]:
# Step 2: Show basic info
print("\nDataset Info:")
# DataFrame.info() writes its report (index range, per-column non-null counts,
# dtypes, memory usage) to stdout itself and returns None. It must therefore be
# called directly: wrapping it in print() emits a spurious trailing "None".
dataset.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   weight_ton              100 non-null    int64  
 1   distance_km             100 non-null    float64
 2   delivery_type           100 non-null    object 
 3   load_category           100 non-null    object 
 4   truck_body_type         100 non-null    object 
 5   truck_size_category     100 non-null    object 
 6   urgency_level           100 non-null    object 
 7   delivery_timeline_days  100 non-null    float64
 8   estimated_price         100 non-null    float64
dtypes: float64(3), int64(1), object(5)
memory usage: 7.2+ KB
None


In [5]:
# Step 3: Check for null values
# Count the missing entries per column so gaps in the raw data are visible
# before any encoding or modelling happens.
null_counts = dataset.isnull().sum()
print("\nNull Values in Each Column:", null_counts, sep="\n")


Null Values in Each Column:
weight_ton                0
distance_km               0
delivery_type             0
load_category             0
truck_body_type           0
truck_size_category       0
urgency_level             0
delivery_timeline_days    0
estimated_price           0
dtype: int64


In [6]:
# Step 4: Encode categorical columns
# LabelEncoder is also imported in the notebook's first cell; the import is
# repeated here so this cell can be run on its own.
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that must become integer codes before model training.
# The trailing notes record the column names these replaced.
categorical_columns = [
    'delivery_type',         # instead of 'stop_type'
    'load_category',
    'truck_body_type',       # instead of 'body_type'
    'truck_size_category',   # instead of 'size_category'
    'urgency_level',
]

# One encoder per column, keyed by column name, so the label <-> code mapping
# of each column stays available after this cell has run.
label_encoders = {column: LabelEncoder() for column in categorical_columns}

# NOTE: this overwrites the columns of `dataset` in place. Re-running the cell
# without reloading the CSV would encode the integer codes again (as strings)
# and replace the original label mappings.
for column, encoder in label_encoders.items():
    as_text = dataset[column].astype(str)
    dataset[column] = encoder.fit_transform(as_text)

In [7]:
# Re-check for null values now that the categorical columns have been encoded;
# every count should still be zero.
nulls_after_encoding = dataset.isnull().sum()
print(nulls_after_encoding)

weight_ton                0
distance_km               0
delivery_type             0
load_category             0
truck_body_type           0
truck_size_category       0
urgency_level             0
delivery_timeline_days    0
estimated_price           0
dtype: int64


In [8]:
# Basic statistics (count, mean, std, min, quartiles, max) for each numeric
# column. Because the categorical columns were replaced by integer codes in the
# encoding cell, they are summarised here as well.
summary_statistics = dataset.describe()
print(summary_statistics)

       weight_ton  distance_km  delivery_type  load_category  truck_body_type  \
count  100.000000    100.00000      100.00000     100.000000       100.000000   
mean    17.330000   1008.34500        0.43000       2.690000         3.700000   
std      8.431608    577.17441        0.49757       1.721492         2.750574   
min      1.000000     96.25000        0.00000       0.000000         0.000000   
25%      9.000000    536.74750        0.00000       1.000000         1.000000   
50%     17.000000    929.95500        0.00000       3.000000         3.000000   
75%     25.000000   1479.92000        1.00000       4.000000         6.000000   
max     30.000000   1984.02000        1.00000       5.000000         8.000000   

       truck_size_category  urgency_level  delivery_timeline_days  \
count           100.000000       100.0000              100.000000   
mean              2.020000         0.9900                2.453000   
std               1.517441         0.8468                1.3354

In [9]:
# Feature and target separation:
#   x -> every column except the price (model inputs)
#   y -> the price column (value to be predicted)
target_column = "estimated_price"
y = dataset[target_column]
x = dataset.drop(columns=[target_column])

# Echo the column names so the split can be verified by eye.
print("Feature Columns:", x.columns, sep="\n")
print("Target Column:", y.name, sep="\n")

Feature Columns:
Index(['weight_ton', 'distance_km', 'delivery_type', 'load_category',
       'truck_body_type', 'truck_size_category', 'urgency_level',
       'delivery_timeline_days'],
      dtype='object')
Target Column:
estimated_price


In [10]:
# Train-test split: hold out 20% of the rows for evaluation. The fixed
# random_state makes the shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Report the (rows, columns) shape of each feature partition to confirm the
# split sizes.
train_shape, test_shape = x_train.shape, x_test.shape
print("Training set shape:", train_shape)
print("Test set shape", test_shape)

Training set shape: (80, 8)
Test set shape (20, 8)


In [12]:
# Model creation: a random forest regressor built from 100 trees. The fixed
# random_state keeps the fitted forest reproducible between runs.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [13]:
# Train the model on the training partition only; the held-out test rows are
# kept aside for evaluation.
model.fit(X=x_train, y=y_train)

In [14]:
# Step 9: Predict on training and test sets
# Predictions for both partitions are needed so that train and test error can
# be compared afterwards.
train_predictions, test_predictions = (
    model.predict(features) for features in (x_train, x_test)
)

In [15]:
# Step 10: Evaluate the model
# Mean squared error and R^2 are computed separately per partition.

# Training partition (data the model has already seen).
train_mse = mean_squared_error(y_true=y_train, y_pred=train_predictions)
train_r2 = r2_score(y_true=y_train, y_pred=train_predictions)

# Test partition (held-out data).
test_mse = mean_squared_error(y_true=y_test, y_pred=test_predictions)
test_r2 = r2_score(y_true=y_test, y_pred=test_predictions)


In [16]:
# Print the four evaluation metrics, rounded to two decimals, under a single
# heading (one metric per line).
performance_lines = (
    "\n--- MODEL PERFORMANCE ---",
    f"Train MSE: {train_mse:.2f}",
    f"Test MSE: {test_mse:.2f}",
    f"Train R2 Score: {train_r2:.2f}",
    f"Test R2 Score: {test_r2:.2f}",
)
print(*performance_lines, sep="\n")


--- MODEL PERFORMANCE ---
Train MSE: 778093.75
Test MSE: 5559953.55
Train R2 Score: 0.98
Test R2 Score: 0.86


In [17]:
# --- SAVE MODEL TO FILE (.pkl) ---
# joblib is also imported in the notebook's first cell; the import is repeated
# here so this cell can be run on its own.
import joblib

# Single definition of the artifact path, shared by the dump and the load below.
model_path = "price_estimator_model.pkl"

# Serialise the fitted random forest to disk; the returned list of written
# file names is bound to a throwaway name because it is not needed.
_written_files = joblib.dump(model, model_path)

# --- LOAD THE SAVED MODEL BACK ---
# joblib files are pickle-based, so only load artifacts from a trusted source.
loaded_model = joblib.load(model_path)