In [2]:
import pandas as pd

# Read the cleaned contracts table from the processed-data folder
CLEAN_DATA_CSV = "../data/processed/contracts_clean.csv"
df = pd.read_csv(CLEAN_DATA_CSV)

# Show the first rows as a quick sanity check of the load
df.head()


Unnamed: 0,As of Date,Fiscal Year,Region,Borrower Country / Economy,Borrower Country / Economy Code,Project ID,Project Name,Project Global Practice,Procurement Category,Procurement Method,...,Contract Description,Borrower Contract Reference Number,Supplier ID,Supplier,Supplier Country / Economy,Supplier Country / Economy Code,Supplier Contract Amount (USD),Review type,Signing Year,Signing Month
0,12/04/2025,2022,Europe and Central Asia,Kyrgyz Republic,KG,P151416,Urban Development Project,Public Admin;Energy & Extractives;Transportati...,Consultant Services,Quality And Cost-Based Selection,...,Technical Supervision of construction in pilot...,UDP PPG-QCBS-2016-4-12,316882.0,M/S. AIRES INGEGNERIA,Italy,IT,161538.89,Prior,2022,5
1,12/04/2025,2022,Europe and Central Asia,Kyrgyz Republic,KG,P151416,Urban Development Project,Public Admin;Energy & Extractives;Transportati...,Consultant Services,Quality And Cost-Based Selection,...,Technical Supervision of construction in pilot...,UDP PPG-QCBS-2016-4-12,325453.0,ALL INGEGNERIA,Italy,IT,161538.89,Prior,2022,5
2,12/04/2025,2021,"Middle East, North Africa, Afghanistan, and Pa...",Pakistan,PK,P154036,Disaster and Climate Resilience Improvement Pr...,Public Admin;Transportation;Water/Sanit/Waste,Goods,Request for Quotations,...,Fixtures and Furnishers for establishment of P...,PK-PDMA PUNJAB-2495-GO-RFQ,330376.0,FREEZ POINT,Pakistan,PK,34059.2,Post,2020,12
3,12/04/2025,2020,Eastern and Southern Africa,Eastern and Southern Africa,3E,P111556,AFR RI-East Africa Public Health Laboratory Ne...,Info & Communication;Health,Goods,Request for Bids,...,"SUPPLY OF REAGENTS, SUPPLIES CONSUMABLES AND C...",TZ-MOH-13063-GO-RFB,341173.0,YUKOS ENTERPRISES (E.A),Tanzania,TZ,59575.27,Post,2020,6
4,12/04/2025,2021,South Asia,Bangladesh,BD,P149553,Bangladesh NATP-2: National Agricultural Techn...,Agriculture,Consultant Services,Individual Consultant Selection,...,"ICT Specialist of PMU, NATP-2",SD/PMU/IC-07A,646978.0,INDIVIDUAL CONSULTANT,Unknown,Unknown,238123.91,Post,2020,11


In [3]:
# Separate the regression target from the predictor columns

target = "Supplier Contract Amount (USD)"

# y: the column to predict; X: every other column of df
y = df[target]
X = df.drop(columns=target)


In [4]:
#Encode categorical columns
#
# Each object-dtype column gets its OWN LabelEncoder. Calling fit_transform on
# one shared instance for every column overwrites its `classes_` each time, so
# only the last column's mapping would survive to be saved for prediction.

from sklearn.preprocessing import LabelEncoder

# Pick the columns from the untouched `df` (X is df minus the target, same row
# order) so that re-running this cell re-encodes the same columns instead of
# finding no object columns left in the already-encoded X.
categorical_cols = df[X.columns].select_dtypes(include=['object']).columns

encoders = {}
for col in categorical_cols:
    col_encoder = LabelEncoder()
    X[col] = col_encoder.fit_transform(df[col])
    encoders[col] = col_encoder

# `encoder` is the name the save cell persists; bind it to the complete
# {column name -> fitted LabelEncoder} mapping so no column's classes are lost.
# NOTE(review): LabelEncoder.transform raises on categories it was not fitted
# on — prediction-time inputs with unseen values need explicit handling.
encoder = encoders


In [5]:
# Train/Test Split: hold out 20% of the rows, with a fixed seed so the
# partition is the same on every run

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Model 1: Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Fit on the training split, then predict the held-out rows
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# RMSE is taken as the square root of the MSE (rather than squared=False)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

# Display the three test-set metrics as the cell output
(lr_rmse, lr_mae, lr_r2)


(np.float64(3998312.276049407), 744087.0163119598, 0.06984661848303986)

In [7]:
# Model 2: Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# Hyperparameters collected in one place; the forest is kept small and shallow
rf_params = {
    "n_estimators": 50,         # fewer trees
    "max_depth": 12,            # limit depth
    "max_features": 'sqrt',     # fewer features per split
    "min_samples_split": 10,    # avoid deep branches
    "random_state": 42,
    "n_jobs": -1,
}

rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# Same three test-set metrics as the other models
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

(rf_rmse, rf_mae, rf_r2)


(np.float64(3238621.061890663), 371919.69728492596, 0.38973084425177706)

In [8]:
# Model 3: XGBoost Regressor

from xgboost import XGBRegressor

# Hyperparameters collected in one place
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}

xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

# Same three test-set metrics as the other models
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_rmse = np.sqrt(xgb_mse)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
xgb_r2 = r2_score(y_test, y_pred_xgb)

(xgb_rmse, xgb_mae, xgb_r2)


(np.float64(3230108.9759352584), 403917.2159356293, 0.3929345762283549)

In [9]:
# Side-by-side comparison of the three models' test-set metrics,
# one row per model
metric_rows = [
    ("Linear Regression", lr_rmse, lr_mae, lr_r2),
    ("Random Forest", rf_rmse, rf_mae, rf_r2),
    ("XGBoost", xgb_rmse, xgb_mae, xgb_r2),
]

results = pd.DataFrame(metric_rows, columns=["Model", "RMSE", "MAE", "R² Score"])

results


Unnamed: 0,Model,RMSE,MAE,R² Score
0,Linear Regression,3998312.0,744087.016312,0.069847
1,Random Forest,3238621.0,371919.697285,0.389731
2,XGBoost,3230109.0,403917.215936,0.392935


In [10]:
import joblib
from pathlib import Path

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
Path("../models").mkdir(parents=True, exist_ok=True)

# Save the model chosen as "best". This is hard-coded to the random forest
# (`rf`); NOTE(review): confirm against the metric that matters in the results
# table before shipping this artifact.
joblib.dump(rf, "../models/best_model.pkl")

# Save the encoder (needed for prediction). This persists whatever object the
# encoding cell left bound to `encoder`; confirm it carries the mapping for
# every encoded column before relying on it at prediction time.
joblib.dump(encoder, "../models/label_encoder.pkl")


['../models/label_encoder.pkl']