Load & Clean Dataset

In [111]:
import pandas as pd

# Load CSV
raw_df = pd.read_csv('Africa_Quantity.csv')

# Drop all 'S_YYYY' columns
clean_df = raw_df[[col for col in raw_df.columns if not col.startswith('S_')]]

# Rename for clarity
clean_df.rename(columns={'Land Area': 'Country'}, inplace=True)

# Melt to long format
year_cols = [str(y) for y in range(2000, 2016)]
melt_df = pd.melt(clean_df, id_vars=['Country', 'Trade flow', 'Commodity'], value_vars=year_cols, var_name='Year', value_name='Quantity')

# Clean data types
melt_df['Year'] = melt_df['Year'].astype(int)
melt_df['Quantity'] = pd.to_numeric(melt_df['Quantity'], errors='coerce')
melt_df.dropna(subset=['Quantity'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.rename(columns={'Land Area': 'Country'}, inplace=True)


Restructure for Multi-Output

In [112]:
# Aggregate and pivot
grouped = melt_df.groupby(['Country', 'Year', 'Trade flow'])['Quantity'].sum().reset_index()
pivot_df = grouped.pivot(index=['Country', 'Year'], columns='Trade flow', values='Quantity').reset_index()
pivot_df.fillna(0, inplace=True)  # Fill missing trade values with 0

Encode & Split Data

In [113]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

le = LabelEncoder()
pivot_df['Encoded_Country'] = le.fit_transform(pivot_df['Country'])

X = pivot_df[['Encoded_Country', 'Year']]
y = pivot_df[['Import', 'Export']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Feature Scaling

In [114]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Train & Evaluate Models

In [115]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Linear Regression
lr = MultiOutputRegressor(LinearRegression())
lr.fit(X_train_scaled, y_train)

# Decision Tree
dt = MultiOutputRegressor(DecisionTreeRegressor(random_state=42))
dt.fit(X_train_scaled, y_train)

# Random Forest
rf = MultiOutputRegressor(RandomForestRegressor(random_state=42))
rf.fit(X_train_scaled, y_train)

# Evaluation
models = {'Linear Regression': lr, 'Decision Tree': dt, 'Random Forest': rf}
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name}")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


Linear Regression
MAE: 47975.509784918024
RMSE: 91302.8847525799

Decision Tree
MAE: 6913.983606557376
RMSE: 25130.424566680907

Random Forest
MAE: 6597.288333333333
RMSE: 20638.358557926207


Save Best Model

In [116]:
import joblib

# Save best performing model and scaler
# joblib.dump(rf, 'API/best_model.joblib')
joblib.dump(rf, 'best_model.joblib')
# joblib.dump(scaler, 'API/scaler.joblib')
joblib.dump(scaler, 'scaler.joblib')
# joblib.dump(le, 'API/label_encoder.joblib')
joblib.dump(le, 'label_encoder.joblib')

['label_encoder.joblib']

Define Prediction Function

In [117]:
# helper function to use in API
import numpy as np

def predict_trade_volumes(model, scaler, encoder, country: str, year: int):
    encoded_country = encoder.transform([country])[0]
    input_scaled = scaler.transform([[encoded_country, year]])
    prediction = model.predict(input_scaled)
    return prediction[0].tolist()