# In [None]:
# ======================================================
# AI-DRIVEN MARKET FORECASTING FOR THV DEVICES
# (USING ACTUAL WORLD BANK COLUMN STRUCTURE)
# ======================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

import matplotlib.pyplot as plt

# ------------------------------------------------------
# 1. LOAD DATA
# ------------------------------------------------------

# Read both source tables from the working directory: the economic
# indicators first, then the education indicators.
econ, edu = (
    pd.read_csv(csv_path)
    for csv_path in (
        "economic_metadata.csv",
        "Education_indicators_raw_data.csv",
    )
)

# ------------------------------------------------------
# 2. CONVERT WIDE DATA TO LONG FORMAT
# ------------------------------------------------------

def _to_long(wide):
    """Reshape one wide indicator table into long (one row per year) form.

    Year columns are detected on *wide* itself (any column whose name
    contains "[YR"), so the two source files do not need to cover the same
    years.  The returned frame has the three identifier columns plus an
    integer ``Year`` and a numeric ``Value`` column.

    Raises:
        ValueError: if *wide* has no column containing "[YR".
    """
    id_cols = ["Country Name", "Country Code", "Series Name"]
    wide_year_cols = [col for col in wide.columns if "[YR" in col]
    if not wide_year_cols:
        raise ValueError("No year columns (containing '[YR') found in input table")

    tidy = wide.melt(
        id_vars=id_cols,
        value_vars=wide_year_cols,
        var_name="Year",
        value_name="Value",
    )

    # Keep only the first 4-digit run of the header as the year;
    # expand=False yields a Series rather than a one-column DataFrame.
    tidy["Year"] = tidy["Year"].str.extract(r"(\d{4})", expand=False).astype(int)

    # NOTE(review): World Bank exports appear to mark missing cells with a
    # text placeholder such as ".." - confirm against the CSVs. Coercing to
    # numeric turns any non-numeric cell into NaN so that the later dropna()
    # removes it and the index arithmetic works on floats.
    tidy["Value"] = pd.to_numeric(tidy["Value"], errors="coerce")
    return tidy


# Year columns of the economic table (kept as a module-level name).
year_cols = [col for col in econ.columns if "[YR" in col]

econ_long = _to_long(econ)
edu_long = _to_long(edu)

# ------------------------------------------------------
# 3. FILTER REQUIRED INDICATORS
# ------------------------------------------------------

# Select one World Bank series per indicator and give its "Value" column a
# descriptive name in the same step.
econ_series = econ_long["Series Name"]
edu_series = edu_long["Series Name"]

gdp = econ_long[
    econ_series == "GDP per capita (current US$)"
].rename(columns={"Value": "GDP_per_capita"})

urban = econ_long[
    econ_series == "Urban population (% of total population)"
].rename(columns={"Value": "Urban_population_percent"})

enroll = edu_long[
    edu_series == "School enrollment, secondary (% gross)"
].rename(columns={"Value": "School_enrollment_rate"})

out_school = edu_long[
    edu_series == "Children out of school (% of primary school age)"
].rename(columns={"Value": "Out_of_school_percent"})

# ------------------------------------------------------
# 4. MERGE ALL INDICATORS INTO PANEL DATASET
# ------------------------------------------------------

# Join the four indicator tables into one country-year panel.
#
# Each indicator frame still carries its own "Series Name" label column.
# Merging them as-is makes pandas suffix the clashing labels
# ("Series Name_x" / "Series Name_y"); by the third merge those suffixed
# names already exist on the left side, which recent pandas versions reject
# with a MergeError. The label is redundant once the value column has been
# renamed, so it is dropped before joining.
merge_keys = ["Country Name", "Country Code", "Year"]

df = gdp.drop(columns="Series Name", errors="ignore")
for indicator in (urban, enroll, out_school):
    df = df.merge(
        indicator.drop(columns="Series Name", errors="ignore"),
        on=merge_keys,
        how="inner",  # keep only country-years present in every indicator
    )

# ------------------------------------------------------
# 5. DATA CLEANING
# ------------------------------------------------------

# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

# ------------------------------------------------------
# 6. CONSTRUCT EDUCATION INDEX
# ------------------------------------------------------

# Raw index: secondary enrollment rate minus the out-of-school share.
edu_raw = df["School_enrollment_rate"] - df["Out_of_school_percent"]

# Min-max rescale so the smallest raw value maps to 0 and the largest to 1.
edu_low = edu_raw.min()
edu_high = edu_raw.max()
df["Education_Index"] = (edu_raw - edu_low) / (edu_high - edu_low)

# ------------------------------------------------------
# 7. CONSTRUCT MARKET GROWTH INDEX (MGI)
# ------------------------------------------------------

def _min_max(series):
    """Rescale *series* linearly onto [0, 1].

    A series with no spread (constant, empty, or all-NaN) cannot be rescaled;
    it is returned as all zeros instead of dividing by zero.
    """
    low = series.min()
    span = series.max() - low
    if not span > 0:
        return pd.Series(0.0, index=series.index)
    return (series - low) / span


# The weights only express relative importance if every component is on the
# same scale. GDP per capita is in raw US$ while urban population is a
# percentage, so both are min-max scaled to [0, 1] before weighting;
# otherwise GDP would swamp the other two terms.
# Education_Index is used as-is because it is already min-max scaled when it
# is constructed earlier in the script.
df["MGI"] = (
    0.40 * _min_max(df["GDP_per_capita"]) +
    0.30 * _min_max(df["Urban_population_percent"]) +
    0.30 * df["Education_Index"]
)

# Normalize MGI so the final index spans [0, 1] across the panel.
df["MGI"] = _min_max(df["MGI"])

# ------------------------------------------------------
# 8. MODEL FEATURES & TARGET
# ------------------------------------------------------

# Predictor columns (order matters: the feature-importance table pairs this
# list with the fitted model's importances positionally).
# NOTE(review): MGI is itself computed from GDP_per_capita,
# Urban_population_percent and Education_Index, so the target is a function
# of these predictors and the reported scores are likely optimistic.
features = [
    "GDP_per_capita", "Urban_population_percent", "Education_Index",
    "School_enrollment_rate", "Out_of_school_percent",
]

X = df.loc[:, features]
y = df.loc[:, "MGI"]

# ------------------------------------------------------
# 9. TRAIN–TEST SPLIT
# ------------------------------------------------------

# Hold out a quarter of the rows for evaluation; the fixed seed makes the
# random split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# ------------------------------------------------------
# 10. STANDARD SCALING
# ------------------------------------------------------

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------------------
# 11. LINEAR REGRESSION
# ------------------------------------------------------

# Fit ordinary least squares on the scaled training data and score it on the
# held-out rows.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

lr_r2 = r2_score(y_test, y_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print("LINEAR REGRESSION")
print("R²:", round(lr_r2, 3))
print("RMSE:", round(lr_rmse, 3))

# ------------------------------------------------------
# 12. RANDOM FOREST REGRESSION
# ------------------------------------------------------

# 300 depth-limited trees with a fixed seed, trained on the same scaled
# inputs as the linear model and scored on the same held-out rows.
rf = RandomForestRegressor(n_estimators=300, max_depth=8, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

rf_r2 = r2_score(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("\nRANDOM FOREST")
print("R²:", round(rf_r2, 3))
print("RMSE:", round(rf_rmse, 3))

# ------------------------------------------------------
# 13. FEATURE IMPORTANCE
# ------------------------------------------------------

# Pair each predictor name with the forest's importance score, then rank
# from most to least important.
importance_table = pd.DataFrame(
    {"Feature": features, "Importance": rf.feature_importances_}
)
feature_importance = importance_table.sort_values(by="Importance", ascending=False)

print("\nFEATURE IMPORTANCE")
print(feature_importance)

# ------------------------------------------------------
# 14. FEATURE IMPORTANCE PLOT
# ------------------------------------------------------

# Horizontal bar chart of the ranked importances; the y-axis is flipped so
# the first (highest-ranked) row is drawn at the top.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feature_importance["Feature"], feature_importance["Importance"])
ax.invert_yaxis()
ax.set_title("Feature Importance – THV Market Growth Index")
ax.set_xlabel("Importance")
plt.show()

# ------------------------------------------------------
# 15. COUNTRY-WISE MGI TRENDS
# ------------------------------------------------------

# Draw one MGI line per country on a fresh figure.
plt.figure()

# A single groupby pass replaces re-filtering the whole frame for every
# country; sort=False keeps countries in order of first appearance.
for country, subset in df.groupby("Country Name", sort=False):
    # Line plots connect points in row order, so sort chronologically to
    # avoid lines doubling back on themselves.
    subset = subset.sort_values("Year")
    plt.plot(subset["Year"], subset["MGI"], label=country)

# NOTE(review): the year range in the title is hard-coded; confirm it
# matches the years actually present in the data.
plt.title("Market Growth Index Trends (2010–2024)")
plt.xlabel("Year")
plt.ylabel("MGI")
plt.legend()
plt.show()
