In [1]:
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

In [11]:
# Load the raw panel, order it chronologically within each country, coerce
# Price to numeric (unparseable entries become NaN) and add the previous
# row's Price of the same country as a lag feature.
df = (
    pd.read_excel("ML_data/Initial_ML_data.xlsx")
    .sort_values(["country", "Date"])
    .reset_index(drop=True)
    .assign(Price=lambda frame: pd.to_numeric(frame["Price"], errors="coerce"))
    .assign(Price_lag1=lambda frame: frame.groupby("country")["Price"].shift(1))
)

# Every column whose name starts with "tone_" is treated as a tone feature.
tone_cols = [name for name in df.columns if name.startswith("tone_")]

# Columns that must be fully populated for a row to be kept.
# (Narrower alternative: only "Price", "Price_lag1" and the tone columns.)
cols_to_check = [
    "Price", "Price_lag1",
    "1w", "1m", "3m", "6m", "12m",
    "Equity_value",
    *tone_cols,
]

# Discard rows with a NaN in any required column, then renumber the index.
# The first row of each country always goes, since its lag is undefined.
df = df.dropna(subset=cols_to_check).reset_index(drop=True)



In [13]:

# ==========================
# Chronological 80/20 split, done separately for every country
# ==========================
train_list, test_list = [], []

for _, country_rows in df.groupby("country"):
    ordered = country_rows.sort_values("Date")
    # First 80% of a country's rows (by Date) train, the remainder tests.
    cutoff = int(0.8 * len(ordered))
    train_list.append(ordered.iloc[:cutoff])
    test_list.append(ordered.iloc[cutoff:])

# Stack the per-country pieces; the row labels inherited from `df` are kept.
train, test = pd.concat(train_list), pd.concat(test_list)

# ==========================
# Feature matrix and target
# ==========================

# Lagged price, the horizon columns, equity value and every tone column.
# (Narrower alternative: X_cols = ["Price_lag1"] + tone_cols)
X_cols = ["Price_lag1", "1w", "1m", "3m", "6m", "12m", "Equity_value", *tone_cols]

X_train, y_train = train[X_cols], train["Price"]
X_test, y_test = test[X_cols], test["Price"]

# ==========================
# Fit the ElasticNet regressor
# ==========================
# NOTE(review): the features are passed in unscaled. The ElasticNet penalty
# acts on raw coefficient magnitudes, so columns on very different scales are
# shrunk unevenly -- consider standardising before fitting.
# NOTE(review): random_state presumably has no effect with the default
# coordinate selection -- confirm against the ElasticNet documentation.
model = ElasticNet(alpha=1, l1_ratio=0.1, random_state=42).fit(X_train, y_train)

# ==========================
# Evaluate on the held-out rows
# ==========================
y_pred = model.predict(X_test)

# Metrics
# RMSE is taken as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it makes this cell fail on current releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# NOTE(review): the target is the price level and Price_lag1 is a feature, so
# a high R² may mostly reflect persistence -- consider reporting a naive
# "prediction = Price_lag1" baseline next to these numbers.
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")

# Tabulate the fitted coefficient of every input column, largest value first.
# NOTE(review): the ordering is by signed value, so strongly negative
# coefficients land at the bottom, and the table covers all features rather
# than only the tone_ columns -- confirm this matches the heading printed below.
coef_df = pd.DataFrame({"Feature": X_train.columns, "Coefficient": model.coef_})
coef_df = coef_df.sort_values("Coefficient", ascending=False)

print("\nTop 10 most influential tone features:")
print(coef_df.iloc[:10])

RMSE: 0.8945
MAE:  0.5382
R²:   0.9962

Top 10 most influential tone features:
                    Feature  Coefficient
0                Price_lag1     0.992376
21          tone_4_finacial     0.007645
16          tone_3_finacial     0.004798
6              Equity_value     0.000010
2                        1m     0.000000
13         tone_3_Political     0.000000
20   tone_4_Economic_policy     0.000000
19  tone_4_Military_cryssis     0.000000
18         tone_4_Political     0.000000
17  tone_4_Economic_general     0.000000




593     32.96
594     32.47
595     32.47
596     32.46
597     31.97
        ...  
2878    25.22
2879    24.65
2880    25.16
2881    24.36
2882    24.19
Name: Price, Length: 578, dtype: float64