In [None]:
import pandas as pd 
import pandas_datareader as pdr 
import numpy as np  
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from datetime import datetime
import plotly.express as px
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt

In [None]:
START_DATE="1990-01-01"
END_DATE=datetime.today().strftime("%Y-%m-%d")
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
INDICATORS = {
    
    'CPI': 'CPIAUCSL',
    'Core_CPI': 'CPILFESL',
    'PPI': 'PPIACO',
    'PCE': 'PCE',
    'Core_PCE': 'PCEPILFE',
    'GDP_Deflator': 'GDPDEF',  
    
    'Unemployment_Rate': 'UNRATE',  
    'Employment_Pop_Ratio': 'EMRATIO',  
    'Nonfarm_Payrolls': 'PAYEMS',  
    'Initial_Claims': 'IC4WSA',  
    'JOLTS_Job_Openings': 'JTSJOL',  
    'Average_Hourly_Earnings': 'CES0500000003', 
    
   
    'Housing_Starts': 'HOUST',  
    'Building_Permits': 'PERMIT',  
    'Existing_Home_Sales': 'EXHOSLUSM495S',  
    'New_Home_Sales': 'HSN1F', 
    'Case_Shiller_Home_Price': 'CSUSHPISA', 
    'Mortgage_Rate_30Y': 'MORTGAGE30US',  
    
    
    'Industrial_Production': 'INDPRO', 
    'Capacity_Utilization': 'TCU',  
    'Durable_Goods_Orders': 'DGORDER',  
    'Factory_Orders': 'NEWORDER',  
    'ISM_Manufacturing': 'NAPM',  
    'Retail_Sales': 'RSAFS',  
    
    
    'Fed_Funds_Rate': 'FEDFUNDS',  
    '10Y_Treasury_Rate': 'DGS10',  
    'Yield_Curve': 'T10Y2Y',  
    'S&P500': 'SP500',
    'VIX': 'VIXCLS',
    'Dollar_Index': 'DTWEXBGS',
    
    'GDP': 'GDP',
    'GDP_Growth': 'GDPC1',
    'Personal_Income': 'PI',
    'Personal_Spending': 'PCE',
    'Business_Inventories': 'BUSINV',
    'Corporate_Profits': 'CP',
    
    'Trade_Balance': 'BOPGSTB',
    'Current_Account': 'NETFI',
    'Export_Price_Index': 'IQ',
    'Import_Price_Index': 'IR',
    
    'Consumer_Confidence': 'CSCICP03USM665S',
    'Michigan_Consumer_Sentiment': 'UMCSENT',
    'NFIB_Small_Business_Optimism': 'NFIB',
    'Leading_Economic_Index': 'USSLIND',
    
    'Total_Credit': 'TOTALSL',
    'Consumer_Credit': 'TOTALSL',
    'Delinquency_Rate': 'DRSFRMACBS',
    
    'Government_Revenue': 'FGRECPT',
    'Government_Spending': 'FGEXPND',
    'Federal_Debt': 'GFDEBTN',
    'Budget_Balance': 'MTSDS133FMS'
}


In [None]:
def donwload_data(indicators, start_date, end_date):
    data = pd.DataFrame()
    for name, code in indicators.items():
        try:
            
            series = pdr.DataReader(code, 'fred', start_date, end_date)
            series.rename(columns={series.columns[0]: name}, inplace=True)
            data = pd.concat([data, series], axis=1)
        except Exception as e:
            logger.error(f"Error downloading {name} ({code}): {e}")
    return data
def preprocess_data(data):
    data = data.fillna(method='ffill').fillna(method='bfill')
    return data
df = donwload_data(INDICATORS, START_DATE, END_DATE)
df = preprocess_data(df)   



In [95]:


def create_features(df):
    # Lag features
    df['CPI_lag1'] = df['CPI'].shift(1)
    df['CPI_lag2'] = df['CPI'].shift(2)
    
    # Rolling windows
    df['CPI_rolling_3'] = df['CPI'].rolling(window=3).mean()
    df['CPI_rolling_6'] = df['CPI'].rolling(window=6).mean()
    
    # Percent change
    df['CPI_pct_change'] = df['CPI'].pct_change()
    df['GDP_pct_change'] = df['GDP'].pct_change()
    df['Unemployment_diff'] = df['Unemployment_Rate'].diff()
    df['FedFunds_lag1'] = df['Fed_Funds_Rate'].shift(1)
    
    # Trend
    df['Trend'] = np.arange(len(df))
    
    return df

df = create_features(df)
# Features to use
features = [
    'CPI_lag1', 'CPI_lag2',
    'CPI_rolling_3', 'CPI_rolling_6',
    'CPI_pct_change',
    'Nonfarm_Payrolls',
    'Personal_Spending',
    'Unemployment_Rate', 'Unemployment_diff',
    'GDP', 'GDP_pct_change',
    'Fed_Funds_Rate', 'FedFunds_lag1',
    'Industrial_Production',
    'Consumer_Confidence',
    'PCE',
    'Trend'
]


In [96]:
# Walk forward validation
df_model=df[features + ['CPI']].dropna()
X=df_model[features]
y=df_model['CPI']
initial_train_size = 120  # 10 years of monthly data
step_size = 12            # 1 year step
n_splits = (len(X) - initial_train_size)// step_size
all_predictions=[]
all_actuals=[]
dates=[]
for i in range(n_splits + 1):
    # Define training and test sets
    train_end = initial_train_size + i * step_size
    test_end = train_end + step_size
    
    # Ensure we don't go beyond the data length
    if test_end > len(X):
        test_end = len(X)
    
    X_train, X_test = X.iloc[:train_end], X.iloc[train_end:test_end]
    y_train, y_test = y.iloc[:train_end], y.iloc[train_end:test_end]
    
    # Skip if test set is empty
    if len(X_test) == 0:
        continue
    
    # Train model
    model = XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Store results
    all_predictions.extend(y_pred)
    all_actuals.extend(y_test)
    dates.extend(X_test.index)
    
    # Print progress
    print(f"Fold {i+1}/{n_splits} - Train: {X_train.index[0].date()} to {X_train.index[-1].date()}, "
          f"Test: {X_test.index[0].date()} to {X_test.index[-1].date()}")
    
    # Calculate and print metrics for this fold
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f"  MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}, MAPE: {mape:.4f}")

# Calculate overall metrics
overall_mae = mean_absolute_error(all_actuals, all_predictions)
overall_rmse = np.sqrt(mean_squared_error(all_actuals, all_predictions))
overall_r2 = r2_score(all_actuals, all_predictions)
overall_mape = mean_absolute_percentage_error(all_actuals, all_predictions)

print("\n" + "="*50)
print("OVERALL MODEL ACCURACY SCORES")
print("="*50)
print(f"Mean Absolute Error (MAE): {overall_mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {overall_rmse:.4f}")
print(f"R² Score: {overall_r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {overall_mape:.4f}")
print(f"Accuracy Percentage: {(1 - overall_mape) * 100:.2f}%")
print("="*50)

# Additional accuracy analysis
print("\nACCURACY ANALYSIS:")
print(f"- Average CPI value: {np.mean(all_actuals):.2f}")
print(f"- Average prediction error: {overall_mae:.2f}")
print(f"- Error as % of average CPI: {(overall_mae / np.mean(all_actuals)) * 100:.2f}%")
print(f"- Model explains {overall_r2 * 100:.1f}% of variance in CPI")

# Plot predictions vs actuals
plt.figure(figsize=(15, 6))
plt.plot(dates, all_actuals, label='Actual CPI', color='blue')
plt.plot(dates, all_predictions, label='Predicted CPI', color='red', linestyle='--')
plt.title('Walk-Forward Validation: Actual vs Predicted CPI')
plt.xlabel('Date')
plt.ylabel('CPI')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Feature importance
final_model = XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=42)
final_model.fit(X, y)  # Train on full dataset for feature importance

# Plot feature importance
feature_importance = pd.Series(final_model.feature_importances_, index=features).sort_values(ascending=False)
plt.figure(figsize=(12, 6))
feature_importance.plot(kind='bar')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

Fold 1/937 - Train: 1990-01-06 to 1990-05-24, Test: 1990-05-25 to 1990-06-07
  MAE: 0.4002, RMSE: 0.5659, R²: -1.0012, MAPE: 0.0031
Fold 2/937 - Train: 1990-01-06 to 1990-06-07, Test: 1990-06-08 to 1990-06-21
  MAE: 0.0005, RMSE: 0.0005, R²: -284183621677284818944.0000, MAPE: 0.0000
Fold 3/937 - Train: 1990-01-06 to 1990-06-21, Test: 1990-06-22 to 1990-07-04
  MAE: 0.2003, RMSE: 0.3466, R²: -0.5013, MAPE: 0.0015
Fold 4/937 - Train: 1990-01-06 to 1990-07-04, Test: 1990-07-05 to 1990-07-18
  MAE: 0.0007, RMSE: 0.0007, R²: 0.0000, MAPE: 0.0000
Fold 5/937 - Train: 1990-01-06 to 1990-07-18, Test: 1990-07-19 to 1990-08-01
  MAE: 0.0920, RMSE: 0.3176, R²: -0.0915, MAPE: 0.0007
Fold 6/937 - Train: 1990-01-06 to 1990-08-01, Test: 1990-08-02 to 1990-08-15
  MAE: 0.0016, RMSE: 0.0016, R²: -3093565097999027666944.0000, MAPE: 0.0000
Fold 7/937 - Train: 1990-01-06 to 1990-08-15, Test: 1990-08-16 to 1990-08-29
  MAE: 0.0003, RMSE: 0.0003, R²: -122313442419316572160.0000, MAPE: 0.0000
Fold 8/937 - Tra

Exception ignored while calling ctypes callback function <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x000002182D7BD0D0>>:
Traceback (most recent call last):
  File "c:\Users\emanu\AppData\Local\Python\pythoncore-3.14-64\Lib\site-packages\xgboost\core.py", line 630, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 


Fold 574/937 - Train: 1990-01-06 to 2012-03-29, Test: 2012-03-30 to 2012-04-11
  MAE: 0.3197, RMSE: 0.3497, R²: -5.0974, MAPE: 0.0014


XGBoostError: [23:57:14] C:\actions-runner\_work\xgboost\xgboost\src\data\quantile_dmatrix.cc:179: Check failed: accumulated_rows == info.num_row_ (7008 vs. 14016) : 