In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# 1. Load the data and pre-process
try:
    df = pd.read_excel('Financial_Market_Data.xlsx')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() would terminate the kernel /
    # interpreter, whereas an exception only stops this cell.
    raise FileNotFoundError(
        "Error: 'Financial_Market_Data.xlsx' not found. Please ensure the file exists and the path is correct."
    ) from err

# "Data" is used only to order the rows (presumably a date column, judging by
# the original "Sort by Date" comment - confirm against the workbook). It must
# be sorted on BEFORE it is dropped: the previous version dropped it first, so
# the later "if 'Data' in df.columns" check was always False and the
# chronological split could never run.
has_date_column = 'Data' in df.columns
if has_date_column:
    df = df.sort_values(by="Data")  # Sort by Date
    df = df.drop('Data', axis=1)    # ordering key only, not a model feature

# 2. Handle Missing Values (if any):
# Rows without a label are dropped rather than imputed: mean-filling a class
# label would create fractional classes.
df = df.dropna(subset=["Y"])
# NOTE(review): the fill values below are computed over ALL rows, so test-set
# rows influence the imputation. Kept as-is because later cells reuse the fully
# imputed `df`; consider computing fill values from training rows only.
for col in df.columns:
    if col == "Y" or not df[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        modes = df[col].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            df[col] = df[col].fillna(modes[0])

# 3. Split data into training and testing sets
X = df.drop("Y", axis=1)
y = df["Y"]
if has_date_column:
    # Chronological split: the first 80% of rows train, the last 20% test.
    train_size = int(len(df) * 0.8)
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
else:
    print("Warning: No 'Data' column found. Performing a random split. This might introduce data leakage if the data has a temporal component.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Standardise the features; scaling statistics are learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 5. Fit a random forest on the scaled training data (fixed seed for repeatability)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# 6. Score the held-out rows and print the per-class report plus overall accuracy
y_pred = rf_model.predict(X_test)
print("Random Forest Classifier:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, y_pred))

# 7. Feature importance: pair each feature column with the forest's importance score
feature_importances = rf_model.feature_importances_
print("\nFeature Importances:")
for col, importance in zip(X.columns, feature_importances):
    print(f"{col}: {importance}")

# VIX index analysis: summary statistics of VIX for Y == 1 rows, then Y == 0 rows
for label in (1, 0):
    print(df.loc[df['Y'] == label, 'VIX'].describe())


Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       177
           1       0.82      0.72      0.77        46

    accuracy                           0.91       223
   macro avg       0.88      0.84      0.86       223
weighted avg       0.91      0.91      0.91       223

Accuracy: 0.9103139013452914

Feature Importances:
XAU BGNL: 0.019938961728417786
ECSURPUS: 0.014308479440818067
BDIY: 0.018979551723660882
CRY: 0.027333723729892415
DXY: 0.014923530639707578
JPY: 0.02523483670118365
GBP: 0.014637912188822261
Cl1: 0.027539226100066136
VIX: 0.17848619604113655
USGG30YR: 0.020956570237132086
GT10: 0.01901164907333174
USGG2YR: 0.016910995830013743
USGG3M: 0.010389305055473908
US0001M: 0.017364144253895906
GTDEM30Y: 0.010336129264500747
GTDEM10Y: 0.014955640394162004
GTDEM2Y: 0.012249363765749534
EONIA: 0.01689701666608823
GTITL30YR: 0.051734965262994856
GTITL10YR: 0.053943886167432904
GTITL2YR: 0.036938358675

In [2]:
# Re-run the baseline evaluation: predict on the held-out rows and print
# the classification report followed by the overall accuracy.
y_pred = rf_model.predict(X_test)
print("Random Forest Classifier:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, y_pred))

Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       177
           1       0.82      0.72      0.77        46

    accuracy                           0.91       223
   macro avg       0.88      0.84      0.86       223
weighted avg       0.91      0.91      0.91       223

Accuracy: 0.9103139013452914


In [3]:
# Add 30-day moving-average features to the dataframe and retrain the model

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Numeric feature columns to smooth. Columns that already carry the "_MA30"
# suffix are skipped so that re-running this cell (after X has been rebuilt
# with the moving averages) does not create "_MA30_MA30" columns.
numeric_columns = X.select_dtypes(include=['number']).columns
numerical_features = numeric_columns[~numeric_columns.astype(str).str.endswith('_MA30')]

# Compute every 30-row rolling mean in one pass and attach them with a single
# concat; inserting columns one at a time in a loop fragments the frame.
moving_averages = df[numerical_features].rolling(window=30).mean().add_suffix('_MA30')
df = pd.concat(
    [df.drop(columns=moving_averages.columns, errors='ignore'), moving_averages],
    axis=1,
)

# Handle NaN values created by the moving average calculation.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill', inplace=True).
# NOTE(review): backfilling copies the first complete window's mean into the
# first 29 rows, i.e. those rows see later values; rolling(min_periods=1)
# would avoid that if look-ahead matters here.
df = df.bfill()  # Backfill for initial rows with no MA30 values

# Rebuild features/target and split again now that df carries the engineered columns.
# With a 'Data' column the rows are sorted by it and split 80/20 by position;
# otherwise a seeded random 80/20 split is used.
has_date = 'Data' in df.columns
if has_date:
    df = df.sort_values(by="Data")

X, y = df.drop(columns="Y"), df["Y"]

if has_date:
    train_size = int(len(df) * 0.8)
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# Scaling: fit on the training rows, then apply the same transform to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Fit a fresh random forest (same seed) on the features that include the moving averages
rf_model_ma = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out rows: per-class report, then overall accuracy
y_pred_ma = rf_model_ma.predict(X_test)
print(
    "\nRandom Forest Classifier (with 30-day Moving Averages):",
    classification_report(y_test, y_pred_ma),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, y_pred_ma))

# Show the first few rows of the frame with the new *_MA30 columns
print("\nUpdated DataFrame with Moving Averages:", df.head(), sep="\n")

  df.fillna(method='bfill', inplace=True) # Backfill for initial rows with no MA30 values



Random Forest Classifier (with 30-day Moving Averages):
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       177
           1       0.85      0.74      0.79        46

    accuracy                           0.92       223
   macro avg       0.89      0.85      0.87       223
weighted avg       0.92      0.92      0.92       223

Accuracy: 0.9192825112107623

Updated DataFrame with Moving Averages:
   Y  XAU BGNL  ECSURPUS  BDIY     CRY     DXY     JPY     GBP    Cl1    VIX  \
0  0    283.25     0.077  1388  157.26  100.56  105.86  1.6460  25.77  22.50   
1  0    287.65     0.043  1405  165.01  101.86  105.47  1.6383  28.85  21.50   
2  0    287.15     0.135  1368  167.24  102.41  106.04  1.6496  28.28  23.02   
3  0    282.75     0.191  1311  166.85  104.92  107.85  1.6106  28.22  23.45   
4  1    298.40     0.312  1277  165.43  104.22  109.30  1.6108  28.02  21.25   

   ...  LP01TREU_MA30  EMUSTRUU_MA30  LF94TRUU_MA30  MXUS_MA30   

In [18]:
!pip install openai
import os
import pandas as pd
from openai import OpenAI

openai_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI()
predictions = y_pred_ma

prompt = f"""
Based on the following model predictions for financial market anomalies from the dataset {df}, propose a data-driven investment strategy focusing on minimizing losses and maximizing returns.  The predictions (1 indicates an anomaly, 0 indicates no anomaly).  Consider the following in your analysis and recommendations:

* Previous prediction results: {y_pred_ma}; dataframe: {df}
* Historical market trends indicated by the provided dataset.
* The accuracy of the predictive model (shown in the preceding output).
* Risk tolerance (assume moderate risk tolerance).
* Potential diversification strategies.  Consider different asset classes (stocks, bonds, real estate, etc.) and how they might perform during predicted anomalies.
* Actions to take during predicted anomalies (buy, sell, hold).  Provide specific details and rationales for each action.  What constitutes a "buy" signal? What's a "sell" signal?

Provide a detailed investment strategy, specifying when to buy, sell, or hold assets based on the model's predictions.  The strategy should adapt based on the model's performance in previous periods. For example, if the model has been consistently inaccurate, the strategy should reduce reliance on the predictions and incorporate other indicators.  If the model has been accurate, the strategy should leverage those predictions more aggressively.  Clearly articulate how the strategy changes based on the model's predictive accuracy.  Justify each decision.
"""

response = client.chat.completions.create(
  model="gpt-4o",
  messages=[
        {"role": "system", "content": "You are a financial expert providing investment advice."},
        {"role": "user", "content": prompt}
    ]
)

print("\nInvestment Strategy Recommendation:")
response.choices[0].message.content


Investment Strategy Recommendation:


"To develop a data-driven investment strategy that minimizes losses and maximizes returns, we will consider the model's anomaly predictions, historical market trends, and a moderate risk tolerance. The goal is to use the model's predictions to optimize asset allocation and trading decisions.\n\n### Strategy Overview\n\n1. **Model Accuracy Assessment:**\n   - Evaluate the model's predictive accuracy over historical data. If the model consistently predicts anomalies accurately (e.g., high precision and recall), weight the predictions more heavily in decision-making. If accuracy is lacking, use predictions with caution and incorporate other market indicators.\n\n2. **Asset Classes Consideration:**\n   - Diversify across asset classes: equities (stocks), fixed income (bonds), commodities (like gold), currencies, and real estate.\n   - Leverage non-correlated asset classes to hedge against market downturns during predicted anomalies.\n\n3. **Risk Mitigation through Diversification:**\n   - 