In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# Data application ( prices + features)

In [2]:
aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')

shares = ["AAPL", "GOOGL", "MSFT"]
for share, df in zip(shares, [aapl_with_features, googl_with_features, msft_with_features]):
    print(share)
    print(df.head()) 

print(shares)

AAPL
    index        DATE  Close_AAPL  High_AAPL  Low_AAPL  Open_AAPL  \
0  1508.0  2006-01-04    2.241830   2.241830  2.166852   2.170751   
1  1509.0  2006-01-05    2.248428   2.278719  2.234332   2.253227   
2  1510.0  2006-01-06    2.230734   2.246329  2.211840   2.244230   
3  1511.0  2006-01-09    2.288316   2.300313  2.235832   2.256826   
4  1512.0  2006-01-10    2.280819   2.315310  2.271523   2.301214   

   Volume_AAPL     RSI_14  MACD_12_26_9  MACDh_12_26_9  ...  High_GOOGL  \
0  807234400.0  63.192529      0.052154      -0.013956  ...   10.820894   
1  619603600.0  63.759171      0.053152      -0.010367  ...   11.150982   
2  449422400.0  61.045151      0.051916      -0.009282  ...   11.215310   
3  704457600.0  66.102053      0.054950      -0.004998  ...   11.685977   
4  675040800.0  64.920429      0.056103      -0.003076  ...   11.758006   

   Low_GOOGL  Open_GOOGL  Volume_GOOGL  log_return     cusum  cusum_pos  \
0  10.387482   10.494283   524323152.0         NaN    

# Target

In [3]:
def create_target(
    df,
    close_col,
    horizon=1
):
    df = df.copy()

    future_return = (
        df[close_col].shift(-horizon) / df[close_col] - 1
    )

    df["Target"] = (future_return > 0).astype(int)

    df = df.iloc[:-horizon]

    return df

for share, stock_data in zip(shares, [aapl_with_features, googl_with_features, msft_with_features]):
    create_target(
        stock_data,
        close_col=f"Close_{share}",
        horizon=1
    )


# Training baseline model (Logistic regression) + feature importance

In [4]:

from os import pipe

result_dict = {}

def train_baseline_model(data, share_name):

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=10000, random_state=42))
    ])

    features = data.drop(columns=['index','Target', 'DATE', f'Close_{share_name}'])
    features = features.dropna()

    target = data.loc[features.index, 'Target']
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]
    # plt.hist(y_proba, bins=50)
    # plt.show()
    
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    print(f"Results for {share_name}:")
    print("Accuracy:", acc)
    print("Precision:", precision)
    print("Recall:", recall)
    print("ROC AUC Score:", roc_auc)
    
    result_dict[share_name] = [acc, precision, recall, roc_auc]

print(shares)
for share, df in zip(shares, [aapl_with_features, googl_with_features, msft_with_features]):
    train_baseline_model(df, share)

def train_feature_impact(data, share_name):

    features = data.drop(columns=['index','Target', 'DATE', f'Close_{share_name}'])
    features = features.dropna()

    target = data.loc[features.index, 'Target']

    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=10000, random_state=42))
    ])

    pipe.fit(X_train, y_train)
    pipe.predict(X_test)

    coefficients = pipe.named_steps["clf"].coef_[0]
    feature_names = X_train.columns
    feature_importance = pd.DataFrame({
        "Feature": feature_names,
        "Coefficient": coefficients,
        "Abs_Coefficient": np.abs(coefficients),
        "Odds_Ratio": np.exp(coefficients)
    }).sort_values(by="Abs_Coefficient", ascending=False)

    print(feature_importance)

 
for share, df in zip(shares, [aapl_with_features, googl_with_features, msft_with_features]):
    train_feature_impact(df, share)



['AAPL', 'GOOGL', 'MSFT']
Results for AAPL:
Accuracy: 0.5283018867924528
Precision: 0.5349127182044888
Recall: 0.807909604519774
ROC AUC Score: 0.5063895614743072
Results for GOOGL:
Accuracy: 0.464746772591857
Precision: 0.4934210526315789
Recall: 0.13966480446927373
ROC AUC Score: 0.5202345576290661
Results for MSFT:
Accuracy: 0.5253227408142999
Precision: 0.5252837977296182
Recall: 0.9658444022770398
ROC AUC Score: 0.5364879822896901
           Feature  Coefficient  Abs_Coefficient  Odds_Ratio
38           cusum    -0.533738         0.533738    0.586409
22        Open_VIX     0.458241         0.458241    1.581290
21         Low_VIX    -0.380758         0.380758    0.683343
39       cusum_pos     0.279309         0.279309    1.322216
11  BBU_20_2.0_2.0    -0.247041         0.247041    0.781109
40       cusum_neg     0.210755         0.210755    1.234610
20        High_VIX    -0.204554         0.204554    0.815011
1         Low_AAPL    -0.188306         0.188306    0.828361
33      Hig

In [5]:
df_to_save = pd.DataFrame.from_dict(
    result_dict, 
    orient='index',
    columns=['accuracy', 'precision', 'recall', 'roc_auc']
)
print(df_to_save)

df_to_save.to_csv('../models_results/LR_results.csv')

       accuracy  precision    recall   roc_auc
AAPL   0.528302   0.534913  0.807910  0.506390
GOOGL  0.464747   0.493421  0.139665  0.520235
MSFT   0.525323   0.525284  0.965844  0.536488
