In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Data application (prices + features)

In [None]:
# Tickers covered by this notebook; the order matches the frames zipped below.
shares = ["AAPL", "GOOGL", "MSFT"]

# One CSV per ticker, each read into its own DataFrame.
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')

# Print the ticker followed by the first rows of its table.
for share, df in zip(shares, (aapl_with_features, googl_with_features, msft_with_features)):
    print(share, df.head(), sep="\n")


AAPL
    index        DATE  Close_AAPL  High_AAPL  Low_AAPL  Open_AAPL  \
0  1508.0  2006-01-04    2.241830   2.241830  2.166852   2.170751   
1  1509.0  2006-01-05    2.248428   2.278719  2.234332   2.253227   
2  1510.0  2006-01-06    2.230734   2.246329  2.211840   2.244230   
3  1511.0  2006-01-09    2.288316   2.300313  2.235832   2.256826   
4  1512.0  2006-01-10    2.280819   2.315310  2.271523   2.301214   

   Volume_AAPL     RSI_14  MACD_12_26_9  MACDh_12_26_9  ...  Close_MSFT  \
0  807234400.0  63.192529      0.052154      -0.013956  ...   18.715057   
1  619603600.0  63.759171      0.053152      -0.010367  ...   18.805702   
2  449422400.0  61.045151      0.051916      -0.009282  ...   18.819654   
3  704457600.0  66.102053      0.054950      -0.004998  ...   18.763872   
4  675040800.0  64.920429      0.056103      -0.003076  ...   18.729006   

   High_MSFT   Low_MSFT  Open_MSFT  Volume_MSFT  Close_GOOGL  High_GOOGL  \
0  18.826622  18.199069  18.303661   79973000.0    10

In [13]:

def train_baseline_model(data, share_name):
    """Fit a scaled logistic-regression baseline for one share and print test metrics.

    The feature matrix is every column of ``data`` except ``Target``, ``DATE``
    and ``Close_<share_name>``. Rows with a missing value in any feature or in
    ``Target`` are removed from X and y together. The remaining rows are split
    in their existing order (``shuffle=False``): the first 80% train the model
    and the last 20% are used for evaluation.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``Target``, ``DATE``, ``Close_<share_name>`` and the
        feature columns. Assumes rows are already sorted by date so that the
        unshuffled split is a past/future split -- TODO confirm at load time.
    share_name : str
        Ticker used to locate the ``Close_<share_name>`` column and to label
        the printed report.

    Returns
    -------
    None
        Accuracy, precision, recall and ROC AUC are printed to stdout.

    Raises
    ------
    KeyError
        If ``Target``, ``DATE`` or ``Close_<share_name>`` is missing from ``data``.
    """
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    # NOTE(review): every remaining column becomes a feature, including any
    # exported row-counter column such as `index` -- confirm that is intended.
    features = data.drop(columns=['Target', 'DATE', f'Close_{share_name}'])
    target = data['Target']

    # Filter X and y with the SAME row mask. Dropping NaN rows from the
    # features alone leaves `target` longer than `features`, which makes
    # train_test_split fail with "inconsistent numbers of samples".
    complete_rows = features.notna().all(axis=1) & target.notna()
    features = features.loc[complete_rows]
    target = target.loc[complete_rows]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, shuffle=False
    )

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Column 1 is the probability of the second class in `classes_`;
    # presumably Target is binary 0/1 so this is P(Target == 1) -- verify.
    y_proba = pipe.predict_proba(X_test)[:, 1]

    print(f"Results for {share_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))

    print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

# Train and report the baseline once per ticker, pairing each name with its table.
for share, df in zip(shares, (aapl_with_features, googl_with_features, msft_with_features)):
    train_baseline_model(data=df, share_name=share)
    


Results for AAPL:
Accuracy: 0.5079365079365079
Precision: 0.5258493353028065
Recall: 0.6704331450094162
ROC AUC Score: 0.5031288617260261
Results for GOOGL:
Accuracy: 0.4652777777777778
Precision: 0.42857142857142855
Recall: 0.0111731843575419
ROC AUC Score: 0.5119026438458527
Results for MSFT:
Accuracy: 0.5337301587301587
Precision: 0.5457463884430177
Recall: 0.6451612903225806
ROC AUC Score: 0.5388442010832902
