In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score


In [2]:
# Read the per-ticker CSV tables, in the order AAPL, GOOGL, MSFT.
aapl_with_features, googl_with_features, msft_with_features = map(
    pd.read_csv,
    (
        '../data/all_data/all_AAPL_data.csv',
        '../data/all_data/all_GOOGL_data.csv',
        '../data/all_data/all_MSFT_data.csv',
    ),
)

In [3]:
# Symbols processed by every cell below, in display order.
tickers = ['AAPL', 'GOOGL', 'MSFT']

# Ticker symbol -> DataFrame loaded for that symbol.
data_dict = dict(
    zip(tickers, (aapl_with_features, googl_with_features, msft_with_features))
)

# Ticker symbol -> colour name.
color_dict = dict(AAPL='grey', GOOGL='yellow', MSFT='green')

In [4]:
def get_target(input_df, ticker):
    """Return a copy of ``input_df`` with a binary next-row direction label.

    ``Target`` is 1 when the following row's ``Close_{ticker}`` is strictly
    greater than the current row's, and 0 otherwise.

    Rows whose next close is unknown (the final row, or any row followed by a
    missing close) are removed: their label cannot be computed, and comparing
    against NaN would otherwise silently mark them as 0. Rows containing a NaN
    in any column are removed as well.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Table containing a ``Close_{ticker}`` column. It is not modified.
    ticker : str
        Symbol used to build the close-price column name.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``Target`` column; the original index labels
        of the surviving rows are kept.

    Raises
    ------
    KeyError
        If ``Close_{ticker}`` is not a column of ``input_df``.
    """
    close = input_df[f'Close_{ticker}']
    next_close = close.shift(-1)

    # assign() returns a new frame, so the caller's data is left untouched.
    labelled = input_df.assign(Target=(next_close > close).astype(int))

    # Discard rows without a known next close *before* the generic NaN
    # clean-up: after astype(int) the unknown label is indistinguishable
    # from a real 0, so dropna() alone cannot remove it.
    return labelled[next_close.notna()].dropna()

### BASELINE (1): predict each day's target from the previous day's target

In [5]:
# Baseline 1: use the previous row's label as the forecast for the current row.
result_dict_1 = {}
for share in tickers:
    df = get_target(data_dict[share], share)

    # Lagging the labels by one row leaves the first row without a forecast.
    lagged_target = df['Target'].shift(1)
    valid_indices = lagged_target.notna()

    # Score only the rows that actually have a forecast.
    y_true = df.loc[valid_indices, 'Target']
    y_pred_naive = lagged_target[valid_indices]

    # Scorers are evaluated in the order: accuracy, precision, recall, ROC AUC.
    scores = [
        scorer(y_true, y_pred_naive)
        for scorer in (accuracy_score, precision_score, recall_score, roc_auc_score)
    ]
    accuracy, precision, recall, roc_auc = scores

    print(
        f"{share} Accuracy: {accuracy:.2%}",
        f'{share} Precision: {precision:.2%}',
        f'{share} Recall: {recall:.2%}',
        f'{share} ROC AUC Score: {roc_auc:.2%}',
        '*' * 40,
        sep='\n',
    )
    result_dict_1[share] = scores

AAPL Accuracy: 49.94%
AAPL Precision: 52.52%
AAPL Recall: 52.52%
AAPL ROC AUC Score: 49.79%
****************************************
GOOGL Accuracy: 50.08%
GOOGL Precision: 52.30%
GOOGL Recall: 52.32%
GOOGL ROC AUC Score: 49.97%
****************************************
MSFT Accuracy: 48.25%
MSFT Precision: 50.19%
MSFT Recall: 50.21%
MSFT ROC AUC Score: 48.17%
****************************************


### BASELINE (2): always predict a rise

In [6]:
# Baseline 2: forecast label 1 for every row.
result_dict_2 = {}
for share in tickers:
    df = get_target(data_dict[share], share)
    y_true = df['Target']

    # Constant all-ones forecast, one entry per labelled row.
    y_pred_naive = np.ones(y_true.shape[0])

    accuracy = accuracy_score(y_true, y_pred_naive)
    precision = precision_score(y_true, y_pred_naive)
    recall = recall_score(y_true, y_pred_naive)
    roc_auc = roc_auc_score(y_true, y_pred_naive)

    report_lines = [
        f"{share} Accuracy: {accuracy:.2%}",
        f'{share} Precision: {precision:.2%}',
        f'{share} Recall: {recall:.2%}',
        f'{share} ROC AUC Score: {roc_auc:.2%}',
        '*' * 40,
    ]
    print('\n'.join(report_lines))

    result_dict_2[share] = [accuracy, precision, recall, roc_auc]

AAPL Accuracy: 52.71%
AAPL Precision: 52.71%
AAPL Recall: 100.00%
AAPL ROC AUC Score: 50.00%
****************************************
GOOGL Accuracy: 52.33%
GOOGL Precision: 52.33%
GOOGL Recall: 100.00%
GOOGL ROC AUC Score: 50.00%
****************************************
MSFT Accuracy: 51.96%
MSFT Precision: 51.96%
MSFT Recall: 100.00%
MSFT ROC AUC Score: 50.00%
****************************************


### BASELINE (3): predict an upward movement whenever the current closing price exceeds the 20-day Simple Moving Average (SMA)

In [7]:
# Baseline 3: forecast a rise whenever the close is above its 20-row mean.
result_dict_3 = {}
for share in tickers:
    df = get_target(data_dict[share], share)
    close = df[f'Close_{share}']

    # 20-row simple moving average; undefined (NaN) until the window is full.
    sma_20 = close.rolling(window=20).mean()
    mask = sma_20.notna()

    # Keep only rows where the moving average exists.
    y_true = df.loc[mask, 'Target']
    y_pred_sma = (close > sma_20).astype(int)[mask]

    accuracy, precision, recall, roc_auc = (
        accuracy_score(y_true, y_pred_sma),
        precision_score(y_true, y_pred_sma),
        recall_score(y_true, y_pred_sma),
        roc_auc_score(y_true, y_pred_sma),
    )

    print(f"{share} Accuracy: {accuracy:.2%}")
    print(f'{share} Precision: {precision:.2%}')
    print(f'{share} Recall: {recall:.2%}')
    print(f'{share} ROC AUC Score: {roc_auc:.2%}')
    print('*' * 40)

    result_dict_3[share] = [accuracy, precision, recall, roc_auc]

AAPL Accuracy: 52.29%
AAPL Precision: 54.10%
AAPL Recall: 63.08%
AAPL ROC AUC Score: 51.66%
****************************************
GOOGL Accuracy: 50.14%
GOOGL Precision: 52.07%
GOOGL Recall: 58.96%
GOOGL ROC AUC Score: 49.71%
****************************************
MSFT Accuracy: 49.94%
MSFT Precision: 51.53%
MSFT Recall: 61.36%
MSFT ROC AUC Score: 49.48%
****************************************


In [8]:
# Tabulate baseline 1 scores: one row per ticker, one column per metric.
df_to_save_1 = pd.DataFrame(
    list(result_dict_1.values()),
    index=list(result_dict_1.keys()),
    columns=['accuracy', 'precision', 'recall', 'roc_auc'],
)
print(df_to_save_1)

# Write the table to disk as CSV.
df_to_save_1.to_csv('../models_results/bs1_results.csv')

       accuracy  precision    recall   roc_auc
AAPL   0.499404   0.525245  0.525245  0.497917
GOOGL  0.500795   0.522960  0.523159  0.499704
MSFT   0.482519   0.501911  0.502103  0.481725


In [9]:
# Tabulate baseline 2 scores: one row per ticker, one column per metric.
df_to_save_2 = pd.DataFrame(
    list(result_dict_2.values()),
    index=list(result_dict_2.keys()),
    columns=['accuracy', 'precision', 'recall', 'roc_auc'],
)
print(df_to_save_2)

# Write the table to disk as CSV.
df_to_save_2.to_csv('../models_results/bs2_results.csv')

       accuracy  precision  recall  roc_auc
AAPL   0.527110   0.527110     1.0      0.5
GOOGL  0.523337   0.523337     1.0      0.5
MSFT   0.519563   0.519563     1.0      0.5


In [10]:
# Tabulate baseline 3 scores: one row per ticker, one column per metric.
df_to_save_3 = pd.DataFrame(
    list(result_dict_3.values()),
    index=list(result_dict_3.keys()),
    columns=['accuracy', 'precision', 'recall', 'roc_auc'],
)
print(df_to_save_3)

# Write the table to disk as CSV.
df_to_save_3.to_csv('../models_results/bs3_results.csv')

       accuracy  precision    recall   roc_auc
AAPL   0.522927   0.541005  0.630763  0.516648
GOOGL  0.501396   0.520700  0.589558  0.497120
MSFT   0.499402   0.515308  0.613584  0.494759
