In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score


In [17]:
# Read the per-ticker CSV files into DataFrames, in the order AAPL, GOOGL, MSFT.
# Paths are relative to the notebook's working directory.
aapl_with_features, googl_with_features, msft_with_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/all_data/all_AAPL_data.csv',
        '../data/all_data/all_GOOGL_data.csv',
        '../data/all_data/all_MSFT_data.csv',
    )
)

In [18]:
# Ticker symbols evaluated by every baseline below (iteration order matters for output).
tickers = ['AAPL', 'GOOGL', 'MSFT']

# Ticker symbol -> DataFrame loaded from that ticker's CSV file.
data_dict = dict(zip(
    tickers,
    (aapl_with_features, googl_with_features, msft_with_features),
))

# Ticker symbol -> colour name associated with that ticker.
color_dict = dict(AAPL='grey', GOOGL='yellow', MSFT='green')

In [19]:
def get_target(input_df, ticker):
    """Return a copy of ``input_df`` with a binary next-day direction label.

    ``Target`` is 1 when the next row's ``Close_{ticker}`` is strictly greater
    than the current row's ``Close_{ticker}``, and 0 otherwise.

    Rows for which no next close exists (the final row, or a row followed by a
    missing close) are removed. Comparing against NaN would otherwise evaluate
    to False and silently label those rows as 0, even though the true outcome
    is unknown.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``Close_{ticker}`` column. It is not modified.
    ticker : str
        Ticker symbol used to build the close-price column name.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra integer ``Target`` column. Rows containing a
        missing value in any column are dropped as well.
    """
    close = input_df[f'Close_{ticker}']
    next_close = close.shift(-1)

    df = input_df.copy()
    df['Target'] = (next_close > close).astype(int)

    # Keep only rows whose next-day close is actually known; the label of the
    # remaining rows is undefined rather than "no rise".
    df = df[next_close.notna()]

    # Drop rows with any other missing value (returns a new frame).
    return df.dropna()

### BASELINE (1): predict that each day's target equals the previous day's target

In [20]:
# Baseline 1: the previous row's Target is used as the prediction for the current row.
# Collected as ticker -> [accuracy, precision, recall, roc_auc].
result_dict_1 = {}
for share in tickers:
    labelled = get_target(data_dict[share], share)
    target = labelled['Target']

    # Shifting by one row leaves the first row without a prediction; exclude it
    # from both the labels and the predictions.
    lagged_target = target.shift(1)
    has_prediction = lagged_target.notna()
    y_true, y_pred_naive = target[has_prediction], lagged_target[has_prediction]

    accuracy = accuracy_score(y_true, y_pred_naive)
    precision = precision_score(y_true, y_pred_naive)
    recall = recall_score(y_true, y_pred_naive)
    roc_auc = roc_auc_score(y_true, y_pred_naive)

    print(f"{share} Accuracy: {accuracy:.2%}")
    print(f'{share} Precision: {precision:.2%}')
    print(f'{share} Recall: {recall:.2%}')
    print(f'{share} ROC AUC Score: {roc_auc:.2%}')
    print('*' * 40)

    result_dict_1[share] = [accuracy, precision, recall, roc_auc]

AAPL Accuracy: 49.93%
AAPL Precision: 52.50%
AAPL Recall: 52.52%
AAPL ROC AUC Score: 49.78%
****************************************
GOOGL Accuracy: 50.09%
GOOGL Precision: 52.31%
GOOGL Recall: 52.33%
GOOGL ROC AUC Score: 49.98%
****************************************
MSFT Accuracy: 48.26%
MSFT Precision: 50.21%
MSFT Recall: 50.23%
MSFT ROC AUC Score: 48.18%
****************************************


### BASELINE (2): always predict a rise

In [21]:
# Baseline 2: a constant prediction of 1 for every row.
# Collected as ticker -> [accuracy, precision, recall, roc_auc].
result_dict_2 = {}
for share in tickers:
    y_true = get_target(data_dict[share], share)['Target']

    # One prediction of 1.0 per labelled row.
    y_pred_naive = np.ones(y_true.shape[0])

    accuracy = accuracy_score(y_true, y_pred_naive)
    precision = precision_score(y_true, y_pred_naive)
    recall = recall_score(y_true, y_pred_naive)
    roc_auc = roc_auc_score(y_true, y_pred_naive)

    print(f"{share} Accuracy: {accuracy:.2%}")
    print(f'{share} Precision: {precision:.2%}')
    print(f'{share} Recall: {recall:.2%}')
    print(f'{share} ROC AUC Score: {roc_auc:.2%}')
    print('*' * 40)

    result_dict_2[share] = [accuracy, precision, recall, roc_auc]

AAPL Accuracy: 52.72%
AAPL Precision: 52.72%
AAPL Recall: 100.00%
AAPL ROC AUC Score: 50.00%
****************************************
GOOGL Accuracy: 52.34%
GOOGL Precision: 52.34%
GOOGL Recall: 100.00%
GOOGL ROC AUC Score: 50.00%
****************************************
MSFT Accuracy: 51.97%
MSFT Precision: 51.97%
MSFT Recall: 100.00%
MSFT ROC AUC Score: 50.00%
****************************************


### BASELINE (3): predict an upward movement whenever the current closing price exceeds its 20-day Simple Moving Average (SMA)

In [22]:
# Baseline 3: predict 1 when the close is above its 20-row rolling mean, else 0.
# Collected as ticker -> [accuracy, precision, recall, roc_auc].
result_dict_3 = {}
for share in tickers:
    labelled = get_target(data_dict[share], share)
    close = labelled[f'Close_{share}']

    # 20-row simple moving average of the close; NaN until the window is full.
    sma_20 = close.rolling(window=20).mean()

    # Score only the rows where the moving average is defined.
    window_full = sma_20.notna()
    y_true = labelled.loc[window_full, 'Target']
    y_pred_sma = close.gt(sma_20).astype(int)[window_full]

    accuracy = accuracy_score(y_true, y_pred_sma)
    precision = precision_score(y_true, y_pred_sma)
    recall = recall_score(y_true, y_pred_sma)
    roc_auc = roc_auc_score(y_true, y_pred_sma)

    print(f"{share} Accuracy: {accuracy:.2%}")
    print(f'{share} Precision: {precision:.2%}')
    print(f'{share} Recall: {recall:.2%}')
    print(f'{share} ROC AUC Score: {roc_auc:.2%}')
    print('*' * 40)

    result_dict_3[share] = [accuracy, precision, recall, roc_auc]

AAPL Accuracy: 52.30%
AAPL Precision: 54.10%
AAPL Recall: 63.08%
AAPL ROC AUC Score: 51.68%
****************************************
GOOGL Accuracy: 50.15%
GOOGL Precision: 52.07%
GOOGL Recall: 58.96%
GOOGL ROC AUC Score: 49.72%
****************************************
MSFT Accuracy: 49.93%
MSFT Precision: 51.51%
MSFT Recall: 61.36%
MSFT ROC AUC Score: 49.47%
****************************************


In [23]:
# Turn the metrics collected in result_dict_1 into a table with one row per
# ticker and one column per metric.
df_to_save_1 = pd.DataFrame.from_dict(
    data=result_dict_1,
    columns=['Accuracy', 'Precision', 'Recall', 'ROC AUC'],
    orient='index',
)
print(df_to_save_1)

# Write the table (ticker index included) to the results directory.
df_to_save_1.to_csv(path_or_buf='../models_results/bs1_results.csv')

       Accuracy  Precision    Recall   ROC AUC
AAPL   0.499305   0.525047  0.525245  0.497818
GOOGL  0.500894   0.523141  0.523340  0.499795
MSFT   0.482622   0.502102  0.502294  0.481821


In [24]:
# Turn the metrics collected in result_dict_2 into a table with one row per
# ticker and one column per metric.
df_to_save_2 = pd.DataFrame.from_dict(
    data=result_dict_2,
    columns=['Accuracy', 'Precision', 'Recall', 'ROC AUC'],
    orient='index',
)
print(df_to_save_2)

# Write the table (ticker index included) to the results directory.
df_to_save_2.to_csv(path_or_buf='../models_results/bs2_results.csv')

       Accuracy  Precision  Recall  ROC AUC
AAPL   0.527204   0.527204     1.0      0.5
GOOGL  0.523431   0.523431     1.0      0.5
MSFT   0.519658   0.519658     1.0      0.5


In [25]:
# Turn the metrics collected in result_dict_3 into a table with one row per
# ticker and one column per metric.
df_to_save_3 = pd.DataFrame.from_dict(
    data=result_dict_3,
    columns=['Accuracy', 'Precision', 'Recall', 'ROC AUC'],
    orient='index',
)
print(df_to_save_3)

# Write the table (ticker index included) to the results directory.
df_to_save_3.to_csv(path_or_buf='../models_results/bs3_results.csv')

       Accuracy  Precision    Recall   ROC AUC
AAPL   0.523022   0.541005  0.630763  0.516774
GOOGL  0.501495   0.520700  0.589558  0.497244
MSFT   0.499302   0.515142  0.613584  0.494681
