In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [13]:
# Read one CSV per ticker; the three names are referenced by the lookup
# dictionary built in the next cell.
aapl_with_features, googl_with_features, msft_with_features = map(
    pd.read_csv,
    (
        '../data/all_data/all_AAPL_data.csv',
        '../data/all_data/all_GOOGL_data.csv',
        '../data/all_data/all_MSFT_data.csv',
    ),
)

In [14]:
# Symbols processed by every baseline loop, in display order.
tickers = ['AAPL', 'GOOGL', 'MSFT']

# Ticker symbol -> DataFrame loaded for that ticker.
data_dict = dict(
    zip(tickers, (aapl_with_features, googl_with_features, msft_with_features))
)

# Ticker symbol -> colour name associated with that ticker.
color_dict = dict(zip(tickers, ('grey', 'yellow', 'green')))

In [15]:
def get_target(input_df, ticker):
    """Label each row with whether the following row closes higher.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``Close_{ticker}`` column. It is copied, never
        modified.
    ticker : str
        Symbol used to build the name of the close column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with an integer ``Target`` column: 1 when the
        next row's close is strictly greater than the current row's close,
        otherwise 0. Rows whose next close is unknown (notably the final
        row) and rows containing NaN in any column are removed.
    """
    close = input_df[f'Close_{ticker}']
    next_close = close.shift(-1)

    df = input_df.copy()
    df['Target'] = (next_close > close).astype(int)

    # A comparison against NaN evaluates to False, so without this filter the
    # last row would silently receive Target == 0 ("down") even though its
    # next close does not exist. dropna() cannot catch that, because the
    # integer Target column is never NaN.
    df = df.loc[next_close.notna().to_numpy()]

    # Same as before: discard rows with missing values in any column.
    return df.dropna()

### BASELINE: predict the previous day's target (the last observed move repeats)

In [16]:
for share in tickers:
    df = get_target(data_dict[share], share)

    # Persistence forecast: the previous row's label is reused as the
    # prediction for the current row.
    y_pred_naive = df['Target'].shift(1)

    # The first row has no predecessor, so it is left out of the scoring.
    valid_indices = y_pred_naive.notna()
    y_true = df.loc[valid_indices, 'Target']
    y_pred_naive = y_pred_naive.loc[valid_indices]

    print(f"{share} Accuracy: {accuracy_score(y_true, y_pred_naive):.2%}")
    print(f'{share} Precision: {precision_score(y_true, y_pred_naive):.2%}')
    print(f'{share} Recall: {recall_score(y_true, y_pred_naive):.2%}')
    print(f'{share} ROC AUC Score: {roc_auc_score(y_true, y_pred_naive):.2%}')
    print('*' * 40)

AAPL Accuracy: 49.93%
AAPL Precision: 52.50%
AAPL Recall: 52.52%
AAPL ROC AUC Score: 49.78%
****************************************
GOOGL Accuracy: 50.09%
GOOGL Precision: 52.31%
GOOGL Recall: 52.33%
GOOGL ROC AUC Score: 49.98%
****************************************
MSFT Accuracy: 48.26%
MSFT Precision: 50.21%
MSFT Recall: 50.23%
MSFT ROC AUC Score: 48.18%
****************************************


### BASELINE: always predict a rise

In [18]:
for share in tickers:
    df = get_target(data_dict[share], share)

    y_true = df['Target']
    # Constant forecast: every row is predicted as a rise (1.0).
    y_pred_naive = np.full(y_true.shape, 1.0)

    print(f"{share} Accuracy: {accuracy_score(y_true, y_pred_naive):.2%}")
    print(f'{share} Precision: {precision_score(y_true, y_pred_naive):.2%}')
    print(f'{share} Recall: {recall_score(y_true, y_pred_naive):.2%}')
    print(f'{share} ROC AUC Score: {roc_auc_score(y_true, y_pred_naive):.2%}')
    print('*' * 40)

AAPL Accuracy: 52.72%
AAPL Precision: 52.72%
AAPL Recall: 100.00%
AAPL ROC AUC Score: 50.00%
****************************************
GOOGL Accuracy: 52.34%
GOOGL Precision: 52.34%
GOOGL Recall: 100.00%
GOOGL ROC AUC Score: 50.00%
****************************************
MSFT Accuracy: 51.97%
MSFT Precision: 51.97%
MSFT Recall: 100.00%
MSFT ROC AUC Score: 50.00%
****************************************


### BASELINE: predict an upward movement whenever the current closing price exceeds the 20-day Simple Moving Average (SMA)

In [20]:
for share in tickers:
    df = get_target(data_dict[share], share)

    # 20-row simple moving average of the close; the first 19 rows have an
    # incomplete window and therefore come out as NaN.
    sma_20 = df[f'Close_{share}'].rolling(20).mean()

    # Score only the rows where the average is defined.
    mask = sma_20.notna()
    y_true = df.loc[mask, 'Target']

    # Predict a rise (1) when the close is strictly above its average.
    y_pred_sma = df[f'Close_{share}'].gt(sma_20).astype(int).loc[mask]

    print(f"{share} Accuracy: {accuracy_score(y_true, y_pred_sma):.2%}")
    print(f'{share} Precision: {precision_score(y_true, y_pred_sma):.2%}')
    print(f'{share} Recall: {recall_score(y_true, y_pred_sma):.2%}')
    print(f'{share} ROC AUC Score: {roc_auc_score(y_true, y_pred_sma):.2%}')
    print('*' * 40)

AAPL Accuracy: 52.30%
AAPL Precision: 54.10%
AAPL Recall: 63.08%
AAPL ROC AUC Score: 51.68%
****************************************
GOOGL Accuracy: 50.15%
GOOGL Precision: 52.07%
GOOGL Recall: 58.96%
GOOGL ROC AUC Score: 49.72%
****************************************
MSFT Accuracy: 49.93%
MSFT Precision: 51.51%
MSFT Recall: 61.36%
MSFT ROC AUC Score: 49.47%
****************************************
