Este notebook se usa para crear el modelo de datos que observamos en el análisis.

In [1]:
import sys
import os
import warnings
import pickle
from datetime import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier

# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# The local `tools` package lives under the web_app root; make sure that root
# is on the import path exactly once before importing from it.
app_path = '/Users/esanc147/Documents/business/bsm03/web_app'
if app_path not in sys.path:
    sys.path.append(app_path)
from tools.tags import create_tags

## Valores por defecto

In [2]:
# Root directory holding the close/, tech/ and model/ sub-folders used below.
FULL_PATH = "/Users/esanc147/Documents/business/bsm03/web_app/data"

# Column names applied to the per-symbol price CSVs.
COLUMNS = ['symbol', 'date', 'close', 'volume', 'open', 'high', 'low']

# Column names applied to the per-symbol technical-indicator CSVs:
# the two key columns followed by the indicator columns.
COLUMNS_TECH = [
    'symbol', 'date',
    'MACD_Signal', 'MACD_Hist', 'MACD', 'SlowK', 'SlowD',
    'Chaikin A/D', 'OBV',
    'RSI21', 'ADX21', 'CCI21', 'Aroon Up21', 'Aroon Down21',
    'RSI28', 'ADX28', 'CCI28', 'Aroon Down28', 'Aroon Up28',
    'Real Lower Band28', 'Real Upper Band28', 'Real Middle Band28',
    'SMA50', 'RSI50', 'ADX50', 'CCI50', 'Aroon Up50', 'Aroon Down50',
]

# Feature columns fed to the model: close price, volume and every indicator
# column (i.e. COLUMNS_TECH without its two leading key columns).
U_COLUMNS = ['close', 'volume'] + COLUMNS_TECH[2:]

# Alternative symbol list built from the files on disk, skipping file names
# that contain '.L':
# SYMBOLS = [s.split('.csv')[0] for s in os.listdir(f"{FULL_PATH}/tech/") if '.L' not in s]
SYMBOLS = ['MSFT', 'AAPL', 'AMZN', 'INTC', 'NFLX', 'INTU', 'NVDA']

# Horizons (in days) for which tags are created and one model each is trained.
PERIOD = [7, 14, 21, 28]

## Carga de los datos

In [4]:
symbols = SYMBOLS

total_dataframes = list()

# One pass per ticker: read its price and indicator CSVs, tag the prices for
# every horizon in PERIOD, and keep only the rows present in all three frames.
for i, symbol in enumerate(symbols, start=1):
    if (i % 1000) == 0:
        # Progress heartbeat for long symbol lists.
        print("+1000 symbols loaded")

    # Price data: column names come from COLUMNS.
    path_close = f"{FULL_PATH}/close/{symbol}.csv"
    df_close = pd.read_csv(path_close, names=COLUMNS)
    df_close['date'] = pd.to_datetime(df_close['date'])
    df_close['volume'] = df_close['volume'].astype(float)

    # Technical indicators: column names come from COLUMNS_TECH.
    path_tech = f"{FULL_PATH}/tech/{symbol}.csv"
    df_tech = pd.read_csv(path_tech, names=COLUMNS_TECH)
    df_tech['date'] = pd.to_datetime(df_tech['date'])

    # Tag the price frame once per horizon and place the results side by side.
    # NOTE(review): create_tags is a project helper not shown here; this code
    # relies on it returning a frame with a `pct_change_{period}` column and,
    # presumably, a (symbol, date) index so the inner join below lines up - confirm.
    list_df_tagged = []
    for period in PERIOD:
        df_aux = create_tags(df_close, period)
        df_aux[f"pct_change_{period}"] = df_aux[f"pct_change_{period}"].astype(float)
        list_df_tagged.append(df_aux)
    # Rows with any missing value across the horizons are dropped.
    df_tagged = pd.concat(list_df_tagged, axis=1).dropna()

    # Index prices and indicators by (symbol, date); the inner join keeps only
    # the index entries shared by all three frames.
    df_close = df_close.set_index(['symbol', 'date'])
    df_tech = df_tech.set_index(['symbol', 'date'])
    dataframe = pd.concat([df_close, df_tech, df_tagged], join='inner', axis=1)
    total_dataframes.append(dataframe)

# Stack the per-symbol frames into a single frame.
tot_dataframe = pd.concat(total_dataframes)

+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded
+1000 symbols loaded


ValueError: cannot set a frame with no defined index and a scalar

In [5]:
# Stack the per-symbol frames collected in `total_dataframes` into one frame
# (repeats the concatenation that ends the loading cell).
tot_dataframe = pd.concat(total_dataframes)

In [6]:
# (rows, columns) of the combined frame.
tot_dataframe.shape

(15079105, 33)

In [7]:
# Flatten the index so `date` is an ordinary column the masks can read.
dataframe_reset = tot_dataframe.reset_index()
row_year = dataframe_reset['date'].dt.year
row_month = dataframe_reset['date'].dt.month

# Training rows: everything dated 2019 or earlier.
is_train_row = row_year <= 2019
# Test rows: dated after 2019 with a month of June or earlier.
is_test_row = (row_year > 2019) & (row_month <= 6)

# Both subsets are re-indexed by (symbol, date) and ordered newest first.
dataframe_train = (
    dataframe_reset[is_train_row]
    .set_index(['symbol', 'date'])
    .sort_values(by='date', ascending=False)
)
dataframe_test = (
    dataframe_reset[is_test_row]
    .set_index(['symbol', 'date'])
    .sort_values(by='date', ascending=False)
)

In [8]:
# (rows, columns) of the training and test subsets, shown side by side.
dataframe_train.shape, dataframe_test.shape

((14396728, 33), (682377, 33))

#### Remove Outliers

#### Keep the needed data for the model

#### Apply scaler

#### Apply model

In [9]:
# Hyper-parameter grid explored by the random-forest grid search:
# every combination of the values listed for each key is tried.
param_grid_rf = dict(
    criterion=('entropy', 'gini'),
    n_estimators=(10, 100),
    class_weight=('balanced', 'balanced_subsample', None),
    warm_start=(False, True),
    random_state=(None, 32),
)

# SAVING MODELS

In [10]:
# Train, evaluate and persist one scaler + random-forest grid search per horizon.
days = PERIOD
for days_to_predict in days:
    tag_column = f"tag_{days_to_predict}"

    # Outlier-tagged rows are excluded from training altogether.
    df_train = dataframe_train[~dataframe_train[tag_column].isin(['outlier bull', 'outlier bear'])]
    y_train = df_train[tag_column].values

    # Keep a seeded random 5% subsample of the training rows for fitting;
    # the remaining 95% returned by the split is not used.
    df_train, _discarded_rows, y_train, _discarded_tags = \
        train_test_split(df_train, y_train, train_size=0.05, random_state=32)

    # The model never sees the outlier classes, so in the test labels they are
    # folded into the corresponding "strong" class before scoring.
    y_test = dataframe_test[tag_column].values
    y_test = np.where(y_test == 'outlier bear', 'strong bear', y_test)
    y_test = np.where(y_test == 'outlier bull', 'strong bull', y_test)

    X_train = df_train[U_COLUMNS].values
    X_test = dataframe_test[U_COLUMNS].values

    # Fit the scaler on the training features only, then apply it to both sets.
    robust_scl = RobustScaler()
    X_train_scl = robust_scl.fit_transform(X_train)
    X_test_scl = robust_scl.transform(X_test)

    # Random Forest: 5-fold grid search scored on three metrics; the final
    # estimator is refit using the accuracy ranking.
    gscv_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5,
                           scoring=['accuracy', 'precision_micro', 'precision_macro'], refit='accuracy')
    gscv_rf.fit(X_train_scl, y_train)

    print('#'*5, f'Random Forest - {days_to_predict}', '#'*5)
    print(classification_report(y_test, gscv_rf.predict(X_test_scl)), '\n')

    # Persist the fitted scaler and the fitted grid search; the context
    # managers guarantee each file is flushed and closed before moving on.
    with open(f"{FULL_PATH}/model/robust_scaler_{days_to_predict}.pkl", 'wb') as scaler_file:
        pickle.dump(robust_scl, scaler_file)
    with open(f"{FULL_PATH}/model/rf_{days_to_predict}.pkl", 'wb') as model_file:
        pickle.dump(gscv_rf, model_file)

##### Random Forest - 28 #####
              precision    recall  f1-score   support

        bear       0.10      0.09      0.10     65228
        bull       0.27      0.46      0.34    146843
        keep       0.16      0.03      0.06     67911
 strong bear       0.42      0.34      0.37    237156
 strong bull       0.35      0.35      0.35    165239

    accuracy                           0.31    682377
   macro avg       0.26      0.26      0.24    682377
weighted avg       0.31      0.31      0.30    682377
 

