In [51]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

import mlflow

#----------------------------
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                             ConfusionMatrixDisplay, precision_recall_fscore_support, 
                             precision_score, recall_score, roc_auc_score)

In [77]:
#Loading data
AAPLData = pd.read_hdf('../data/randomForestData/randomDorestData.h5','AAPL')
AAPLData.sample(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-08-09,164.020004,165.820007,163.25,164.919998,163.112732,63135500
2021-04-12,132.520004,132.850006,130.630005,131.240005,128.666855,91420000
2020-10-14,121.0,123.029999,119.620003,121.190002,118.432289,150712000
2021-04-30,131.779999,133.559998,131.070007,131.460007,128.882568,109839500
2022-07-28,156.979996,157.639999,154.410004,157.350006,155.409836,81378700
2022-02-04,171.679993,174.100006,170.679993,172.389999,170.014572,82465400
2021-04-05,123.870003,126.160004,123.07,125.900002,123.431572,88651200
2021-06-03,124.68,124.849998,123.129997,123.540001,121.32357,76229200
2022-01-28,165.710007,170.350006,162.800003,170.330002,167.769211,179935700
2020-04-02,60.084999,61.287498,59.224998,61.232498,59.57019,165934000


In [None]:
GOOGData = pd.read_hdf('../data/randomForestData/randomDorestData.h5','GOOG')
GOOGData.sample(10)

In [None]:
MSFTData = pd.read_hdf('../data/randomForestData/randomDorestData.h5','GOOG')
MSFTData.sample(10)

In [None]:
AMZNData = pd.read_hdf('../data/randomForestData/randomDorestData.h5','AMZN')
AMZNData.sample(10)

# Will the Stock increment its price tomorrow?

In [125]:
#Create a tomorrow price column
AAPLData['Tomorrow'] = AAPLData['Close'].shift(-1)

# Create a Yestarday's Close price
AAPLData['Yest_Close'] = AAPLData['Close'].shift(1)

#Create binary target for classification
AAPLData['Target'] = (AAPLData['Tomorrow'] > AAPLData['Close']).astype(int)

AAPLData = AAPLData.dropna()

In [79]:
AAPLData.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Tomorrow,Yest_Close,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-09-04,52.0975,52.369999,51.830002,52.297501,50.605358,76752400,53.32,51.424999,1
2019-09-05,53.0,53.4925,52.877499,53.32,51.594776,95654800,53.314999,52.297501,0
2019-09-06,53.512501,53.605,53.127499,53.314999,51.589931,77449200,53.5425,53.32,1
2019-09-09,53.709999,54.110001,52.767502,53.5425,51.810085,109237600,54.174999,53.314999,1
2019-09-10,53.465,54.195,52.927502,54.174999,52.422108,127111600,55.897499,53.5425,1
2019-09-11,54.517502,55.927502,54.432499,55.897499,54.088882,177158400,55.772499,54.174999,0
2019-09-12,56.200001,56.605,55.715,55.772499,53.96793,128906800,54.6875,55.897499,0
2019-09-13,55.0,55.197498,54.255001,54.6875,52.91803,159053200,54.974998,55.772499,1
2019-09-16,54.432499,55.032501,54.389999,54.974998,53.196224,84632400,55.174999,54.6875,1
2019-09-17,54.990002,55.205002,54.779999,55.174999,53.389767,73274800,55.692501,54.974998,1


In [98]:
AAPLData.value_counts()

Open        High        Low         Close       Adj Close   Volume     Tomorrow    Yest_Close  Target
236.479996  237.229996  233.089996  234.399994  234.128998  62631300   234.820007  230.539993  1         1
52.097500   52.369999   51.830002   52.297501   50.605358   76752400   53.320000   51.424999   1         1
53.000000   53.492500   52.877499   53.320000   51.594776   95654800   53.314999   52.297501   0         1
53.465000   54.195000   52.927502   54.174999   52.422108   127111600  55.897499   53.542500   1         1
53.512501   53.605000   53.127499   53.314999   51.589931   77449200   53.542500   53.320000   1         1
                                                                                                        ..
55.000000   55.197498   54.255001   54.687500   52.918030   159053200  54.974998   55.772499   1         1
            55.235001   54.707500   54.972500   53.193813   75334000   54.705002   55.257500   0         1
55.134998   55.240002   54.320000   54.705

In [126]:
#Split the data
x_train = AAPLData[['Yest_Close','Volume','Open','High', 'Low']].iloc[:-100]
y_train = AAPLData['Target'].iloc[:-100]

x_test = AAPLData[['Yest_Close','Volume','Open','High', 'Low']].iloc[-100:]
y_test = AAPLData['Target'].iloc[-100:]

In [127]:
model = RandomForestClassifier(n_estimators=80, max_depth=45,min_samples_split=100, random_state=2)
model.fit(x_train,y_train)

predictions = model.predict(x_test)

accuracy = accuracy_score(y_test, predictions)
accuracy

0.4

In [129]:
#verify the importances
feature_importances_df = pd.DataFrame(
     {'feature':['Yest_Close','Volume','Open','High', 'Low'], 'importance': model.feature_importances_}
 ).sort_values('importance', ascending=False).reset_index(drop=True)

feature_importances_df

Unnamed: 0,feature,importance
0,Low,0.210322
1,Open,0.205918
2,Volume,0.19922
3,Yest_Close,0.193352
4,High,0.191189


In [121]:
# Now we can add some other training data to enhance the model

Predictors = ['Yest_Close', 'Open', 'Volume']
MA = [2,5,60]

for ma in MA:
    movingAverage = AAPLData['Close'].rolling(ma).mean()
    
    AAPLData[f'Close_ratio_{ma}'] = AAPLData['Close'] / movingAverage
    
    AAPLData[f'trend_{ma}'] = AAPLData['Target'].rolling(ma).sum()
    
    Predictors += [f'Close_ratio_{ma}',f'trend_{ma}']

In [122]:
# Repeat all with the new columns

# Split the data
x_train = AAPLData[Predictors].iloc[:-100]
y_train = AAPLData['Target'].iloc[:-100]

x_test = AAPLData[Predictors].iloc[-100:]
y_test = AAPLData['Target'].iloc[-100:]

In [123]:
model = RandomForestClassifier(n_estimators=5, min_samples_split=200, random_state=2)
model.fit(x_train,y_train)

predictions = model.predict(x_test)

accuracy = accuracy_score(y_test, predictions)
accuracy

0.99

In [124]:
#verify the importances
feature_importances_df = pd.DataFrame(
     {'feature':Predictors, 'importance': model.feature_importances_}
 ).sort_values('importance', ascending=False).reset_index(drop=True)

feature_importances_df

Unnamed: 0,feature,importance
0,trend_2,0.570549
1,Close_ratio_2,0.259677
2,trend_5,0.095504
3,Close_ratio_5,0.066258
4,Close_ratio_60,0.006854
5,trend_60,0.001158
6,Yest_Close,0.0
7,Open,0.0
8,Volume,0.0


In [None]:
#Create function to run mlflow experiment

def experiment(model:object, modelName:str, developer:str):
    
    with mlflow.start_run():
        mlflow.log_param('model', modelName)
        mlflow.log_param('developer', developer)
        
        #List of metrics to track
        metrics = []
        
        
        
    
    
    pass

In [None]:
mlflow.set_experiment('RandomForesRegressor')


[1, 2, 3, 2, 4]