In [None]:
#pip install yfinance

In [None]:

import pandas as pd 
import numpy as np
import yfinance as yf 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier # robust overfiting with right parameters and identify non-linear relationships
from sklearn.metrics import accuracy_score, precision_score

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Yahoo Finance load data into DataFrame automatically
msft = yf.Ticker('MSFT')
msft = msft.history(period='max')
msft.head()

In [None]:
# visual close price
msft.plot.line(y='Close', use_index=True)

In [None]:
msft.describe()

In [None]:
msft.info()

In [None]:
msft.isnull().sum()

# Preparing the data for machine learning


In [None]:
# create actualy close price column and target column
data = msft[['Close']]
# if the previous day higher than recent day encode 1, otherwise encode 0
data['Target'] = msft.rolling(2).apply(lambda x: x.iloc[1]>x.iloc[0])['Close']
data = data.rename(columns={'Close':'Actualy_close'})
data.head()

In [None]:
data['Target'].value_counts()

In [None]:
#use copy version for predict data
msft_prev = msft.copy()
#shift data forward
msft_prev = msft_prev.shift() # values on March 13 are move on March 14.
msft_prev.head()

In [None]:
# Combine target and predict with predictors, remember to remove the first NaN row of msft_prev
predictor = ['Close','High','Low','Open']
data = data.join(msft_prev[predictor]).iloc[1:]
data.head()

# Training the machine learning model


In [None]:
model = RandomForestClassifier(n_estimators=100, min_samples_leaf=200, random_state=1) #random_state gets the same value 
# choose data for train, and for test
train = data.iloc[:-100]
test = data.iloc[-100:]

model.fit(train[predictor], train['Target'])

In [None]:
# Predict model
preds = model.predict(test[predictor])
preds # numpy array by default

In [None]:
# convert preds into pandas series
preds = pd.Series(preds, index=test.index )
preds

In [None]:
#precision score
precision_score(test['Target'], preds)

In [None]:
# compare test values and predict value
combined = pd.concat({'Target':test['Target'], 'Prediction':preds}, axis=1)
combined

In [None]:
combined.plot()

# Creating a backtesting Engine
##looping over data, spliting data up, training new model, and predict for next few row

In [90]:
start = 1000
step = 750
predictions = []
for i in range(start, data.shape[0], step):
    train = data.iloc[0:i].copy()
    test = data.iloc[i:(step+i)].copy()
    model.fit(train[predictor], train['Target']) #fit model
    preds = model.predict_proba(test[predictor])[:,1]# predict probability of price go up
    preds = pd.Series(preds, index=test.index)
    preds[preds>.6]=1 #increase threshold to get higher confident and precise (by default = 0.5)
    preds[preds<0.6]=0
    combined = pd.concat({'Target':test['Target'], 'Prediction':preds}, axis=1) # compare test values and predict value
    predictions.append(combined)
predictions = pd.concat(predictions)
predictions


Unnamed: 0_level_0,Target,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1990-02-27,0.0,0.0
1990-02-28,1.0,0.0
1990-03-01,1.0,0.0
1990-03-02,1.0,0.0
1990-03-05,1.0,0.0
...,...,...
2022-05-16,1.0,0.0
2022-05-17,1.0,0.0
2022-05-18,0.0,0.0
2022-05-19,0.0,0.0


In [91]:
predictions['Target'].value_counts()

1.0    4115
0.0    4007
Name: Target, dtype: int64

# Improve accuracy of model

In [93]:
accuracy_score(predictions['Target'], predictions['Prediction'])

0.49335139128293526

In [108]:
start = 1000
step = 750

def backtest (data, model, predictors, start=1000, step1=750):
    predictions = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(step+i)].copy()
        model.fit(train[predictor], train['Target']) #fit model
        preds = model.predict_proba(test[predictor])[:,1]# predict probability of price go up
        preds = pd.Series(preds, index=test.index)
        preds[preds>.6]=1 #increase threshold to get higher confident and precise (by default = 0.5)
        preds[preds<0.6]=0
        combined = pd.concat({'Target':test['Target'], 'Prediction':preds}, axis=1) # compare test values and predict value
        predictions.append(combined)
    predictions = pd.concat(predictions)
    return predictions



In [109]:
# To increase accuracy, we can add more predictors in to preditors
weekly_mean = data.rolling(7).mean()
quarterly_mean = data.rolling(90).mean()
annual_mean = data.rolling(365).mean()

weekly_trend = data.shift(1).rolling(7).mean()['Target']# how many days in a week close price goes up


In [110]:
data["weekly_mean"] = weekly_mean["Close"] / data["Close"]
data["quarterly_mean"] = quarterly_mean["Close"] / data["Close"]
data["annual_mean"] = annual_mean["Close"] / data["Close"]

data["annual_weekly_mean"] = data["annual_mean"] / data["weekly_mean"]
data["annual_quarterly_mean"] = data["annual_mean"] / data["quarterly_mean"]
data["weekly_trend"] = weekly_trend

data["open_close_ratio"] = data["Open"] / data["Close"]
data["high_close_ratio"] = data["High"] / data["Close"]
data["low_close_ratio"] = data["Low"] / data["Close"]

In [111]:
full_predictors = predictor + ["weekly_mean", "quarterly_mean", "annual_mean", "annual_weekly_mean", "annual_quarterly_mean", "open_close_ratio", "high_close_ratio", "low_close_ratio"]
predictions = backtest(data.iloc[365:], model, full_predictors)

In [112]:
precision_score(predictions["Target"], predictions["Prediction"])

0.0

In [113]:
predictions['Prediction'].value_counts()

0.0    7757
Name: Prediction, dtype: int64