Question: Will adjusted closing price for Apple go up or down the next day?
- i) Logistic Regression
- ii) Random Forest 
- iii) SVM

In [8]:
import pandas as pd 
import datetime as dt
import yfinance as yf

# data = pd.read_csv('data/apple_stock.csv')
# data.drop('Unnamed: 0', axis=1, inplace=True)
# data.head()

In [20]:
aapl = yf.Ticker('AAPLE')
date_from = str(dt.date.today() - dt.timedelta(days=1000))
date_to = str(dt.date.today())

print(f'Downloading Apple stock data from {date_from} to {date_to}.')
data = yf.download("AAPL", start=date_from, end=date_to)

Downloading Apple stock data from 2020-09-13 to 2023-06-10.
[*********************100%***********************]  1 of 1 completed


In [21]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-14,114.720001,115.93,112.800003,115.360001,113.464844,140150100
2020-09-15,118.330002,118.830002,113.610001,115.540001,113.641899,184642000
2020-09-16,115.230003,116.0,112.040001,112.129997,110.287918,154679000
2020-09-17,109.720001,112.199997,108.709999,110.339996,108.527321,178011000
2020-09-18,110.400002,110.879997,106.089996,106.839996,105.084824,287104900


In [22]:
def get_indicators(data):
    data['ema3'] = data['Adj Close'] / data['Adj Close'].ewm(3).mean()
    data['ema6'] = data['Adj Close'] / data['Adj Close'].ewm(6).mean()
    data['ema12'] = data['Adj Close'] / data['Adj Close'].ewm(12).mean()
    return data 

X = get_indicators(data)
X.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,ema3,ema6,ema12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-09-14,114.720001,115.93,112.800003,115.360001,113.464844,140150100,1.0,1.0,1.0
2020-09-15,118.330002,118.830002,113.610001,115.540001,113.641899,184642000,1.000668,1.00072,1.000748
2020-09-16,115.230003,116.0,112.040001,112.129997,110.287918,154679000,0.98341,0.982103,0.981393
2020-09-17,109.720001,112.199997,108.709999,110.339996,108.527321,178011000,0.979275,0.976603,0.97511
2020-09-18,110.400002,110.879997,106.089996,106.839996,105.084824,287104900,0.964586,0.959496,0.956633


In [25]:
def get_ground_truth(data, window):
    """
    Takes as input stock data and assigns ground truth labels as to whether the stock will increase or decrease on the next day
    """
    y = (data.shift(-window)['Adj Close'] >= data['Adj Close'])
    y = y.iloc[:-window]
    data['y'] = y.astype(int)
    
    return data 

X_y = X[['Adj Close', 'ema3', 'ema6', 'ema12']]
X_y = get_ground_truth(X_y, window=1)
X_y.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['y'] = y.astype(int)


Unnamed: 0_level_0,Adj Close,ema3,ema6,ema12,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-14,113.464844,1.0,1.0,1.0,1.0
2020-09-15,113.641899,1.000668,1.00072,1.000748,0.0
2020-09-16,110.287918,0.98341,0.982103,0.981393,0.0
2020-09-17,108.527321,0.979275,0.976603,0.97511,0.0
2020-09-18,105.084824,0.964586,0.959496,0.956633,1.0


In [28]:
X_y = X_y.iloc[:520,:]
X_y.shape

(520, 5)

In [36]:
# train-test split
def train_test_split(data, split=0.80):
    rows = int(len(data)*split)
    train_df = data.iloc[:rows, :]
    test_df = data.iloc[rows:, :]
    X_train = train_df.iloc[:, :-1]
    X_test = test_df.iloc[:, :-1]
    y_train = train_df.iloc[:,-1]
    y_test = test_df.iloc[:,-1]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X_y, split=0.8)


In [37]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [40]:
def train_random_forest(X_train_scaled, X_test_scaled, y_train, y_test):
    rf = RandomForestClassifier()
    params = {'n_estimators': [100,110, 120, 130, 140, 150, 200]}

    rf_gs = GridSearchCV(rf, params, cv=5)

    rf_gs.fit(X_train_scaled, y_train)

    rf_best = rf_gs.best_estimator_

    print(rf_gs.best_params_)
    
    y_pred = rf_best.predict(X_test_scaled)
    
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    
    return rf_best
    
rf_model = train_random_forest(X_train_scaled, X_test_scaled, y_train, y_test)

{'n_estimators': 140}
              precision    recall  f1-score   support

         0.0       0.56      0.35      0.43        51
         1.0       0.54      0.74      0.62        53

    accuracy                           0.55       104
   macro avg       0.55      0.54      0.53       104
weighted avg       0.55      0.55      0.53       104

[[18 33]
 [14 39]]


In [None]:
, y_test)





