# 1 Problem

How well can we predict a stock's daily closing price? Let's use HDFCBANK as an example.


Outcome

* Timeseries analysis
* Novel method of evaluation of model
* Machine Learning isn't magic


In [None]:
pip install -U scikit-learn

In [None]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper

# Additional Features

In [None]:
# Read the HDFCBANK CSV into a DataFrame; the path is relative to the notebook.
HDFC_CSV_PATH = '../input/stock-price-forecast-india/HDFCBANK.NS.csv'
hdfc = pd.read_csv(HDFC_CSV_PATH)

# Show the first rows as a quick sanity check of the loaded columns.
hdfc.head()

In [None]:
def get_stats(df):
    """Print the date range, its length in days, and the row count of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Date'`` column holding either ``YYYY-MM-DD`` strings
        or already-parsed datetimes.

    Returns
    -------
    None
        Four summary lines are written to stdout.
    """
    # Parse the column once so the function works for string *and* datetime
    # columns, and so min/max are chronological rather than lexicographic.
    dates = pd.to_datetime(df['Date'])
    start_date, end_date = dates.min(), dates.max()
    data_duration = (end_date - start_date).days

    # .date() renders as YYYY-MM-DD, matching the raw ISO strings in the CSV.
    print(f"Start Date: {start_date.date()}")
    print(f"End Date: {end_date.date()}")
    print(f"Date Difference: {data_duration}")
    print(f"Number of Data Points: {df.shape[0]}")

In [None]:
# Print the date coverage and number of rows of the loaded HDFCBANK frame.
get_stats(hdfc)

Let's plot the adjusted close price for the last 50 days.

In [None]:
# Wide, high-resolution canvas so the 50 date ticks remain legible.
plt.rcParams.update({'figure.figsize': (17, 3), 'figure.dpi': 300})
fig, ax = plt.subplots()

# Draw everything on the explicit Axes instead of pyplot's implicit
# "current axes", so the cell does not depend on global plotting state.
sns.lineplot(data=hdfc.tail(50), x='Date', y='Adj Close', ax=ax)
ax.set_title('HDFCBANK adjusted close (last 50 data points)')
ax.grid(linestyle='-', linewidth=0.3)
ax.tick_params(axis='x', rotation=90)

# 2 Feature Engineering

Let's see how far we can get with:

1. Lagged Adjusted Close price
2. Previous day's volume
3. Day of week

In [None]:
# Call head() (the bare attribute only displays the bound method's repr)
# to preview the first rows before engineering features.
hdfc.head()

In [None]:
# snake_case names for the seven columns loaded from the CSV, in positional order.
base_columns = ['date', 'open', 'high', 'low', 'close', 'adjusted_close', 'volume']

# Rename only the leading columns and keep whatever follows them, so the cell
# can be re-run after the engineered columns below have been appended
# (a plain `hdfc.columns = base_columns` raises a length mismatch on re-run).
hdfc.columns = base_columns + list(hdfc.columns[len(base_columns):])

# Lagged features come from the previous row.
# NOTE(review): assumes rows are sorted by ascending date - confirm.
hdfc['prev_adjusted_close'] = hdfc['adjusted_close'].shift(1)
hdfc['prev_volume'] = hdfc['volume'].shift(1)
hdfc['day_of_week'] = pd.to_datetime(hdfc['date']).dt.dayofweek  # Monday=0

# dropna() removes rows with missing values, including the first row,
# whose lagged features are NaN after shift(1).
data = hdfc[['date', 'adjusted_close', 'prev_adjusted_close', 'prev_volume', 'day_of_week']].dropna()
data.sample()

In [None]:
# Column roles used by the preprocessing and the models.
label = 'adjusted_close'
numerical_features = ['prev_adjusted_close', 'prev_volume']
categorical_features = ['day_of_week']
feature_columns = categorical_features + numerical_features

# shuffle=False keeps row order, so the last 10% of rows form the test set.
train_df, test_df = train_test_split(data, shuffle=False, test_size=0.1)
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

# One (columns, transformers) entry per feature: scale the numeric columns,
# ordinal-encode the categorical one.
num = [([name], [StandardScaler()]) for name in numerical_features]
cat = [([name], [OrdinalEncoder()]) for name in categorical_features]
mapper = DataFrameMapper(num + cat, df_out=True)

# Fit an OLS model (with intercept) on the preprocessed training features
# and display its summary table.
preprocessed_X_train = sm.add_constant(mapper.fit_transform(X_train))
results = sm.OLS(y_train, preprocessed_X_train).fit()
results.summary()

Let's leave prev_volume in. Taken alone, it shows significance; the linear regression model may simply not be complex enough to capture its contribution.

# 3 Model Training

In [None]:
def evaluation(pipeline, X, y):
    """Score ``pipeline`` on features ``X`` against the true values ``y``.

    Returns a dict with the mean absolute error (``'MAE'``), the mean
    absolute percentage error (``'MAPE'``) and the raw predictions
    (``'y_pred'``) produced by ``pipeline.predict(X)``.
    """
    predicted = pipeline.predict(X)
    mae = mean_absolute_error(y, predicted)
    mape = mean_absolute_percentage_error(y, predicted)
    return {'MAE': mae, 'MAPE': mape, 'y_pred': predicted}

In [None]:
# Build a fresh preprocessing mapper and a linear model, chained in a Pipeline
# so that fitting and prediction apply the same transformations.
mapper = DataFrameMapper(num + cat, df_out=True)
clf = LinearRegression()
pipeline = Pipeline(steps=[('preprocess', mapper), ('clf', clf)])
pipeline.fit(X_train, y_train)

# Score on the held-out rows and report the two error metrics.
results = evaluation(pipeline, X_test, y_test)
print(f"MAE: ₹{round(results['MAE'], 2)}, MAPE: {round(results['MAPE'] * 100, 2)}%")

Simple Linear Regression looks like it's doing super well. But is it?

In [None]:
# Line up dates, predictions and true values row by row. reset_index(drop=True)
# gives the two Series a fresh 0..n-1 index so they align positionally with the
# prediction array, and (unlike reset_index().drop('index', ...)) it also works
# when the index has a name.
hdfc_forecast = pd.DataFrame(data={
    'date': test_df['date'].reset_index(drop=True),
    'predictions': results['y_pred'],
    'truth': y_test.reset_index(drop=True)})
hdfc_forecast.sample()

In [None]:
plt.rcParams.update({'figure.figsize': (17, 3), 'figure.dpi': 300})
fig, ax = plt.subplots()

# Label both series and draw them on the explicit Axes; without labels and a
# legend the two lines cannot be told apart.
recent_forecast = hdfc_forecast.tail(50)
sns.lineplot(data=recent_forecast, x='date', y='truth', label='truth', ax=ax)
sns.lineplot(data=recent_forecast, x='date', y='predictions', label='predictions', ax=ax)
ax.legend()
ax.set_title('Truth vs predictions (last 50 test rows)')
ax.set_ylabel('Adjusted close')
ax.grid(linestyle='-', linewidth=0.3)
ax.tick_params(axis='x', rotation=90)

In [None]:
# Show the last 10 rows of the forecast frame (date, predictions, truth).
hdfc_forecast.tail(10)

MAE and MAPE alone are not a good evaluation here, since the predictions simply lag the truth. This may be more apparent on days with large moves, such as sharp drops.

# 4 Better Evaluation Metric

In [None]:
# Absolute day-over-day change of the true price, in percent.
# assign() builds a new frame instead of writing a column into the frame
# returned by train_test_split, which can trigger pandas' chained-assignment
# warning.
test_df = test_df.assign(
    perc_change=(
        (test_df['adjusted_close'] - test_df['prev_adjusted_close']) * 100
        / test_df['prev_adjusted_close']
    ).abs()
)

# Drop any perc_change left over from an earlier run of this cell; otherwise the
# merge would produce perc_change_x / perc_change_y and later lookups of
# 'perc_change' would fail.
hdfc_forecast = (
    hdfc_forecast
    .drop(columns='perc_change', errors='ignore')
    .merge(test_df[['date', 'perc_change']], on='date')
)
hdfc_forecast.sample()

In [None]:
# Re-score the model on progressively "harder" subsets: only the days whose
# absolute day-over-day change exceeds each threshold (0%, 0.5%, ..., 9.5%).
performance = []
for threshold in np.arange(0, 10, 0.5):
    subset = hdfc_forecast[hdfc_forecast['perc_change'] > threshold]
    if subset.empty:
        # The sklearn metrics raise ValueError on empty input; keep the row
        # (count == 0) so every threshold still appears in the result.
        mae = mape = np.nan
    else:
        mae = mean_absolute_error(subset['truth'], subset['predictions'])
        mape = mean_absolute_percentage_error(subset['truth'], subset['predictions'])
    performance.append({
        'perc_change': threshold,
        'MAE': mae,
        'MAPE': mape,
        'count': subset.shape[0]
    })
results = pd.DataFrame(performance)

In [None]:
# MAE per percentage-change threshold (line) over the number of test days
# exceeding that threshold (bars), on twin y-axes sharing one x-axis.
plt.rcParams.update({'figure.figsize': (17, 3), 'figure.dpi': 300})
fig, ax = plt.subplots()
ax2 = ax.twinx()

# Line on `ax`: only the 'MAE' column is passed, so no explicit x is given.
# NOTE(review): presumably plotted against the row index of `results`, which
# lines up with the categorical bar positions below - confirm.
sns.lineplot(
    data=results['MAE'],
    color='red',
    legend=True,
    ax=ax)

# Bars on `ax2`: the 'count' column for each 'perc_change' threshold.
sns.barplot(
    x='perc_change',
    y='count',
    data=results,
    color='blue',
    alpha=0.1,
    ax=ax2
)

# NOTE(review): plt.grid targets pyplot's current axes, not an explicit one;
# presumably that is `ax2`, the most recently created - confirm.
plt.grid(linestyle='-', linewidth=0.3)
# The return values are assigned so the cell does not echo a repr.
# Note: `ylabel` holds the ax2 label and `y2label` the ax label.
title = ax.set_title('Model Evaluation for different Price Changes')
xlabel = ax.set_xlabel('Absolute Percentage Change (Day over Day)')
ylabel = ax2.set_ylabel('Number of Days')
y2label = ax.set_ylabel('MAE')

In [None]:
# MAPE per percentage-change threshold (line) over the number of test days
# exceeding that threshold (bars), on twin y-axes sharing one x-axis.
plt.rcParams.update({'figure.figsize': (17, 3), 'figure.dpi': 300})
fig, ax = plt.subplots()
ax2 = ax.twinx()

# Line on `ax`: only the 'MAPE' column is passed, so no explicit x is given.
# NOTE(review): presumably plotted against the row index of `results`, which
# lines up with the categorical bar positions below - confirm.
sns.lineplot(
    data=results['MAPE'],
    color='red',
    legend=True,
    ax=ax)

# Bars on `ax2`: the 'count' column for each 'perc_change' threshold.
sns.barplot(
    x='perc_change',
    y='count',
    data=results,
    color='blue',
    alpha=0.1,
    ax=ax2
)

# NOTE(review): plt.grid targets pyplot's current axes, not an explicit one;
# presumably that is `ax2`, the most recently created - confirm.
plt.grid(linestyle='-', linewidth=0.3)
# The return values are assigned so the cell does not echo a repr.
# Note: `ylabel` holds the ax2 label and `y2label` the ax label.
title = ax.set_title('Model Evaluation for different Price Changes')
xlabel = ax.set_xlabel('Absolute Percentage Change (Day over Day)')
ylabel = ax2.set_ylabel('Number of Days')
y2label = ax.set_ylabel('MAPE')

We will apply some more models and use Google Trends in the coming days!

# Kernel is still under development!

If you have any suggestions, please share them in the comments section. Thank you!