In [55]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn import preprocessing
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

In [133]:
dataDF = pd.read_csv('AAPL.csv')

In [134]:
def features(df):
    new_df = pd.DataFrame() 
    new_df['date'] = df['Date']
    new_df['avg_prices'] = (df['Low'] + df['High']).apply(lambda x: x/2.0)
    new_df['shifted_adj_close'] = df['Adj Close'].shift(-1)
    #new_df['avg_close_week'] = df['Close'].rolling(window=5, center=False).mean()
    #new_df['avg_close_month'] = df['Close'].rolling(window=21, center=False).mean()
    #new_df['avg_close_quarter'] = df['Close'].rolling(window=84, center=False).mean()
    #new_df['std_close_week'] = df['Close'].rolling(window=5, center=False).std()
    #new_df['std_close_month'] = df['Close'].rolling(window=21, center=False).std()
    #new_df['std_close_quarter'] = df['Close'].rolling(window=84, center=False).std()
    new_df['open'] = df['Open']
    new_df['high'] = df['High']
    new_df['low'] = df['Low']
    new_df['close'] = df['Close']
    new_df['volume'] = df['Volume']
    new_df = new_df.dropna(axis=0)
    return new_df

In [135]:
dataDF_new = features(dataDF)

## Splitting data

In [136]:
'''
training ----> start : 2017/11/1
validation ----> 2017/11/1 : 2018/4/1
test ----> 2018/4/1 : end
'''
train_split_time = pd.datetime(2017,11,1)
val_split_time = pd.datetime(2018,4,1)

train_data = dataDF_new.loc[pd.to_datetime(dataDF_new['date']) <= train_split_time]
val_data = dataDF_new.loc[(pd.to_datetime(dataDF_new['date']) > train_split_time) & (pd.to_datetime(dataDF_new['date']) <= val_split_time)]
test_data = dataDF_new.loc[pd.to_datetime(dataDF_new['date']) > val_split_time]

## Normalizing data

In [142]:

def split_data(train_data, test_data, predicted_days):
    '''
    this function takes train data and test data and return splitted normalized data and labels
    '''
    Open = train_data.open
    high = train_data.high
    low = train_data.low
    close = train_data.close 
    volume = train_data.volume
    avg_prices = train_data.avg_prices
    adj_close = train_data.shifted_adj_close
    #close_shifted = adj_close.shift(-predicted_days) 
    data = pd.concat([Open, high, low, close, avg_prices,volume,adj_close], axis=1)
    data.columns = ['Open', 'high', 'low', 'close', 'avg_prices','volume','adj_close']
    #data = data.dropna()
    y_train = data['adj_close']
    cols = ['Open', 'high', 'low', 'close', 'avg_prices','volume']
    X_train = data[cols]
    
    Open = test_data.open
    high = test_data.high
    low = test_data.low
    close = test_data.close 
    volume = test_data.volume
    avg_prices = test_data.avg_prices
    adj_close = test_data.shifted_adj_close
    #close_shifted = adj_close.shift(-predicted_days) 
    data = pd.concat([Open, high, low, close, avg_prices,volume, adj_close], axis=1)
    data.columns = ['Open', 'high', 'low', 'close', 'avg_prices','volume','adj_close']
    #data = data.dropna()
    y_test = data['adj_close']
    cols = ['Open', 'high', 'low', 'close', 'avg_prices','volume']
    X_test = data[cols]
    
    scaler_x = preprocessing.MinMaxScaler(feature_range=(0,1))
    X_train = np.array(X_train).reshape((len(X_train),len(cols)))
    X_train = scaler_x.fit_transform(X_train)
    X_test = np.array(X_test).reshape((len(X_test),len(cols)))
    X_test = scaler_x.fit_transform(X_test)
    
    scaler_y = preprocessing.MinMaxScaler(feature_range=(0,1))
    y_train = np.array(y_train).reshape(len(y_train),1)
    y_train = scaler_y.fit_transform(y_train)
    y_train = y_train.ravel()
    y_test = np.array(y_test).reshape(len(y_test),1)
    y_test = scaler_y.fit_transform(y_test)
    y_test = y_test.ravel()
    
    return X_train, X_test, y_train, y_test



In [143]:
X_train, X_test, y_train, y_test = split_data(train_data,test_data, predicted_days=1)


In [144]:
X_test.shape

(91, 6)

## Linear Regression

#### After running all the below lines, i found out that both models overfit the data

In [145]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train,y_train)
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))


Training set score: 0.99
Test set score: 0.93


In [148]:
from sklearn.linear_model.stochastic_gradient import SGDRegressor
cln = SGDRegressor()
cln.fit(X_train,y_train)
y_pred = cln.predict(X_test)
print("Training set score: {:.2f}".format(cln.score(X_train, y_train)))
print("Test set score: {:.2f}".format(cln.score(X_test, y_test)))

Training set score: 0.97
Test set score: 0.91


In [149]:
print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))

Test set score: 0.00
