Title: [Practical machine learning (Regression)](https://www.youtube.com/watch?v=lN5jesocJjk&list=PLQVvvaa0QuDfKTOs3Keq_kaG2P55YRn5v&index=3)

## Import libraries and data 


In [1]:
import pandas as pd 
import quandl
import math
import numpy as np 
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

## Create dataframe and features

In [2]:
data = quandl.get('WIKI/GOOGL')
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


In [3]:
data = data[['Adj. Open', 'Adj. High' , 'Adj. Low', 'Adj. Close', 'Adj. Volume',]]
data.head()

Unnamed: 0_level_0,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,52.542193,54.167209,52.10083,53.164113,9188600.0


## Create more features to work with


In [4]:
# precent volatility
data['HL_PCT'] = (data['Adj. High'] - data['Adj. Close']) / data['Adj. Close'] * 100.0

# daily percent change
data['PCT_change'] = (data['Adj. Close'] - data['Adj. Open']) / data['Adj. Open'] * 100.0

data.head()


Unnamed: 0_level_0,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,HL_PCT,PCT_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-08-19,50.159839,52.191109,48.128568,50.322842,44659000.0,3.712563,0.324968
2004-08-20,50.661387,54.708881,50.405597,54.322689,22834300.0,0.710922,7.227007
2004-08-23,55.551482,56.915693,54.693835,54.869377,18256100.0,3.729433,-1.22788
2004-08-24,55.792225,55.972783,51.94535,52.597363,15247300.0,6.417469,-5.726357
2004-08-25,52.542193,54.167209,52.10083,53.164113,9188600.0,1.886792,1.183658


In [5]:
## drop superfluous features

data1 = data.drop(data.iloc[:, 0:3], axis=1)
data1.head()

Unnamed: 0_level_0,Adj. Close,Adj. Volume,HL_PCT,PCT_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,50.322842,44659000.0,3.712563,0.324968
2004-08-20,54.322689,22834300.0,0.710922,7.227007
2004-08-23,54.869377,18256100.0,3.729433,-1.22788
2004-08-24,52.597363,15247300.0,6.417469,-5.726357
2004-08-25,53.164113,9188600.0,1.886792,1.183658


## Problem formulation
The speaker aims to use the previous data points
of adjusted close price to make step ahead prediciton.

In [6]:
forecast_col = 'Adj. Close'

## fill nana with outliers
data1.fillna(-99999, inplace=True)

In [14]:
## make steps ahead forecast with the previous datapoints

forecast_out = int(math.ceil(0.01 * len(data)))

# data1['label'] = data1[forecast_col]

## the data has been shifted up by forecast_out 
data1['label'] = data1[forecast_col].shift(-forecast_out)

## dropping the nans deletes that last(forecast_out) rows
data1.dropna(inplace=True)
data1.head()

Unnamed: 0_level_0,Adj. Close,Adj. Volume,HL_PCT,PCT_change,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.322842,44659000.0,3.712563,0.324968,69.078238
2004-08-20,54.322689,22834300.0,0.710922,7.227007,67.839414
2004-08-23,54.869377,18256100.0,3.729433,-1.22788,68.912727
2004-08-24,52.597363,15247300.0,6.417469,-5.726357,70.668146
2004-08-25,53.164113,9188600.0,1.886792,1.183658,71.219849


## Create input and output

In [15]:
X = np.array(data1.drop(['label'], 1))
X = X[:-forecast_out]
X_lately = X[-forecast_out:]
X = preprocessing.scale(X)

data1.dropna(inplace=True)
y = np.array(data1['label'])
y = np.array(data1['label'])

## Test/Train

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ValueError: Found input variables with inconsistent numbers of samples: [3354, 3389]

## Create classifier

In [10]:
lin_clf = LinearRegression()
svm_clf = svm.SVR(kernel='poly')

clf = lin_clf
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

In [11]:
print(accuracy)

0.9786256048019195
