# Google Stock Prediction


In [81]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection, svm
from sklearn.linear_model import LinearRegression

df = pd.read_csv('./dataset/GOOGL.csv')

print(df.head())

         Date       Open       High        Low      Close  Adj Close    Volume
0  2004-08-19  50.050049  52.082081  48.028027  50.220219  50.220219  44659096
1  2004-08-20  50.555557  54.594597  50.300301  54.209209  54.209209  22834343
2  2004-08-23  55.430431  56.796799  54.579578  54.754753  54.754753  18256126
3  2004-08-24  55.675674  55.855858  51.836838  52.487488  52.487488  15247337
4  2004-08-25  52.532532  54.054054  51.991993  53.053055  53.053055   9188602


### Clean Up the dataframe

In [82]:
df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
df['HL_percent'] = (df['High'] - df['Close']) / df['Close'] * 100
df['Percent_change'] = (df['Close'] - df['Open']) / df['Open'] * 100

df = df[['Close', 'HL_percent', 'Percent_change', 'Volume']]

print(df.head())

       Close  HL_percent  Percent_change    Volume
0  50.220219    3.707395        0.340000  44659096
1  54.209209    0.710927        7.227004  22834343
2  54.754753    3.729441       -1.218966  18256126
3  52.487488    6.417472       -5.726354  15247337
4  53.053055    1.886789        0.990858   9188602


### ADD LABEL COLUMN

In [83]:
import math

FORECAST_COL = 'Close'
df.fillna(-9999, inplace=True)

# forecast future close based on today features
FORECAST_OUT = int(math.ceil(0.01*len(df)))

LABEL = 'Future_volume'
df[LABEL] = df[FORECAST_COL].shift(-FORECAST_OUT)
df.dropna(inplace=True)
print(df.head())

       Close  HL_percent  Percent_change    Volume  Future_volume
0  50.220219    3.707395        0.340000  44659096      86.301300
1  54.209209    0.710927        7.227004  22834343      93.793793
2  54.754753    3.729441       -1.218966  18256126      90.990990
3  52.487488    6.417472       -5.726354  15247337      93.078079
4  53.053055    1.886789        0.990858   9188602      96.746750


### Prepare data

In [84]:
# Define features and labels
print(df.columns)
X = np.array(df.drop(LABEL, axis=1))
y = np.array(df[LABEL])

# Scaling the features
X = preprocessing.scale(X)

# create training and test sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

Index(['Close', 'HL_percent', 'Percent_change', 'Volume', 'Future_volume'], dtype='object')


### Prepare Classifier ( Linear Regression )

In [85]:
clf = LinearRegression()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(accuracy)

0.9820591603064874


### Prepare Classifier ( SVM ) -- Default

In [88]:
clf = svm.SVR()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(accuracy)

0.433093430028616


### Prepare Classifier ( SVM ) -- Polynomial

In [90]:
clf = svm.SVR(kernel='poly')
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

print(accuracy)

0.6102743400395245
