In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn import preprocessing
from tensorflow import keras
from tensorflow.keras import layers

# Let pandas render up to 500 rows before truncating a displayed frame
pd.options.display.max_rows = 500

In [20]:
# Cleaning Data
#
# 1. Drop every column whose share of missing values reaches the threshold.
# 2. Drop every remaining row that still contains a missing value.
# 3. Drop columns that carry no usable signal for the model.

# Maximum tolerated fraction of nulls per column.
NULL_FRACTION_THRESHOLD = 0.15  # TO-DO: Tinker around with mean threshold.

df = pd.read_csv('data/stock_XY_train.csv')
print('OG Data Size:{}'.format(df.shape))

# Compute the null fraction once and derive BOTH the kept and the dropped
# column sets from the same mask. Using two independent comparisons
# (`> t` and `< t`) leaves a column sitting exactly on the threshold out of
# both sets, so it would be removed here but never removed from the test frame.
null_fraction = df.isnull().mean()
keep_mask = null_fraction < NULL_FRACTION_THRESHOLD
dropped_columns = df.columns[~keep_mask]  # reused when preparing the test frame

df = df.loc[:, keep_mask].dropna()
print('New Data Size:{}'.format(df.shape))

df = df.drop(columns=[
    'operatingProfitMargin',  # Got rid of this column because it is all `1`. No reason to keep.
    'Ticker',
    'Sector',
    'Yr',
])

OG Data Size:(12379, 226)
New Data Size:(8977, 156)


In [21]:
# Hold out 20% of the cleaned rows for testing; the fixed random_state keeps
# the split repeatable between runs
train_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(index=train_df.index)

# Summary statistics of the training rows, one row per feature after the
# transpose. The 'Buy' label is excluded because it is not normalized.
train_stats = train_df.describe().drop(columns='Buy').T

# Pull the label column out of both frames, leaving only features behind
train_label, test_label = train_df.pop('Buy'), test_df.pop('Buy')

# Normalizing Data
def norm(x, stats=None):
    """Z-score the columns of ``x`` using per-column mean and std.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame whose column labels match the index of ``stats``.
    stats : pandas.DataFrame, optional
        Frame indexed by feature name with ``'mean'`` and ``'std'`` columns.
        Defaults to the module-level ``train_stats`` so that every data set is
        scaled with the *training* statistics.

    Returns
    -------
    pandas.DataFrame
        ``(x - mean) / std`` computed column-wise.
    """
    if stats is None:
        stats = train_stats
    # A constant feature has std == 0; dividing by it would fill the column
    # with NaN/inf. Dividing by 1 instead maps such a column to all zeros.
    safe_std = stats['std'].replace(0, 1.0)
    return (x - stats['mean']) / safe_std

# Scale both splits with the same (training-derived) statistics
normed_train_data, normed_test_data = (
    norm(frame) for frame in (train_df, test_df)
)

In [22]:
def build_model():
    """Create and compile the feed-forward network used in this notebook.

    Architecture: two 64-unit ReLU hidden layers followed by a single linear
    output unit. The input width is the number of columns in ``train_df``.
    The model is compiled with mean-squared-error loss, an Adam optimizer and
    the 'mae', 'mse' and 'accuracy' metrics.

    NOTE(review): 'Buy' looks like a binary label, while MSE with a linear
    output treats the task as regression. Confirm this is intended.

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    n_features = len(train_df.keys())

    network = keras.Sequential()
    network.add(layers.Dense(64, activation='relu', input_shape=[n_features]))
    network.add(layers.Dense(64, activation='relu'))
    network.add(layers.Dense(1))

    adam_settings = dict(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        decay=0.001,
        name='Adam',
    )

    network.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(**adam_settings),
        metrics=['mae', 'mse', 'accuracy'],
    )
    return network


model = build_model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                9728      
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 13,953
Trainable params: 13,953
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Training hyper-parameters
EPOCHS = 200
BATCH_SIZE = 32

# Fit silently (verbose=0); 20% of the training rows are set aside by Keras
# as a validation split. The returned History object feeds the cells below.
history = model.fit(
    x=normed_train_data,
    y=train_label,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=0,
    validation_split=0.2,
)

In [None]:
# One row per epoch: every recorded metric plus the epoch number
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist

In [None]:
# Accuracy per epoch. Swap 'accuracy' for 'loss' to inspect the loss curve instead.
fig, ax = plt.subplots()
ax.plot(hist['epoch'], hist['accuracy'], label='train')
# The validation curve is only recorded when fit() was given validation data.
if 'val_accuracy' in hist:
    ax.plot(hist['epoch'], hist['val_accuracy'], label='validation')
ax.set(title='Accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
ax.legend();

In [None]:
# NOTE(review): this calls fit() again on the same `model` object, so it
# continues from the current weights for another 200 epochs, this time on all
# training rows (no validation split). Confirm that is intended.
model.fit(
    x=normed_train_data,
    y=train_label,
    epochs=200,
)

In [24]:
# Score the trained model on the held-out rows.
results = model.evaluate(normed_test_data, test_label, batch_size=128)

# evaluate() returns the loss followed by the metrics passed to compile(), in
# that order. The model is compiled with ['mae', 'mse', 'accuracy'], so there
# are four values, not just "loss" and "acc". Label each one explicitly.
metric_names = ['loss', 'mae', 'mse', 'accuracy']
for metric_name, metric_value in zip(metric_names, results):
    print('test {}: {}'.format(metric_name, metric_value))

test loss, test acc: [166713.25047229897, 10.212719, 166713.25, 0.6395543]


# Submission File Generation

In [27]:
# Load the unlabeled test rows. The first unnamed column is discarded and the
# second one is renamed so that it becomes the row id written to the submission.
dfTest = (
    pd.read_csv('data/stock_X_test.csv')
    .drop('Unnamed: 0', axis=1)
    .rename({'Unnamed: 0.1': 'Unnamed: 0'}, axis='columns')
)

# Mirror the column clean-up that was applied to the training frame.
dfTest = dfTest.drop(dropped_columns, axis=1) # TO-DO: Tinker around with mean threshold.
dfTest = dfTest.drop(columns=[
    'operatingProfitMargin',  # Got rid of this column because it is all `1`. No reason to keep.
    'Ticker',
    'Sector',
    'Yr',
])

# The network was fitted on z-scored features, so the test features must go
# through the same scaling, in the same column order, before predict().
# Selecting by train_df.columns also raises a KeyError if a feature is missing.
test_features = norm(dfTest[train_df.columns])

# Training rows with missing values were dropped, but every test row needs a
# prediction. After scaling, 0 is the training mean, so filling with 0 is
# mean imputation; inf (from a zero std) is treated the same way.
test_features = test_features.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# The model regresses the label with MSE, so the decision boundary is the
# midpoint of the two label values. This matches the 0.5 cut-off Keras uses
# for the 'accuracy' metric. Assumes 'Buy' is encoded 0/1 (TODO confirm).
DECISION_THRESHOLD = 0.5
predictions = model.predict(test_features).ravel()
dfTest['Buy'] = (predictions > DECISION_THRESHOLD).astype(int)

dfTest[['Unnamed: 0', 'Buy']].to_csv('submission.csv', index=False, header=True)

  # Remove the CWD from sys.path while we load stuff.
