# Import Libraries

In [1]:
!pip install --upgrade ta
!pip install --upgrade category_encoders

Collecting ta
  Downloading https://files.pythonhosted.org/packages/a9/22/a355ecf2d67da8150332d22ef65c3a1f79109528279bf5d40735b6f2bd72/ta-0.7.0.tar.gz
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.7.0-cp37-none-any.whl size=28716 sha256=9d0f81cb8984a9843eb277d934bfd61fdd11e8b16a4e5252b70ee546bbcc295a
  Stored in directory: /root/.cache/pip/wheels/dd/88/30/de9553fb54a474eb7480b937cdbb140bdda613d29cf4da7994
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.7.0
Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 3.7MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [2]:
# Processing
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Prediction
import tensorflow as tf

# Native Libraries
import re

plt.style.use('seaborn-whitegrid')

In [3]:
# Create a dataframe of the stock data
df_stock = pd.read_csv("/content/drive/MyDrive/Colabs/Stock/stocks.csv", thousands=',');

# Work with smaller dataset
stocks_n = 20
codes = list(df_stock["Code"].unique())[30:30+stocks_n]
df_stock = df_stock[df_stock["Code"].isin(codes)]

# Feature Engineering

In [4]:
from ta.trend import SMAIndicator, EMAIndicator, MACD
from ta.volatility import BollingerBands, AverageTrueRange
from ta.momentum import RSIIndicator, StochasticOscillator

from ta import add_all_ta_features

def create_features(df):
  minIndex = df.index.min()

  # Remove unit on volume
  unit_conversion = {"K": 1000, "M": 1000000, "B": 1000000000}
  df["Volume"] = df["Volume"].map(lambda x: 0 if x == "-" else int(float(x[:-1]) * unit_conversion[x[-1]]))

  # Convert to proper data types
  df["Change"] = df["Change%"].map(lambda x: re.sub("[%,]", "", x)).astype(float)
  df["Date"] = pd.to_datetime(df["Date"])

  # Drop Stock Name and Change%
  df = df.drop("Stock Name", axis=1)
  df = df.drop("Change%", axis=1)

  # Sort values by date
  df = df.sort_values("Date")

  # Rescale Prices
  base = df.iloc[0]["Price"]
  for col in ["Price", "Open", "High", "Low"]:
    df[col] = df[col].map(lambda x: x / base)

  # Add Simple Moving Average
  df["SMA20"] = SMAIndicator(close=df["Price"], window=20, fillna=True).sma_indicator()
  df["SMA50"] = SMAIndicator(close=df["Price"], window=50, fillna=True).sma_indicator()
  df["SMA200"] = SMAIndicator(close=df["Price"], window=200, fillna=True).sma_indicator()

  # Add Exponential Moving Average
  df["EMA20"] = EMAIndicator(close=df["Price"], window=20, fillna=True).ema_indicator()
  df["EMA50"] = EMAIndicator(close=df["Price"], window=50, fillna=True).ema_indicator()
  df["EMA200"] = EMAIndicator(close=df["Price"], window=200, fillna=True).ema_indicator()

  # Add Bollinger Bands
  bollinger = BollingerBands(close=df["Price"], fillna=True)
  df["Bollinger_HBand"] = bollinger.bollinger_hband()
  df["Bollinger_LBand"] = bollinger.bollinger_lband()

  # Add MACD
  df["MACD"] = MACD(close=df["Price"], fillna=True).macd()

  # Add RSI
  df["RSI"] = RSIIndicator(close=df["Price"], fillna=True).rsi()

  # Add Stochastic Oscillator
  df["StochasticOscillator"] = StochasticOscillator(high=df["High"], low=df["Low"], close=df["Price"], fillna=True).stoch()

  # Add Average True Range
  df["AverageTrueRange"] = AverageTrueRange(high=df["High"], low=df["Low"], close=df["Price"], fillna=True).average_true_range()
  
  # Append a target column where y = if x increased from the past week
  df["Gain"] = df.index.map(lambda x: np.nan if x < minIndex + 6 else df.loc[x - 6]["Price"] >= df.loc[x]["Price"])

  # Convert Date to Milliseconds
  df["Date"] = df["Date"].astype(int)
  
  # Drop the target that is NaN because there is not enough data to derive Y
  df = df[df["Gain"].notna()]
  df["Gain"] = df["Gain"].astype(int)

  return df

# Work with the data by groups based on the stock code
def add_features(df):
  df = df.groupby("Code").apply(create_features).reset_index(drop=True)
  return df

# Invoke a function to add new features
df_stock_features = add_features(df_stock)
df_stock_features.head()

Unnamed: 0,Code,Date,Price,Open,High,Low,Volume,Change,SMA20,SMA50,SMA200,EMA20,EMA50,EMA200,Bollinger_HBand,Bollinger_LBand,MACD,RSI,StochasticOscillator,AverageTrueRange,Gain
0,AGI,1330560000000000000,1.0,0.962838,1.0,0.962838,42330000,3.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,100.0,100.0,0.0,1
1,AGI,1330646400000000000,1.011824,1.011824,1.030405,1.0,46470000,1.18,1.005912,1.005912,1.005912,1.001126,1.000464,1.000118,1.017736,0.994088,0.000943,100.0,72.5,0.0,0
2,AGI,1330905600000000000,1.015203,1.013514,1.025338,1.010135,27290000,0.33,1.009009,1.009009,1.009009,1.002467,1.001042,1.000268,1.022045,0.995973,0.001941,100.0,77.5,0.0,1
3,AGI,1330992000000000000,0.994932,1.018581,1.018581,0.978041,13390000,-2.0,1.00549,1.00549,1.00549,1.001749,1.000802,1.000215,1.022105,0.988875,0.001084,39.67684,47.5,0.0,1
4,AGI,1331078400000000000,0.988176,0.971284,0.998311,0.971284,17820000,-0.68,1.002027,1.002027,1.002027,1.000456,1.000307,1.000095,1.022342,0.981712,-0.000139,32.614372,37.5,0.0,1


# Preprocess

In [16]:
from sklearn.model_selection import train_test_split

X_full = df_stock_features.copy()
y_full = X_full.pop("Gain")

X_train, X_valid, y_train, y_valid = train_test_split(X_full, y_full, test_size=0.3, random_state=1)
# X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=0.5, random_state=1)

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from category_encoders.hashing import HashingEncoder

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

features_num = list(X_full.select_dtypes(include=np.number).columns)
features_cat = list(X_full.select_dtypes(exclude=np.number).columns)

transformer_num = make_pipeline(
    StandardScaler(),
    Normalizer()
)

transformer_cat = make_pipeline(
    HashingEncoder(cols=features_cat, n_components=9)
)

preprocessor = make_column_transformer(
    (transformer_num, features_num),
    (transformer_cat, features_cat),
)

X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)
# X_test_processed = preprocessor.transform(X_test)

  elif pd.api.types.is_categorical(cols):


# Prediction

Random Tree Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score

# Train model
random_forest = RandomForestClassifier(n_estimators=300)
random_forest.fit(X_train_processed, y_train)

# Predict
pred = random_forest.predict(X_valid_processed)
pred_proba = random_forest.predict_proba(X_valid_processed)

# Show cross entropy score
print("Log Loss", log_loss(y_valid, pred_proba))
print("Accuracy Score", accuracy_score(y_valid, pred))

Log Loss 0.5961579542298813
Accuracy Score 0.6920175989943432


# Archived

MLP Classifier

In [8]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import log_loss

# # Train model
# mlp_classifier = MLPClassifier(early_stopping=True)
# mlp_classifier.fit(X_train_processed, y_train)

# # Predict
# pred = mlp_classifier.predict_proba(X_valid_processed)

# # Show cross entropy score
# print("Log Loss", log_loss(y_valid, pred))

KNN Classifier

In [9]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import log_loss

# # Train model
# knn = KNeighborsClassifier()
# knn.fit(X_train_processed, y_train)

# # Predict
# pred = knn.predict_proba(X_valid_processed)

# # Show cross entropy score
# print("Log Loss", log_loss(y_valid, pred))

Support Vector Machine

In [10]:
# from sklearn.svm import SVC
# from sklearn.metrics import log_loss, accuracy_score

# # Train model
# svc = SVC(probability=True)
# svc.fit(X_train_processed, y_train)

# # Predict
# pred = svc.predict(X_valid_processed)
# pred_proba = svc.predict_proba(X_valid_processed)

# # Show cross entropy score
# print("Log Loss", log_loss(y_valid, pred_proba))
# print("Accuracy", accuracy_score(y_valid, pred))

Naive Bayes

In [11]:
# from sklearn.naive_bayes import GaussianNB
# from sklearn.metrics import log_loss

# # Train model
# nb = GaussianNB()
# nb.fit(X_train_processed, y_train)

# # Predict
# pred = nb.predict_proba(X_valid_processed)

# # Show cross entropy score
# print("Log Loss", log_loss(y_valid, pred))

In [12]:
# from tensorflow import keras
# from tensorflow.keras import layers

# # YOUR CODE HERE: define the model given in the diagram
# model_sequential = keras.Sequential([
#     layers.BatchNormalization(input_shape=[X_train_processed.shape[1]]),
#     layers.Dense(units=256, activation='relu'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.3),
#     layers.Dense(units=256, activation='relu'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.3),
#     layers.Dense(1, activation='sigmoid')
# ])

# model_sequential.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# early_stopping = keras.callbacks.EarlyStopping(
#     patience=5,
#     min_delta=0.001,
#     restore_best_weights=True,
# )

# history = model_sequential.fit(
#     X_train_processed, y_train,
#     validation_data=(X_valid_processed, y_valid),
#     batch_size=1024,
#     epochs=100,
#     callbacks=[early_stopping],
# )

# history_df = pd.DataFrame(history.history)
# history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
# history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [13]:
# feature_columns = []

# df_train = pd.DataFrame(X_train_processed)
# df_train_label = pd.DataFrame(y_train)
# df_valid = pd.DataFrame(X_valid_processed)
# df_valid_label = pd.DataFrame(y_valid)

# df_cols = {col: "ft_" + str(col) for col in list(df_train.columns)}
# df_train.rename(df_cols, inplace=True, axis=1)
# df_valid.rename(df_cols, inplace=True, axis=1)

# for feature in list(df_train.columns):
#   feature_columns.append(tf.feature_column.numeric_column(feature))

# def make_input_fn(X, y, n_epochs=None, shuffle=True):
#   n_samples = len(y)
#   def input_fn():
#     dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
#     if shuffle:
#       dataset = dataset.shuffle(n_samples)
#     # For training, cycle thru dataset as many times as need (n_epochs=None).
#     dataset = (dataset
#       .repeat(n_epochs)
#       .batch(n_samples))
#     return dataset
#   return input_fn

# train_input_fn = make_input_fn(df_train, df_train_label)
# valid_input_fn = make_input_fn(df_valid, df_valid_label, shuffle=False, n_epochs=1)

# params = {'n_trees': 50, 'max_depth': 3, 'n_batches_per_layer': 1, 'center_bias': True}

# est = tf.estimator.BoostedTreesClassifier(feature_columns, **params)

# # Train model.
# est.train(train_input_fn)

# # Evaluation.
# results = est.evaluate(valid_input_fn)
# pd.Series(results).to_frame()