<a href="https://colab.research.google.com/github/SighOfFrostmourne/ECOM6022/blob/main/Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4

## Environment setup and loading data

### Environment setup

In [35]:
# Python ≥3.5 and Scikit-Learn ≥0.20 are required
import sys
import sklearn
import numpy as np
import os

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
assert tf.__version__ >= "2.0"

# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures (for report etc)
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
ROOT_PATH = "/content/drive/MyDrive/ECOM6022/Assignment_4"
os.chdir(ROOT_PATH)
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "Assignment_4"
DATA_PATH = os.path.join(ROOT_PATH, "datasets")
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

### Load the data already downloaded to Google Drive

In [36]:
import pandas as pd

def load_data(data_path=DATA_PATH):
    """Read the training and testing CSV files found in ``data_path``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_frame, testing_frame)``, in that order.
    """
    training_frame = pd.read_csv(os.path.join(data_path, "stock_market_training.csv"))
    testing_frame = pd.read_csv(os.path.join(data_path, "stock_market_testing.csv"))
    return training_frame, testing_frame

train_data, test_data = load_data()

## Take a quick look at the data

In [11]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F1      9000 non-null   float64
 1   F2      9000 non-null   float64
 2   F3      9000 non-null   float64
 3   F4      9000 non-null   float64
 4   F5      9000 non-null   float64
 5   F6      9000 non-null   float64
 6   F7      9000 non-null   float64
 7   F8      9000 non-null   float64
 8   F9      9000 non-null   float64
 9   F10     9000 non-null   float64
 10  Class   9000 non-null   int64  
dtypes: float64(10), int64(1)
memory usage: 773.6 KB


In [8]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,Class
0,0.306251,1.227179,-0.590406,-0.843588,1.158658,-2.609219,-2.361603,-1.207631,-0.677193,1.009508,2
1,1.515725,-0.204145,-2.258348,0.945892,-2.120108,-0.304811,1.45566,0.785203,-1.157508,2.52683,1
2,-1.389026,0.11567,0.245705,0.225163,0.840018,-0.671928,-1.570313,-1.382368,0.068508,0.944927,0
3,-0.740551,-0.259994,-1.14641,-1.03987,-2.867487,-1.386472,0.101187,0.897933,-0.735515,2.477461,1
4,-0.927937,0.938241,-0.05446,1.453347,1.140367,0.906949,-1.237445,1.13082,1.584983,0.848553,0


In [10]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
train_data.describe()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,Class
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,-0.001831,0.002335,-0.333083,-0.329106,-0.016856,-0.004721,0.005932,-0.004874,0.008286,-0.007801,0.997667
std,1.006028,1.000002,1.732397,1.561179,1.719284,1.558986,1.000442,0.995425,0.993745,1.487237,0.815381
min,-3.722487,-3.642163,-7.411571,-5.909465,-7.1125,-4.682266,-3.492335,-3.880869,-3.851288,-5.306898,0.0
25%,-0.675746,-0.675234,-1.496181,-1.415918,-1.199785,-1.121013,-0.661181,-0.679693,-0.669489,-1.079929,0.0
50%,0.015263,0.009475,-0.458095,-0.44147,0.041727,-0.093299,0.003924,0.00928,0.012985,-0.022499,1.0
75%,0.677262,0.682157,0.790806,0.738776,1.191769,1.086449,0.681353,0.663657,0.683734,1.068787,2.0
max,3.856141,3.796725,6.492938,6.650862,6.125745,6.133881,4.093397,3.531378,4.079641,5.196704,2.0


### Brief Analysis
There are 10 factors affecting the recommendation result (2 - Buy, 1 - Hold and 0 - Sell). All of them have means close to zero and standard deviations between roughly 1.0 and 1.7. On closer inspection, F3 and F4 have noticeably non-zero means (about -0.33), and F3 - F5 have the largest standard deviations. Therefore, F3 - F5 can be fed as the extra input_A to the "Concat" layer, while all 10 factors are used as input_B.

## Data Preparation

### Split data
Separate the features from the label, then split the full training set further into training and validation sets.

In [44]:
# Columns fed to the model's extra input. Selecting them by name (instead of
# the positional index [2, 3, 4]) keeps the choice in one place and makes it
# independent of column order.
# NOTE(review): assumes the test CSV uses the same column names as the
# training CSV — confirm.
EXTRA_INPUT_COLUMNS = ["F3", "F4", "F5"]

# Separate the features from the "Class" label for both data sets.
X_train_full = train_data.drop("Class", axis=1)
y_train_full = train_data["Class"].copy()
X_test = test_data.drop("Class", axis=1)
y_test = test_data["Class"].copy()

# Split full training set to train and validation set
from sklearn.model_selection import train_test_split
X_train, X_train_valid, y_train, y_train_valid = train_test_split(
    X_train_full, y_train_full,
    stratify=y_train_full,  # keep the class proportions equal in both splits
    test_size=0.1, random_state=42)

# Use F3-F5 as an extra input
X_train_A = X_train[EXTRA_INPUT_COLUMNS]
X_train_A_valid = X_train_valid[EXTRA_INPUT_COLUMNS]
# Slice from X_test (label already removed) rather than from test_data.
X_test_A = X_test[EXTRA_INPUT_COLUMNS]

### Transform
Standardize the features with a `StandardScaler`, which also converts the DataFrames to NumPy arrays.

In [47]:
#No need for a pipeline here
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training split ONLY and reuse them for the
# validation and test splits. Calling fit_transform on every split would leak
# validation/test statistics and scale each split differently.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_train_valid_std = scaler.transform(X_train_valid)
X_test_std = scaler.transform(X_test)

# The 3-feature extra input has a different shape, so it needs its own scaler
# rather than refitting the 10-feature one.
scaler_A = StandardScaler()
X_train_A_std = scaler_A.fit_transform(X_train_A)
X_train_A_valid_std = scaler_A.transform(X_train_A_valid)
X_test_A_std = scaler_A.transform(X_test_A)

## Build the model

In [34]:
import tensorflow as tf
from tensorflow import keras

def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape_1=[3], input_shape_2=[10]):
    """Build and compile a two-input classifier with the Keras functional API.

    The second input goes through a stack of ReLU ``Dense`` layers; the first
    input bypasses that stack and is concatenated with its output before the
    softmax output layer. A multi-input graph cannot be assembled with
    ``.add()``, so the layers are wired explicitly.

    Parameters
    ----------
    n_hidden : int
        Number of hidden ``Dense`` layers applied to the second input.
    n_neurons : int
        Units in each hidden layer.
    learning_rate : float
        Learning rate of the SGD optimizer.
    input_shape_1 : sequence of int
        Per-sample shape of the extra input that skips the hidden stack.
    input_shape_2 : sequence of int
        Per-sample shape of the input that feeds the hidden stack.

    Returns
    -------
    keras.Model
        Compiled model taking ``[input_A, input_B]`` (in that order) and
        producing 3 class probabilities.
    """
    # tuple(...) so the list defaults are never shared with or mutated by Keras.
    input_A = keras.layers.Input(shape=tuple(input_shape_1), name="input_A")
    input_B = keras.layers.Input(shape=tuple(input_shape_2), name="input_B")

    hidden = input_B
    for _ in range(n_hidden):
        hidden = keras.layers.Dense(n_neurons, activation="relu")(hidden)

    # Join the raw extra input with the output of the hidden stack.
    concat = keras.layers.concatenate([input_A, hidden])
    # Output 3 classes (2 - Buy, 1 - Hold, 0 - Sell)
    output = keras.layers.Dense(3, activation="softmax", name="output")(concat)

    model = keras.models.Model(inputs=[input_A, input_B], outputs=output)
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)
    return model