In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Load dataset

In [57]:
import pandas as pd
import numpy as np
# train = pd.read_csv('gdrive/MyDrive/Columbia/BDA_PJ/train.csv', index_col=0)
# test = pd.read_csv('gdrive/MyDrive/Columbia/BDA_PJ/test.csv', index_col=0)

train = pd.read_csv('gdrive/MyDrive/Columbia/BDA_PJ/weighted_train.csv', index_col=0)
test = pd.read_csv('gdrive/MyDrive/Columbia/BDA_PJ/weighted_test.csv', index_col=0)

use_seq = True

train.head()

Unnamed: 0,index,Date,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,avg,prev_label,prev_rate,label
2184,ADBE,2016-06-02,0.296,0.0,0.0,0.0,0.3243,0.0,0.3818,0.0,0.0516,0.0,0.0,0.0,0.0,0.4404,0.61645,0.0,0.38535,0.3818,0.0,0.6369,0.0,0.4404,0.0,-0.4585,0.289575,True,0.0019,False
3152,LRCX,2016-04-20,0.0,0.0,0.0,0.0,-0.147533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1531,0.0,0.0,0.0,0.0,0.4019,0.0,0.096378,0.0,-0.1366,0.021646,True,0.003764,False
41,ILMN,2016-03-28,0.0,0.2023,0.0,0.0,0.3182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.064,0.0,0.0,0.25,0.0,0.0,0.0,-0.2235,0.0,0.0,0.0,0.017349,False,-0.028358,True
3095,DISCA,2016-03-29,0.0,0.0,0.0,0.0,0.0,0.0,0.3818,0.0,0.0,0.0,0.0,0.34,0.0,0.3818,0.279033,0.4003,0.0,0.0,0.0,0.0,0.0,0.5788,0.0,0.0,0.36446,True,0.001774,False
536,TRIP,2016-04-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27115,0.3885,0.0,0.0,0.0,0.24695,0.35125,-0.085333,0.34,0.0,0.0,0.0,0.0,0.0,0.4246,0.0,0.148458,False,-0.012993,False


In [58]:
def get_X_y(data, use_seq=False):
  if use_seq:
    hours_range = [str(i) for i in range(24)]
    feats = hours_range+['avg', 'prev_rate']
  else:
    feats = ['avg', 'prev_rate']

  X = data[feats].to_numpy()
  # X = data[['avg', 'prev_rate']].to_numpy()
  y = np.reshape(data['label'].to_numpy(), (-1,1))
  return X, y

train_x, train_y = get_X_y(train, use_seq)
test_x, test_y = get_X_y(test, use_seq)

print(train_x.shape, train_y.shape)

(2852, 26) (2852, 1)


In [44]:
train[train['avg'].isna()]

Unnamed: 0,index,Date,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,avg,prev_label,prev_rate,label


## Train & Eval Model

In [45]:
def eval_by_stock(test, clf, use_seq=False):
  res = {'stock':[], 'avg_acc':[], 'pos_cnt':[], 'neg_cnt':[]}
  if use_seq:
    hours_range = [str(i) for i in range(24)]
    feats = hours_range+['avg', 'prev_rate']
  else:
    feats = ['avg', 'prev_rate']

  for stock, group in test.groupby('index'):
    X = group[feats].to_numpy()
    y = np.reshape(group['label'].to_numpy(), (-1,1))
    acc = clf.score(X, y)
    res['stock'].append(stock)
    res['avg_acc'].append(acc)
    res['pos_cnt'].append(sum(group['label']==True))
    res['neg_cnt'].append(sum(group['label']==False))
  
  res_df = pd.DataFrame.from_dict(res).sort_values(by='avg_acc', ascending=False)
  return res_df

### Logistic model

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

clf = make_pipeline(StandardScaler(),
                    LogisticRegression(random_state=0))

clf.fit(train_x, train_y)
train_score = clf.score(train_x, train_y)
test_score = clf.score(test_x, test_y)
train_auc = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
test_auc = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])

print('[Acc] Train: {:.3f} Test: {:.3f}'.format(train_score, test_score))
print('[AUC] Train: {:.3f} Test: {:.3f}'.format(train_auc, test_auc))

stock_acc = eval_by_stock(test, clf, use_seq)
stock_acc.head(10)

  y = column_or_1d(y, warn=True)


[Acc] Train: 0.541 Test: 0.500
[AUC] Train: 0.554 Test: 0.504


Unnamed: 0,stock,avg_acc,pos_cnt,neg_cnt
81,XLNX,1.0,9,0
31,FAST,0.888889,7,2
36,HSIC,0.875,7,1
19,CSCO,0.777778,7,2
79,WBA,0.777778,6,3
60,PCAR,0.777778,5,4
74,TXN,0.777778,6,3
12,BMRN,0.777778,4,5
69,SWKS,0.75,5,3
73,TSLA,0.714286,5,2


### SVM model

In [47]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

clf = make_pipeline(StandardScaler(),
                    SVC(gamma='auto', probability=True))

clf.fit(train_x, train_y)
train_score = clf.score(train_x, train_y)
test_score = clf.score(test_x, test_y)

train_auc = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
test_auc = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])

print('[Acc] Train: {:.3f} Test: {:.3f}'.format(train_score, test_score))
print('[AUC] Train: {:.3f} Test: {:.3f}'.format(train_auc, test_auc))

stock_acc = eval_by_stock(test, clf, use_seq)
stock_acc.head(10)

  y = column_or_1d(y, warn=True)


[Acc] Train: 0.748 Test: 0.517
[AUC] Train: 0.844 Test: 0.523


Unnamed: 0,stock,avg_acc,pos_cnt,neg_cnt
64,ROST,1.0,5,4
70,TMUS,0.888889,7,2
32,FB,0.857143,4,3
34,GILD,0.777778,6,3
79,WBA,0.777778,6,3
37,ILMN,0.777778,4,5
47,MAT,0.777778,7,2
74,TXN,0.777778,6,3
72,TSCO,0.777778,5,4
13,CA,0.777778,7,2


### Decision tree model

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

clf = make_pipeline(DecisionTreeClassifier(random_state=0))

clf.fit(train_x, train_y)
train_score = clf.score(train_x, train_y)
test_score = clf.score(test_x, test_y)

train_auc = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
test_auc = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])

print('[Acc] Train: {:.3f} Test: {:.3f}'.format(train_score, test_score))
print('[AUC] Train: {:.3f} Test: {:.3f}'.format(train_auc, test_auc))

stock_acc = eval_by_stock(test, clf, use_seq)
stock_acc.head(10)

[Acc] Train: 1.000 Test: 0.534
[AUC] Train: 1.000 Test: 0.534


Unnamed: 0,stock,avg_acc,pos_cnt,neg_cnt
16,CHTR,0.888889,6,3
33,FISV,0.888889,4,5
77,VRSK,0.875,5,3
73,TSLA,0.857143,5,2
79,WBA,0.777778,6,3
71,TRIP,0.777778,7,2
63,REGN,0.777778,2,7
41,JD,0.75,2,6
40,INTU,0.75,5,3
23,DISCK,0.75,5,3


### KNN model

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(train_x, train_y)

train_score = clf.score(train_x, train_y)
test_score = clf.score(test_x, test_y)

train_auc = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
test_auc = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])

print('[Acc] Train: {:.3f} Test: {:.3f}'.format(train_score, test_score))
print('[AUC] Train: {:.3f} Test: {:.3f}'.format(train_auc, test_auc))

stock_acc = eval_by_stock(test, clf, use_seq)
stock_acc.head(10)

  return self._fit(X, y)


[Acc] Train: 0.750 Test: 0.515
[AUC] Train: 0.812 Test: 0.513


Unnamed: 0,stock,avg_acc,pos_cnt,neg_cnt
72,TSCO,0.888889,5,4
64,ROST,0.777778,5,4
2,ADBE,0.777778,4,5
3,ADP,0.777778,6,3
60,PCAR,0.777778,5,4
24,DISH,0.777778,2,7
7,AMGN,0.777778,3,6
34,GILD,0.777778,6,3
55,NTES,0.777778,5,4
69,SWKS,0.75,5,3


### Random Forest model

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(max_depth=30, random_state=0)
clf.fit(train_x, train_y)

train_score = clf.score(train_x, train_y)
test_score = clf.score(test_x, test_y)

train_auc = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
test_auc = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])

print('[Acc] Train: {:.3f} Test: {:.3f}'.format(train_score, test_score))
print('[AUC] Train: {:.3f} Test: {:.3f}'.format(train_auc, test_auc))

stock_acc = eval_by_stock(test, clf, use_seq)
stock_acc.head(10)

  import sys


[Acc] Train: 0.994 Test: 0.518
[AUC] Train: 1.000 Test: 0.534


Unnamed: 0,stock,avg_acc,pos_cnt,neg_cnt
72,TSCO,0.888889,5,4
52,NCLH,0.888889,2,7
70,TMUS,0.888889,7,2
7,AMGN,0.777778,3,6
19,CSCO,0.777778,7,2
16,CHTR,0.777778,6,3
74,TXN,0.777778,6,3
25,DLTR,0.777778,6,3
37,ILMN,0.777778,4,5
47,MAT,0.777778,7,2


## RNN for sequence

In [59]:
def get_seq_X_y(data):
  feats = [str(i) for i in range(24)]
  X = data[feats].to_numpy()
  X = np.reshape(X, (-1,24,1))

  y = np.reshape(data['label'].to_numpy(), (-1,1))
  return X, y

train_x, train_y = get_seq_X_y(train)
test_x, test_y = get_seq_X_y(test)

In [87]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as n
import numpy as np

class MyDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        data = torch.Tensor(self.X[index])
        label = torch.Tensor(self.Y[index])
        return (data, label)

train_set = MyDataset(train_x, train_y)
test_set = MyDataset(test_x, test_y)

In [95]:
class RNN_clf(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN_clf, self).__init__()

        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input, hidden):
        output, hn = self.rnn(input, hidden)
        pred = self.output_layer(output[:, -1, :])
        pred = self.sigmoid(pred)
        return pred, hn

# CONFIG for RNN
n_hidden = 8
n_layers = 2

# CONFIG of hyper-parameters
epoch_num = 100
batch_size = 32
criterion = nn.BCELoss()
learning_rate = 0.005

# data
train_dataloader = DataLoader(train_set, batch_size = batch_size, shuffle = True)

# model
rnn = RNN_clf(input_size=1, hidden_size=n_hidden, output_size=1)
print(rnn)
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

RNN_clf(
  (rnn): RNN(1, 8, num_layers=2, batch_first=True)
  (output_layer): Linear(in_features=8, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [98]:
from tqdm import tqdm

h_state = None  # 初始化隐藏层

loss_history = []
mytqdm = tqdm(range(epoch_num))
for step in mytqdm:
  optimizer.zero_grad()

  # batch训练
  batch_loss = []
  for batch_X, batch_y in train_dataloader:
    # batch_X: [batch, timesteps, input_size]
    prediction,h_state = rnn(batch_X, None) 
    loss = criterion(prediction, batch_y)
    batch_loss.append(loss.item())

    avg_loss = torch.mean(loss)
    avg_loss.backward()
    optimizer.step()
  
  mytqdm.set_description('[Epoch]: {}, [Loss]: {:.4f}'.format(step, np.mean(batch_loss)))


[Epoch]: 99, [Loss]: 0.7137277444203695: 100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
