In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('../data/1month.csv')
df.columns = ['open_time', 'open_price', 'high_price', 'low_price', 'close_price', 'volume', 'close_time']
df['open_time'] = pd.to_datetime(df['open_time'] * 1000, unit='ms')
df['close_time'] = pd.to_datetime(df['close_time'], unit='ms')
df.head()

Unnamed: 0,open_time,open_price,high_price,low_price,close_price,volume,close_time
0,2017-09-01,386.44,394.39,192.0,304.36,167937.1,2017-09-30 23:59:59.999
1,2017-10-01,305.13,354.0,272.2,304.9,231137.8,2017-10-31 23:59:59.999
2,2017-11-01,304.89,515.0,274.73,427.43,558140.1,2017-11-30 23:59:59.999
3,2017-12-01,428.05,864.9,375.01,733.98,1709681.0,2017-12-31 23:59:59.999
4,2018-01-01,733.01,1440.0,716.8,1124.81,4449875.0,2018-01-31 23:59:59.999


In [11]:
df['close_day'] = df['close_time'].dt.day
df['close_dayofweek'] = df['close_time'].dt.dayofweek
df['close_month'] = df['close_time'].dt.month
df['close_quarter'] = df['close_time'].dt.quarter
df['close_year'] = df['close_time'].dt.year

In [12]:
df['target'] = np.where(df['close_price'].shift(-1) > df['close_price'], 1, 0)
df['open-close'] = df['open_price'] - df['close_price']
df['high-low'] = df['high_price'] - df['low_price']

In [13]:
df['target'].value_counts()

target
1    38
0    30
Name: count, dtype: int64

In [14]:
features = df[['open-close', 'high-low']]
target = df['target']

scaler = StandardScaler()
features = scaler.fit_transform(features)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(54, 2) (14, 2) (54,) (14,)


In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

models = [
    LogisticRegression(),
    SVC(kernel='poly', probability=True),
    XGBClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier()
]

names = [
    'Logistic Regression',
    'SVM',
    'XGBoost',
    'Random Forest',
    'Gradient Boosting',
    'Decision Tree',
    'KNN'
]

for model, name in zip(models, names):
    model.fit(X_train, y_train)
    
    print(name)
    print('Training Accuracy:', metrics.roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]))
    print('Validation Accuracy:', metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    print()


Logistic Regression
Training Accuracy: 0.6079545454545455
Validation Accuracy: 0.5625

SVM
Training Accuracy: 0.375
Validation Accuracy: 0.7708333333333334

XGBoost
Training Accuracy: 0.9992897727272728
Validation Accuracy: 0.5416666666666667

Random Forest
Training Accuracy: 1.0
Validation Accuracy: 0.5208333333333334

Gradient Boosting
Training Accuracy: 1.0
Validation Accuracy: 0.45833333333333337

Decision Tree
Training Accuracy: 1.0
Validation Accuracy: 0.35416666666666663

KNN
Training Accuracy: 0.6207386363636364
Validation Accuracy: 0.3333333333333333

