In [1]:
import pandas as pd
import os
import glob
from datetime import datetime
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


In [2]:
path = '/Users/ishantchaudhary/DL_Assignment/Crypto_data_minute/'
extension = 'txt'
os.chdir(path)
result = glob.glob('*.{}'.format(extension))

In [3]:
#Appending the dataset of different cryptos with necessary transformations
dfs = list()
tr_dfs = list()
tst_dfs = list()
for f in result:
#     data = pd.read_csv(f)
    data = pd.read_csv(f,header = None)

    size = len(f)
    data['Crypto'] = f[:size - 9]
    data.columns = [ 'DateTime','Open', 'High', 'Low', 'Close', 'Volume','Crypto']
    data['DateTime']= pd.to_datetime(data['DateTime'])
    data = data.resample('H', on='DateTime').max()
    data['DateTime'] = data.index
    # adding percentage change in closing price 
    data['Change_close'] = data['Close'].pct_change()
    data['Change_close'] = data['Change_close'].bfill()
#   data['true_labels'] = (data['Change_close'] > 0).astype(int)
    # feature engineering basic 
    data['Close t-1'] = data['Close'].shift()
    data['Close t-1'] = data['Close t-1'].bfill()
    data['Change_close t-1'] = data['Change_close'].shift()
    data['Change_close t-1'] = data['Change_close t-1'].bfill()
    
    # add true labels on basis of change close 
    data['true_labels'] = (data['Change_close'] > 0).astype(int)
    
    data['naive_pred'] = data['true_labels'].shift()
    data['naive_pred'] = data['naive_pred'].bfill()
    # getting train and test
    training_data = data[data.DateTime<datetime(2021, 9, 30, 23)]
    test_data = data[data.DateTime>=datetime(2021, 9, 30, 23)]
    
    
    dfs.append(data)
    tr_dfs.append(training_data)
    tst_dfs.append(test_data)

df = pd.concat(dfs, ignore_index=True)
train_df = pd.concat(tr_dfs, ignore_index=True)
test_df = pd.concat(tst_dfs, ignore_index=True)
# df.columns = [ 'DateTime','Open', 'High', 'Low', 'Close', 'Volume','Crypto']

 

In [4]:
train_df.drop(['Open', 'High', 'Low'], axis=1,inplace =True)
test_df.drop(['Open', 'High', 'Low'], axis=1,inplace =True)

In [5]:
train_df.sort_values(by=['DateTime'],inplace=True)
train_df.reset_index(inplace=True)
train_df = train_df.iloc[: , 1:]

In [6]:
test_df.sort_values(by=['DateTime'],inplace=True)
test_df.reset_index(inplace=True)
test_df = test_df.iloc[: , 1:]

In [7]:
train_df=train_df.dropna()
test_df=test_df.dropna()

In [8]:
train_df.isna().sum()

DateTime            0
Close               0
Volume              0
Crypto              0
Change_close        0
Close t-1           0
Change_close t-1    0
true_labels         0
naive_pred          0
dtype: int64

In [9]:
test_df.isna().sum()

DateTime            0
Close               0
Volume              0
Crypto              0
Change_close        0
Close t-1           0
Change_close t-1    0
true_labels         0
naive_pred          0
dtype: int64

In [10]:
def data_model(train_df,test_df,test_start,test_end):
    xtr, xts = train_df.drop(['Close','Volume','Change_close','true_labels','naive_pred'], axis=1),\
               test_df.drop(['Volume','Change_close','Close','true_labels','naive_pred'], axis=1)
    # xtr['DateTime'], xts['DateTime'] = xtr['DateTime'].astype(str), 
    #xts['DateTime'].astype(str)

    xtr.index,xts.index = xtr['DateTime'], xts['DateTime']
    xtr, xts = xtr.drop(['DateTime'], axis=1), xts.drop(['DateTime'], axis=1)
    
    LE = LabelEncoder()
    xtr['Crypto'],xts['Crypto'] = LE.fit_transform(xtr['Crypto']), \
                                    LE.fit_transform(xts['Crypto'])
    ytr, yts = train_df['true_labels'].values, test_df['true_labels'].values
    
    # setting test data according to time specified
#     2021-09-30 23:00:00
    xts = xts.loc[test_start:test_end]
    size = xts.shape[0]
    yts = yts[0:size]
    
    return xtr,ytr,xts,yts
    

In [12]:
no_est = 50
def model_rfc(xtr,ytr,xts,yts,no_est):
    clf = RandomForestClassifier(n_estimators=no_est, n_jobs=-1, random_state=0)
    clf.fit(xtr, ytr)
    pred = clf.predict(xts)
    auc = metrics.roc_auc_score(yts, pred)
    print("No of Estimators",no_est)
    print("Timeline:",str(xts.index[0]),"---->",str(xts.index[-1]))
    print("AUC Score:",auc)
    pred_result = xts.copy()
    pred_result['Actual'] = yts.copy()
    pred_result['Predicted'] = pred
    pred_result.head()

In [13]:
test_start = '2021-09-30 23:00:00'
test_end = '2021-10-30 23:00:00'
xtr,ytr,xts,yts = data_model(train_df,test_df,test_start,test_end)

In [14]:
no_est = 50
model_rfc(xtr,ytr,xts,yts,no_est)

No of Estimators 50
Timeline: 2021-09-30 23:00:00 ----> 2021-10-30 23:00:00
AUC Score: 0.5246329488778055


In [15]:
no_est = 250
model_rfc(xtr,ytr,xts,yts,no_est)

No of Estimators 250
Timeline: 2021-09-30 23:00:00 ----> 2021-10-30 23:00:00
AUC Score: 0.5287507793017456
