In [None]:
import pandas as pd
import numpy as np

## Load Dataset and Clean Dataset

In [None]:
# Load the training set from the working directory (a comma is read_csv's default
# delimiter), then print each column's dtype and non-null count.
df = pd.read_csv("train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201991 entries, 0 to 201990
Data columns (total 28 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    201991 non-null  int64  
 1   last_price            201991 non-null  float64
 2   mid                   201991 non-null  float64
 3   opened_position_qty   141525 non-null  float64
 4   closed_position_qty   141525 non-null  float64
 5   transacted_qty        201991 non-null  float64
 6   d_open_interest       201991 non-null  int64  
 7   bid1                  201991 non-null  float64
 8   bid2                  201990 non-null  float64
 9   bid3                  201990 non-null  float64
 10  bid4                  201990 non-null  float64
 11  bid5                  201990 non-null  float64
 12  ask1                  201990 non-null  float64
 13  ask2                  201990 non-null  float64
 14  ask3                  201990 non-null  float64
 15  

In [None]:
# Keep everything except the final 50,000 rows and strip the trailing space from the
# 'opened_position_qty ' column label. Chaining a non-inplace rename onto the slice
# yields a new DataFrame instead of mutating the sliced frame in place.
# NOTE(review): this cell is not idempotent -- re-running it without reloading the
# data drops a further 50,000 rows.
df = df.iloc[:-50000].rename(columns={'opened_position_qty ': 'opened_position_qty'})

In [None]:
# count the missing values in every column
df.isna().sum()

Unnamed: 0,0
id,0
last_price,0
mid,0
opened_position_qty,44711
closed_position_qty,44711
transacted_qty,0
d_open_interest,0
bid1,0
bid2,0
bid3,0


In [None]:
# Fill gaps in the two position-quantity columns by linear interpolation between the
# neighbouring observed values. A long run of consecutive gaps is filled along a
# straight line, which may be less accurate.
# The interpolated columns are assigned back explicitly: calling
# `df[col].interpolate(inplace=True)` acts on an intermediate Series, which pandas
# flags as chained assignment and which leaves `df` unchanged under Copy-on-Write.
# A gap at the very start of a column has no earlier value to interpolate from and
# stays NaN.
df = df.assign(
    opened_position_qty=df['opened_position_qty'].interpolate(method='linear'),
    closed_position_qty=df['closed_position_qty'].interpolate(method='linear'),
)

In [None]:
# re-count the missing values per column after interpolation
df.isna().sum()

Unnamed: 0,0
id,0
last_price,0
mid,0
opened_position_qty,1
closed_position_qty,1
transacted_qty,0
d_open_interest,0
bid1,0
bid2,0
bid3,0


In [None]:
# preview the first rows; row 0 still has no position quantities after interpolation
df.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1.0,6.0,14.0,6.0,6.0,1.0,1.0,10.0,2.0,1.0
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6.0,11.0,1.0,6.0,1.0,4.0,4.0,1.0,13.0,0.0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1.0,4.0,21.0,12.0,1.0,16.0,10.0,4.0,9.0,0.0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13.0,12.0,2.0,4.0,2.0,7.0,1.0,2.0,11.0,1.0
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12.0,2.0,2.0,4.0,1.0,3.0,1.0,11.0,15.0,1.0


In [None]:
# Drop the rows that still lack a position quantity after interpolation (only the
# first row at this point). Selecting by missing values rather than by position keeps
# the cell idempotent: re-running it no longer discards a further, valid row.
df = df.dropna(subset=['opened_position_qty', 'closed_position_qty'])

In [None]:
# summary statistics for every numeric column
df.describe()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
count,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,...,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0,151990.0
mean,75995.5,3862.327836,3862.329098,1.236305,1.810767,2.435009,-0.149372,3862.140272,3861.906368,3861.687122,...,5.166643,5.731693,6.171952,6.640937,3.913192,5.024186,5.494493,5.841358,6.214113,0.353661
std,43875.878044,26.498086,26.496604,1.953357,2.515919,3.940268,2.403965,26.49685,26.495659,26.494944,...,5.280013,5.730177,6.129232,6.85472,4.539096,5.653891,6.349592,7.227007,7.756851,0.478107
min,1.0,3812.0,3812.1,0.0,0.0,0.0,-69.0,3811.8,3811.6,3811.4,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,37998.25,3840.8,3840.9,0.0,1.0,0.0,-1.0,3840.6,3840.4,3840.2,...,2.0,2.0,2.0,3.0,1.0,2.0,2.0,2.0,2.0,0.0
50%,75995.5,3850.4,3850.3,1.0,1.0,1.0,0.0,3850.2,3850.0,3849.8,...,4.0,4.0,4.0,5.0,2.0,3.0,4.0,4.0,4.0,0.0
75%,113992.75,3888.6,3888.6,1.5,2.0,3.0,0.0,3888.4,3888.2,3888.0,...,6.0,7.0,8.0,8.0,5.0,6.0,7.0,7.0,7.0,1.0
max,151990.0,3917.6,3917.3,50.0,101.0,165.0,44.0,3917.2,3916.8,3916.6,...,118.0,119.0,119.0,120.0,115.0,132.0,133.0,134.0,135.0,1.0


In [None]:
# preview the first rows again now that the leading row has been removed
df.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6.0,11.0,1.0,6.0,1.0,4.0,4.0,1.0,13.0,0.0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1.0,4.0,21.0,12.0,1.0,16.0,10.0,4.0,9.0,0.0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13.0,12.0,2.0,4.0,2.0,7.0,1.0,2.0,11.0,1.0
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12.0,2.0,2.0,4.0,1.0,3.0,1.0,11.0,15.0,1.0
5,5,3843.6,3844.2,12.0,17.0,29.0,-5,3843.8,3843.4,3843.2,...,6.0,1.0,2.0,17.0,1.0,12.0,15.0,10.0,3.0,0.0


In [None]:
# inspect the last rows of the cleaned frame
df.tail()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
151986,151986,3899.2,3899.1,1.5,1.0,0.0,0,3899.0,3898.8,3898.6,...,3.0,3.0,3.0,4.0,1.0,8.0,4.0,5.0,32.0,0.0
151987,151987,3899.0,3899.0,1.0,1.0,2.0,0,3898.8,3898.6,3898.4,...,3.0,3.0,4.0,5.0,1.0,9.0,4.0,5.0,32.0,0.0
151988,151988,3898.8,3899.0,1.0,1.0,2.0,0,3898.8,3898.6,3898.4,...,3.0,3.0,4.0,5.0,1.0,8.0,4.0,5.0,32.0,0.0
151989,151989,3898.8,3898.9,1.0,1.0,0.0,0,3898.8,3898.6,3898.4,...,3.0,3.0,4.0,5.0,1.0,1.0,8.0,4.0,5.0,0.0
151990,151990,3898.8,3898.9,1.0,1.0,0.0,0,3898.8,3898.6,3898.4,...,3.0,3.0,4.0,5.0,1.0,1.0,8.0,4.0,5.0,1.0


## Split the Dataset for Training and Testing

In [None]:
from sklearn.model_selection import train_test_split

# response variable: the 'y' column
y = df['y']

# predictors: every remaining column
# NOTE(review): this keeps the 'id' column as a predictor -- confirm that is intended
X = df.drop(columns='y')

# hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Random Forest Model

### Fitting a Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# hyperparameter grid searched for the random forest
parameters = [{'n_estimators': [100, 500, 700, 1000],
               'max_features': ['sqrt'],
               'max_depth': [3, 5, 7],
               'criterion': ['gini', 'entropy']}]

# exhaustive grid search with 5-fold cross-validation, selecting on accuracy
GridSearch_RandomForest = GridSearchCV(
    estimator=RandomForestClassifier(random_state=123),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# fit every candidate on the training split
GridSearch_RandomForest.fit(X_train, y_train)
print(GridSearch_RandomForest.best_params_)

# Mean cross-validated accuracy of the winning candidate. It is measured on folds held
# out during fitting, so this is the figure to quote for the best model.
print("cross-validated accuracy:", GridSearch_RandomForest.best_score_)

# Accuracy on the same rows the final model was fitted on; optimistic, shown only for
# comparison with the cross-validated figure.
print("training accuracy:", GridSearch_RandomForest.score(X_train, y_train))


{'criterion': 'gini', 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 100}
0.6554296335285216


### Predicted Data from Random Forest

In [None]:
# the search object delegates prediction to the estimator refit with the best parameters
best_random_forest = GridSearch_RandomForest.best_estimator_

# hard class labels and per-class probabilities for the held-out test rows
prediction_RandomForest = best_random_forest.predict(X_test)
prediction_probs_RandomForest = best_random_forest.predict_proba(X_test)

In [None]:
# predicted class labels for the test split
prediction_RandomForest

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# predicted class probabilities for the test split (one column per class)
prediction_probs_RandomForest

array([[0.65473188, 0.34526812],
       [0.73421828, 0.26578172],
       [0.75412756, 0.24587244],
       ...,
       [0.664953  , 0.335047  ],
       [0.57994487, 0.42005513],
       [0.7138974 , 0.2861026 ]])

In [None]:
from sklearn.metrics import confusion_matrix
# rows are the true classes, columns the predicted classes
conf_matrix = confusion_matrix(y_true=y_test, y_pred=prediction_RandomForest)

In [None]:
# display the confusion matrix for the test split
conf_matrix

array([[19476,   256],
       [10291,   375]])