In [1]:
import threading
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,target
0,1525471260,0.90120,0.90130,0.90120,0.90130,134.98,121.646459,4.0,125.08,112.723589,1.0
1,1525471320,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,0.0
2,1525471380,0.90140,0.90140,0.90139,0.90139,2293.06,2066.963991,5.0,0.00,0.000000,0.0
3,1525471440,0.90139,0.90140,0.90138,0.90139,6850.59,6175.000909,19.0,1786.30,1610.149485,0.0
4,1525471500,0.90139,0.90139,0.90130,0.90130,832.30,750.222624,3.0,784.82,707.428900,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2133567,1652817240,0.43060,0.43060,0.42990,0.43040,136274.00,58630.162800,144.0,54216.00,23325.927700,1.0
2133568,1652817300,0.43030,0.43070,0.43030,0.43050,104478.00,44967.837600,99.0,52232.00,22484.030400,1.0
2133569,1652817360,0.43050,0.43120,0.43050,0.43090,212396.00,91526.987200,177.0,108324.00,46673.061600,0.0
2133570,1652817420,0.43110,0.43110,0.43040,0.43060,131047.00,56443.003800,107.0,32713.00,14097.148900,0.0


In [3]:
def remove_outliers(dataframe, column):
    q1 = dataframe[column].quantile(0.25)
    q3 = dataframe[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    dataframe = dataframe[(dataframe[column] >= lower_bound) & (dataframe[column] <= upper_bound)]
    return dataframe

df = remove_outliers(df, 'close')

In [4]:
def generate_features(dataframe):
    temp_df = pd.DataFrame(dataframe)
    temp_df['avg_50'] = temp_df['close'].rolling(window=50).mean()
    temp_df['avg_20'] = temp_df['close'].rolling(window=20).mean()
    temp_df['ema_50'] = temp_df['close'].ewm(span=50, adjust=False).mean()
    temp_df['ema_26'] = temp_df['close'].ewm(span=26, adjust=False).mean()
    temp_df['ema_12'] = temp_df['close'].ewm(span=12, adjust=False).mean()
    temp_df['std_20'] = temp_df['close'].rolling(window=20).std()
    temp_df['shift_1'] = temp_df.shift(-1)['close']
    temp_df['shift_2'] = temp_df.shift(-2)['close']
    temp_df['shift_3'] = temp_df.shift(-3)['close']
    temp_df['shift_5'] = temp_df.shift(-5)['close']
    temp_df['shift_10'] = temp_df.shift(-10)['close']
    temp_df['upper_band'] = temp_df['avg_20'] + (temp_df['std_20'] * 2)
    temp_df['lower_band'] = temp_df['avg_20'] - (temp_df['std_20'] * 2)
    return temp_df

df = generate_features(df)
df

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,...,ema_26,ema_12,std_20,shift_1,shift_2,shift_3,shift_5,shift_10,upper_band,lower_band
0,1525471260,0.90120,0.90130,0.90120,0.90130,134.98,121.646459,4.0,125.08,112.723589,...,0.901300,0.901300,,0.90195,0.90139,0.90139,0.90001,0.89979,,
1,1525471320,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,...,0.901348,0.901400,,0.90139,0.90139,0.90130,0.89808,0.89968,,
2,1525471380,0.90140,0.90140,0.90139,0.90139,2293.06,2066.963991,5.0,0.00,0.000000,...,0.901351,0.901398,,0.90139,0.90130,0.90001,0.90000,0.89956,,
3,1525471440,0.90139,0.90140,0.90138,0.90139,6850.59,6175.000909,19.0,1786.30,1610.149485,...,0.901354,0.901397,,0.90130,0.90001,0.89808,0.90000,0.89734,,
4,1525471500,0.90139,0.90139,0.90130,0.90130,832.30,750.222624,3.0,784.82,707.428900,...,0.901350,0.901382,,0.90001,0.89808,0.90000,0.89979,0.89734,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2133567,1652817240,0.43060,0.43060,0.42990,0.43040,136274.00,58630.162800,144.0,54216.00,23325.927700,...,0.430929,0.430606,0.001255,0.43050,0.43090,0.43060,,,0.433844,0.428826
2133568,1652817300,0.43030,0.43070,0.43030,0.43050,104478.00,44967.837600,99.0,52232.00,22484.030400,...,0.430897,0.430590,0.001203,0.43090,0.43060,0.43010,,,0.433617,0.428803
2133569,1652817360,0.43050,0.43120,0.43050,0.43090,212396.00,91526.987200,177.0,108324.00,46673.061600,...,0.430898,0.430637,0.001137,0.43060,0.43010,,,,0.433384,0.428836
2133570,1652817420,0.43110,0.43110,0.43040,0.43060,131047.00,56443.003800,107.0,32713.00,14097.148900,...,0.430875,0.430632,0.001029,0.43010,,,,,0.433038,0.428922


In [5]:
def process_time(dataframe):
    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], unit='s')
    dataframe['hour'] = dataframe['timestamp'].dt.hour
    dataframe['minute'] = dataframe['timestamp'].dt.minute
    dataframe.drop('timestamp', axis=1, inplace=True)
    return dataframe

df = process_time(df)

In [6]:
def filter_data_train(dataframe):
    cols = ['hour', 'minute', 'open', 'high', 'low', 'close', 'target', 'avg_50', 'avg_20', 'ema_50', 'ema_26',
            'shift_1', 'shift_2', 'shift_3', 'shift_5', 'shift_10', 'volume', 'quote_asset_volume', 'number_of_trades',
            'taker_buy_base_volume', 'taker_buy_quote_volume', 'upper_band', 'lower_band']
    dataframe = dataframe[cols]
    return dataframe

df = filter_data_train(df)
df

Unnamed: 0,hour,minute,open,high,low,close,target,avg_50,avg_20,ema_50,...,shift_3,shift_5,shift_10,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,upper_band,lower_band
0,22,1,0.90120,0.90130,0.90120,0.90130,1.0,,,0.901300,...,0.90139,0.90001,0.89979,134.98,121.646459,4.0,125.08,112.723589,,
1,22,2,0.90185,0.90195,0.90185,0.90195,0.0,,,0.901325,...,0.90130,0.89808,0.89968,1070.54,965.505313,12.0,879.94,793.612703,,
2,22,3,0.90140,0.90140,0.90139,0.90139,0.0,,,0.901328,...,0.90001,0.90000,0.89956,2293.06,2066.963991,5.0,0.00,0.000000,,
3,22,4,0.90139,0.90140,0.90138,0.90139,0.0,,,0.901330,...,0.89808,0.90000,0.89734,6850.59,6175.000909,19.0,1786.30,1610.149485,,
4,22,5,0.90139,0.90139,0.90130,0.90130,0.0,,,0.901329,...,0.90000,0.89979,0.89734,832.30,750.222624,3.0,784.82,707.428900,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2133567,19,54,0.43060,0.43060,0.42990,0.43040,1.0,0.431162,0.431335,0.430399,...,0.43060,,,136274.00,58630.162800,144.0,54216.00,23325.927700,0.433844,0.428826
2133568,19,55,0.43030,0.43070,0.43030,0.43050,1.0,0.431200,0.431210,0.430403,...,0.43010,,,104478.00,44967.837600,99.0,52232.00,22484.030400,0.433617,0.428803
2133569,19,56,0.43050,0.43120,0.43050,0.43090,0.0,0.431238,0.431110,0.430423,...,,,,212396.00,91526.987200,177.0,108324.00,46673.061600,0.433384,0.428836
2133570,19,57,0.43110,0.43110,0.43040,0.43060,0.0,0.431246,0.430980,0.430430,...,,,,131047.00,56443.003800,107.0,32713.00,14097.148900,0.433038,0.428922


In [7]:
X_data, y_data = df.drop(columns=['target']), df['target']
# 9:1 ratio is kept for training and testing respectively
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(X_data, y_data, test_size=0.1, random_state=42, stratify=y_data)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_data)
X_test_scaled = scaler.transform(X_test_data)

In [8]:
progress_counter = 0
estimators_count = 100

def track_progress():
    global progress_counter
    while progress_counter < estimators_count:
        progress_pct = (progress_counter / estimators_count) * 100
        print(f"Training progress... {progress_pct:.2f}% completed")  #Providing update evry 1 minute
        time.sleep(60)

progress_thread = threading.Thread(target=track_progress)
progress_thread.daemon = True
progress_thread.start()

forest_model = RandomForestClassifier(n_estimators=estimators_count, random_state=42, warm_start=True, n_jobs=-1)

for j in range(1, estimators_count + 1):
    forest_model.set_params(n_estimators=j)
    forest_model.fit(X_train_scaled, y_train_data)
    progress_counter = j

progress_counter = estimators_count

Training progress... 0.00% completed
Training progress... 0.00% completed
Training progress... 2.00% completed
Training progress... 3.00% completed
Training progress... 4.00% completed
Training progress... 5.00% completed
Training progress... 6.00% completed
Training progress... 7.00% completed
Training progress... 8.00% completed
Training progress... 9.00% completed
Training progress... 10.00% completed
Training progress... 12.00% completed
Training progress... 13.00% completed
Training progress... 14.00% completed
Training progress... 15.00% completed
Training progress... 16.00% completed
Training progress... 17.00% completed
Training progress... 19.00% completed
Training progress... 20.00% completed
Training progress... 21.00% completed
Training progress... 22.00% completed
Training progress... 23.00% completed
Training progress... 24.00% completed
Training progress... 26.00% completed
Training progress... 27.00% completed
Training progress... 28.00% completed
Training progress... 2

In [9]:
def filter_data_test(dataframe):
    cols = ['hour', 'minute', 'open', 'high', 'low', 'close', 'avg_50', 'avg_20', 'ema_50', 'ema_26',
            'shift_1', 'shift_2', 'shift_3', 'shift_5', 'shift_10', 'volume', 'quote_asset_volume', 'number_of_trades',
            'taker_buy_base_volume', 'taker_buy_quote_volume', 'upper_band', 'lower_band']
    dataframe = dataframe[cols]
    return dataframe

test = pd.read_csv('test.csv')
test = process_time(test)
test = generate_features(test)
test = filter_data_test(test)
predictions = forest_model.predict(X_test_scaled)

print("Model Performance:")
print(f"Accuracy: {accuracy_score(y_test_data, predictions)}")
print(f"F1 Score: {f1_score(y_test_data, predictions)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test_data, predictions)}")
print(f"Classification Report:\n {classification_report(y_test_data, predictions)}")
X_test = test
idx = X_test.index
X_test = scaler.transform(X_test)

Model Performance:
Accuracy: 0.7749557629378258
F1 Score: 0.7505816233874391
Confusion Matrix:
 [[90006 18164]
 [28257 69848]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.83      0.79    108170
         1.0       0.79      0.71      0.75     98105

    accuracy                           0.77    206275
   macro avg       0.78      0.77      0.77    206275
weighted avg       0.78      0.77      0.77    206275



In [10]:
test_df = pd.read_csv('test.csv')
test_df = generate_features(test_df)
test_df = process_time(test_df)

test_df = filter_data_test(test_df)

X_test_final = scaler.transform(test_df)
predictions_final = forest_model.predict(X_test_final)
predictions_df = pd.DataFrame({
    'row_id': test_df.index,
    'target': predictions_final
})

predictions_df.to_csv('submission_result.csv', index=False)