In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from datetime import datetime as dt

# Seed NumPy's global random generator so any NumPy-based randomness in this
# notebook is repeatable from a fresh kernel.
NUMPY_SEED = 4987645
np.random.seed(NUMPY_SEED)


In [None]:
# Location of the training feature table (CSV).
train_path = '/content/drive/MyDrive/anti/dataset/features_25S.csv'

# Timestamps are printed immediately before and after the read so the output
# shows how long the import took.
train_import_started = dt.now()
print("Time before training data import: ", train_import_started)
train_data_raw = pd.read_csv(
    train_path,
    engine='python',
    on_bad_lines="skip",  # malformed rows are skipped instead of raising
)
train_import_finished = dt.now()
print("Time after training data import: ", train_import_finished)

Time before training data import:  2025-01-10 17:36:05.288326
Time after training data import:  2025-01-10 17:36:15.184811


In [None]:
# Engineered feature: element-wise product of average price and average volume.
# The column is added to the raw frame itself, as before.
train_data_raw['avg_PV'] = train_data_raw['avg_price'].mul(train_data_raw['avg_volume'])

# Predictor frame: remove the non-feature columns and the label ('gt'), then
# remove 'hour_cos' (dropped based on feature importance). errors='ignore'
# lets the second drop pass when 'hour_cos' is not in the file.
train_excluded_cols = ['date', 'pump_index', 'symbol', 'gt']
X_train_data = (
    train_data_raw
    .drop(columns=train_excluded_cols)
    .drop(columns=['hour_cos'], errors='ignore')
)

# Label series.
y_train_data = train_data_raw['gt']

In [None]:
# NumPy arrays of the labels and predictors for scikit-learn.
y_train = y_train_data.values
X_train = X_train_data.values

# Learn per-column min/max on the training predictors, then rescale them with
# the fitted scaler. `scaler` is kept so the same mapping can be applied to
# other data later.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Location of the evaluation feature table (CSV).
# NOTE(review): this is the same file that train_path points to, so every
# metric computed from it describes fit to the training rows rather than
# held-out performance. Confirm whether a separate test file was intended.
test_path = '/content/drive/MyDrive/anti/dataset/features_25S.csv'

# Timestamps are printed immediately before and after the read so the output
# shows how long the import took.
test_import_started = dt.now()
print("Time before test data import: ", test_import_started)
test_data_raw = pd.read_csv(
    test_path,
    engine='python',
    on_bad_lines="skip",  # malformed rows are skipped instead of raising
)
test_import_finished = dt.now()
print("Time after test data import: ", test_import_finished)

Time before test data import:  2025-01-10 17:36:15.368649
Time after test data import:  2025-01-10 17:36:24.942310


In [None]:
# Engineered feature: element-wise product of average price and average volume.
# The column is added to the raw frame itself, as before.
test_data_raw['avg_PV'] = test_data_raw['avg_price'].mul(test_data_raw['avg_volume'])

# Predictor frame: remove the non-feature columns and the label ('gt'), then
# remove 'hour_cos' (dropped based on feature importance). errors='ignore'
# lets the second drop pass when 'hour_cos' is not in the file.
test_excluded_cols = ['date', 'pump_index', 'symbol', 'gt']
X_test_data = (
    test_data_raw
    .drop(columns=test_excluded_cols)
    .drop(columns=['hour_cos'], errors='ignore')
)

# Label series and its NumPy form.
y_test_data = test_data_raw['gt']
y_test = y_test_data.values

# Rescale the predictors with the already-fitted scaler (transform only, no
# refit), so they go through the mapping the scaler learned when it was fitted.
X_test = scaler.transform(X_test_data.values)

In [None]:
# Random forest with 20 trees; random_state is fixed so repeated fits on the
# same data build the same forest.
rfc = RandomForestClassifier(random_state=13, n_estimators=20)

# Timestamps are printed immediately before and after fitting so the output
# shows how long training took.
fit_started = dt.now()
print("Time before RFC fit: ", fit_started)
rfc.fit(X_train, y_train)
fit_finished = dt.now()
print("Time after RFC fit: ", fit_finished)

# Hard class predictions for the test predictors.
y_pred = rfc.predict(X_test)

Time before RFC fit:  2025-01-10 17:36:25.041009
Time after RFC fit:  2025-01-10 17:36:32.910793


In [None]:
# Evaluate the predictions with a confusion matrix and the ratios derived
# from it.
#
# scikit-learn's confusion_matrix puts TRUE classes on the rows and PREDICTED
# classes on the columns, i.e. cm[i, j] counts samples whose true class is i
# and whose predicted class is j. For binary labels that makes the layout
#     [[TN, FP],
#      [FN, TP]]
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix and its ratios")
print(cm)
TN = cm[0, 0]
FP = cm[0, 1]  # true negative class, predicted positive
FN = cm[1, 0]  # true positive class, predicted negative
TP = cm[1, 1]

# Ratios are rounded to 4 decimals for display.
m_precision = round(TP / (TP + FP), 4)      # of predicted positives, share that are correct
m_TN_rate = round(TN / (TN + FP), 4)        # of actual negatives, share predicted negative
m_FP_rate = round(FP / (TN + FP), 4)        # of actual negatives, share predicted positive
m_recall = round(TP / (TP + FN), 4)         # of actual positives, share predicted positive
m_accuracy = round((TP + TN) / cm.sum(), 4)  # share of all samples predicted correctly

print("Test Precision = Positive Predictive Value = ", m_precision)
print("Test Specificity = True Negative Rate = ", m_TN_rate)
print("Test 1-Specificity = False Positive Rate = ", m_FP_rate)
print("Test Recall = Sensitivity = True Positive Rate = ", m_recall)
print("Test Accuracy = ", m_accuracy)


Confusion Matrix and its ratios
[[481840      0]
 [     3    314]]
Test Precision = Positive Predictive Value =  0.9905
Test Specificity = True Negative Rate =  1.0
Test 1-Specificity = False Positive Rate =  0.0
Test Recall = Sensitivity = True Positive Rate =  1.0
Test Accuracy =  1.0


In [None]:
# Importance of each predictor according to the fitted forest, labelled with
# the training column names and ordered from most to least important.
feature_scores = (
    pd.Series(rfc.feature_importances_, index=X_train_data.columns)
    .sort_values(ascending=False)
)
print("\nFeature Importance:")
print(feature_scores)

# Predicted class probabilities for the test predictors, one column per
# class, with the true label attached as column 'y'.
test_probabilities = rfc.predict_proba(X_test)
pp_test = pd.DataFrame(test_probabilities, columns=['p_0', 'p_1']).assign(y=y_test)

print("\nPredicted Test Probabilities (Sample):")
print(pp_test.head())


Feature Importance:
std_rush_order    0.324994
std_trades        0.180485
avg_PV            0.125695
avg_volume        0.100293
avg_rush_order    0.099179
std_volume        0.044699
minute_cos        0.029937
minute_sin        0.029049
std_price         0.018164
avg_price_max     0.018046
hour_sin          0.014763
avg_price         0.014697
dtype: float64

Predicted Test Probabilities (Sample):
   p_0  p_1  y
0  1.0  0.0  0
1  1.0  0.0  0
2  1.0  0.0  0
3  1.0  0.0  0
4  1.0  0.0  0
