In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the preprocessed dataset from the parquet file in the working directory.
df = pd.read_parquet(path='preprocessed_final.parquet')

In [3]:
# Partition rows on `tsne_label` (presumably a t-SNE cluster id -- confirm):
# label 0 is used for model fitting, every other label is kept in `drift`.
in_group_zero = df['tsne_label'].eq(0)
outside_group_zero = df['tsne_label'].ne(0)

df_0 = df.loc[in_group_zero]
drift = df.loc[outside_group_zero]

# Target is the `label` column; features are all remaining columns except the
# target itself and the partitioning column.
non_feature_columns = ['label', 'tsne_label']
y = df_0['label']
X = df_0.drop(non_feature_columns, axis=1)

In [4]:
# Hyperparameter search space for the random forest: 4 x 4 x 4 = 64 combinations.
param_grid = dict(
    n_estimators=[5, 10, 100, 500],
    max_depth=[None, 10, 30, 50],
    min_samples_split=[2, 5, 10, 20],
)

In [5]:
# Exhaustive grid search over `param_grid` using shuffled 5-fold cross-validation.
# The fold splitter is seeded so every candidate is scored on the same splits.
cross_val = KFold(n_splits=5, random_state=42, shuffle=True)

# Seed the forest as well: without `random_state` the bootstrap samples and
# per-split feature subsets differ on every run, so the CV scores (and possibly
# the selected best parameters) would not be reproducible after a kernel restart.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    n_jobs=-1,  # use all available cores
    cv=cross_val,
    verbose=1,
)
grid.fit(X, y)

# Persist the fitted search object so the results can be reloaded without
# repeating the search.
with open('random_forest_grid.pickle', 'wb') as file:
    pickle.dump(grid, file)

print(f"Best Parameters: {grid.best_params_}")

Fitting 5 folds for each of 64 candidates, totalling 320 fits


Best Parameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 500}


In [6]:
# Show the estimator that GridSearchCV refit with the best parameter combination.
print(f"Best Estimator: {grid.best_estimator_}")

Best Estimator: RandomForestClassifier(max_depth=30, n_estimators=500)
