In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import re
import matplotlib.pyplot as plt

%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [3]:
df = pd.read_csv('./felix_cleaned.csv')

In [5]:
# Splitting the data
X = df.drop(['class'], axis=1)  # Features excluding 'id' and 'class'
y = df['class']  # Target variable

In [6]:
class_counts = y.value_counts()
single_sample_classes = class_counts[class_counts == 1].index
filter_mask = ~y.isin(single_sample_classes)
X = X[filter_mask]
y = y[filter_mask]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [11]:
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [8]:
# Defining the parameter grid for GradientBoostingClassifier
param_dist = {
    'n_estimators': [10, 50, 100, 200, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
    'max_depth': [3, 4, 5, 6, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2', None]
}

# Instantiating GradientBoostingClassifier
gbm = GradientBoostingClassifier()

# Setting up RandomizedSearchCV
random_search = RandomizedSearchCV(
    gbm, param_distributions=param_dist, n_iter=100, scoring='accuracy', 
    cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fitting the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

# Getting the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Evaluate on the test set
print("Test Set Score:", random_search.score(X_test, y_test))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Best Parameters: {'subsample': 0.7, 'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 7, 'learning_rate': 0.5}
Best Score: 0.4555555555555556
Test Set Score: 0.2894736842105263


In [9]:
print("Test Set Score:", random_search.score(X_train, y_train))

Test Set Score: 0.9883720930232558
