In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.svm import SVC

%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [9]:
df = pd.read_json('./data/dataSet_Culture_06102023-POINT.json')

In [11]:
# Splitting the data
X = df.drop(['class'], axis=1)  # Features excluding 'id' and 'class'
y = df['class']  # Target variable

In [7]:
class_counts = y.value_counts()

# Identify the classes with 99 or fewer samples
small_sample_classes = class_counts[class_counts <= 99].index

# Create a mask for these classes
small_sample_mask = y.isin(small_sample_classes)

# Set the values for these classes to 0
y[small_sample_mask] = 0

In [11]:
class_counts = y.value_counts()
single_sample_classes = class_counts[class_counts <= 2].index
filter_mask = ~y.isin(single_sample_classes)
X = X[filter_mask]
y = y[filter_mask]

In [12]:
y.value_counts()

class
3    29
2    27
4    24
5    20
1    15
6     9
Name: count, dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [14]:
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are already defined
unique_classes = set(y_train)
binary_classifications = {}
evaluation_metrics = {}
trained_classifiers = {}

param_dist = {
    # Your hyperparameters grid here
    # For example:
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6, 7],
    'max_features': ['log2', 'sqrt', None],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}

for u_class in unique_classes:
    # Convert the labels for one-vs-all classification
    y_train_binary = [1 if label == u_class else 0 for label in y_train]
    y_test_binary = [1 if label == u_class else 0 for label in y_test]

    # Train a Gradient Boosting classifier using Randomized Search CV
    gbm = GradientBoostingClassifier()
    random_search = RandomizedSearchCV(
        gbm, param_distributions=param_dist, n_iter=200, scoring='accuracy', 
        cv=5, verbose=1, n_jobs=-1
    )
    random_search.fit(X_train, y_train_binary)
    
    # Storing the trained classifier
    trained_classifiers[u_class] = random_search.best_estimator_
    
    # Evaluate on the test set
    y_pred = random_search.predict(X_test)
    test_accuracy = accuracy_score(y_test_binary, y_pred)
    evaluation_metrics[u_class] = {
        "Best Parameters": random_search.best_params_,
        "Best CV Score": random_search.best_score_,
        "Test Accuracy": test_accuracy
    }

    # Print results
    print(f"Class {u_class} - Best Parameters:", random_search.best_params_)
    print(f"Class {u_class} - Best CV Score:", random_search.best_score_)
    print(f"Class {u_class} - Test Set Score:", test_accuracy)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits


Class 1 - Best Parameters: {'subsample': 0.6, 'n_estimators': 500, 'max_features': 'log2', 'max_depth': 6, 'learning_rate': 0.1}
Class 1 - Best CV Score: 0.8954248366013072
Class 1 - Test Set Score: 0.868421052631579
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Class 2 - Best Parameters: {'subsample': 0.7, 'n_estimators': 50, 'max_features': 'log2', 'max_depth': 7, 'learning_rate': 0.01}
Class 2 - Best CV Score: 0.7790849673202614
Class 2 - Test Set Score: 0.7894736842105263
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Class 3 - Best Parameters: {'subsample': 0.7, 'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 4, 'learning_rate': 0.05}
Class 3 - Best CV Score: 0.7908496732026145
Class 3 - Test Set Score: 0.8157894736842105
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Class 4 - Best Parameters: {'subsample': 1.0, 'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.1}
Class 4 - Best CV Score: 0.

In [16]:
trained_classifiers

{1: GradientBoostingClassifier(max_depth=6, max_features='log2', n_estimators=500,
                            subsample=0.6),
 2: GradientBoostingClassifier(learning_rate=0.01, max_depth=7, max_features='log2',
                            n_estimators=50, subsample=0.7),
 3: GradientBoostingClassifier(learning_rate=0.05, max_depth=4, max_features='sqrt',
                            n_estimators=50, subsample=0.7),
 4: GradientBoostingClassifier(max_features='sqrt', n_estimators=50),
 5: GradientBoostingClassifier(learning_rate=0.01, max_depth=7, max_features='sqrt',
                            subsample=0.9),
 6: GradientBoostingClassifier(max_depth=4, max_features='log2', n_estimators=50,
                            subsample=0.9)}

In [17]:
probabilities_batch = {u_class: [] for u_class in trained_classifiers.keys()}

for u_class, gbm in trained_classifiers.items():
    probs = gbm.predict_proba(X_test)[:, 1]
    probabilities_batch[u_class] = probs

final_class_predictions = []
for i in range(len(X_test)):
    final_class = max(probabilities_batch, key=lambda x: probabilities_batch[x][i])
    final_class_predictions.append(final_class)

In [18]:
accuracy_score(final_class_predictions, y_test)

0.4473684210526316

In [19]:
probabilities_batch = {u_class: [] for u_class in trained_classifiers.keys()}

for u_class, gbm in trained_classifiers.items():
    probs = gbm.predict_proba(X_train)[:, 1]
    probabilities_batch[u_class] = probs

final_class_predictions = []
for i in range(len(X_train)):
    final_class = max(probabilities_batch, key=lambda x: probabilities_batch[x][i])
    final_class_predictions.append(final_class)

In [20]:
accuracy_score(final_class_predictions, y_train)

1.0