In [1]:
#Importing libraries
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
from sklearn import preprocessing
from scipy.stats import randint, uniform

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Empty results table: the search loop stores one row per sampled
# hyperparameter combination together with the score it achieved.
results_df = pd.DataFrame(
    columns=[
        'n_estimators',
        'learning_rate',
        'max_depth',
        'max_features',
        'mean_test_score',
    ]
)

In [3]:
# Load the dataset from a CSV file given by a path relative to the working directory.
file_path = "./dataset.csv"
try:
    dataset = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Re-raise instead of print + exit(1): printing hides the traceback and
    # leaves `dataset` undefined, so later cells would fail with a confusing
    # NameError. Raising stops execution at the real cause of the problem.
    raise FileNotFoundError(
        f"File not found: {file_path!r}. Please check the path and try again."
    ) from err

In [4]:
# Sanity check: every later cell relies on `dataset` being a pandas DataFrame.
if not isinstance(dataset, pd.DataFrame):
    # Raise rather than print + exit(1) so the failure is visible as an
    # exception and execution stops in this cell.
    raise TypeError(
        "Dataset is not a dataframe. Please check the file and try again."
    )
    

In [5]:
# Encode the target labels as integers; the original class names stay
# available afterwards through `encoder.classes_`.
encoder = LabelEncoder()
y = encoder.fit_transform(dataset['koi_disposition'])

# NOTE: an earlier `dataset.dropna(subset=['koi_score'])` used to sit here, but
# its result was overwritten a few lines later and never used. It has been
# removed: no rows are dropped, which keeps the feature rows aligned with `y`
# (encoded from the full dataset). Missing values are left for the imputation
# step that follows the train/test split.

# Save the 'koi_disposition' column so it can be re-attached after the
# non-numeric columns have been removed
koi_disposition_column = dataset['koi_disposition']

non_numeric_columns = dataset.select_dtypes(exclude=['number']).columns

# Drop every non-numeric column from the dataset
# (presumably this includes 'koi_disposition' itself, since it is label-encoded above)
dataset_numeric = dataset.drop(columns=non_numeric_columns)

# Re-attach the raw labels; the train/test split drops this column again
# before building the feature matrix
dataset_numeric['koi_disposition'] = koi_disposition_column



In [6]:
# Hold out 20% of the rows as a test set. The label column is removed from the
# features, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_numeric.drop(columns=['koi_disposition']),
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fill missing values with the per-column mean using SimpleImputer.
# The means are learned from the training set only and then applied to both
# the training and the testing set.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [8]:
# Standardize the features. The scaler is fitted on the imputed training set
# and the same fitted scaler is then applied to the imputed testing set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [9]:
# Search space for the random search.
# Numeric entries are (low, high) bounds passed to the samplers in the search
# loop; 'max_features' is a list of discrete options to pick from.
param_ranges = dict(
    n_estimators=(90, 130),
    learning_rate=(0.01, 0.1),
    max_depth=(5, 15),
    max_features=['sqrt', 'log2', None],
)

# How many hyperparameter combinations the search loop samples
n_iter = 200

In [10]:
# Random search over GradientBoostingClassifier hyperparameters.
#
# For each of the `n_iter` iterations:
#   1. sample one hyperparameter combination from `param_ranges`,
#   2. fit a model on all scaled features and rank features by importance,
#   3. refit a fresh model on the top-k features only,
#   4. record the accuracy of that second model on the test set.
#
# NOTE(review): the recorded score is measured on the held-out test set, so
# picking the "best" row from `results_df` is optimistically biased. Consider
# scoring with cross_val_score on the training data instead.

# Seeded generator so the sampled combinations are reproducible
# (same seed as the train/test split).
rng = np.random.default_rng(42)

# Number of most-important features kept for the second fit (loop-invariant)
k = 10

# One dict per iteration; converted to a DataFrame once after the loop instead
# of growing `results_df` with pd.concat on every pass. This also makes the
# cell idempotent: re-running it no longer appends to earlier results.
search_records = []

for i in range(n_iter):
    print(f"Iteration {i+1}/{n_iter}")

    # Sample hyperparameters from the ranges. The upper bound of the integer
    # ranges is exclusive. Values are converted to plain Python types so they
    # print and store cleanly.
    max_features_options = param_ranges['max_features']
    params = {
        'n_estimators': int(rng.integers(*param_ranges['n_estimators'])),
        'learning_rate': float(rng.uniform(*param_ranges['learning_rate'])),
        'max_depth': int(rng.integers(*param_ranges['max_depth'])),
        # Index into the list directly so None / str options are returned as-is
        'max_features': max_features_options[int(rng.integers(len(max_features_options)))],
    }

    print(f"Sampled Parameters: {params}")

    # Fit on all features to obtain the feature importances
    model = GradientBoostingClassifier(random_state=42, **params)
    model.fit(X_train_scaled, y_train)
    feature_importances = model.feature_importances_

    # Feature indices ordered from most to least important
    sorted_indices = np.argsort(feature_importances)[::-1]

    # Names of the top-k features
    # (assumes the imputer kept every column of X_train, so positions in the
    # scaled array line up with X_train.columns - TODO confirm)
    selected_features = X_train.columns[sorted_indices[:k]]

    # Use only the selected features for training and testing
    X_train_selected = X_train_scaled[:, sorted_indices[:k]]
    X_test_selected = X_test_scaled[:, sorted_indices[:k]]

    # Fresh model with the same hyperparameters, trained on the reduced features
    final_model = GradientBoostingClassifier(random_state=42, **params)
    final_model.fit(X_train_selected, y_train)

    # Accuracy on the testing data (a single score, not a cross-validation mean)
    test_accuracy = final_model.score(X_test_selected, y_test)

    print(f"Mean Score: {test_accuracy}")

    search_records.append({
        'n_estimators': params['n_estimators'],
        'learning_rate': params['learning_rate'],
        'max_depth': params['max_depth'],
        'max_features': params['max_features'],
        'mean_test_score': test_accuracy,
    })

# Build the results table in one go; explicit columns keep the schema even
# when n_iter is 0.
results_df = pd.DataFrame(
    search_records,
    columns=['n_estimators', 'learning_rate', 'max_depth', 'max_features', 'mean_test_score'],
)

print(results_df)

Iteration 1/200
Sampled Parameters: {'n_estimators': 99, 'learning_rate': 0.06236361197257111, 'max_depth': 6, 'max_features': None}
Mean Score: 0.9163617354939885
Iteration 2/200
Sampled Parameters: {'n_estimators': 112, 'learning_rate': 0.08154830386924548, 'max_depth': 9, 'max_features': None}
Mean Score: 0.9100888656560376
Iteration 3/200
Sampled Parameters: {'n_estimators': 106, 'learning_rate': 0.01086711759230214, 'max_depth': 13, 'max_features': 'sqrt'}


In [None]:
# Visualize the search history: one marker per sampled combination, placed by
# learning rate and number of estimators and coloured by the recorded score.
# The original cell referenced an undefined `results_history`; the search
# results live in `results_df`.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    # astype(float): the results columns may have object dtype depending on
    # how the frame was assembled
    results_df['learning_rate'].astype(float),
    results_df['n_estimators'].astype(float),
    c=results_df['mean_test_score'].astype(float),
    cmap='viridis',
    marker='x',
)
ax.set_xlabel('Learning Rate')
ax.set_ylabel('N - Estimators')
# The recorded score is the accuracy on the test set, not a cross-validation mean
ax.set_title('Random Search Test Accuracy')
fig.colorbar(points, ax=ax, label='Mean Test Score')
plt.show()

In [None]:
# Train and evaluate a final classifier using the best hyperparameters found
# by the random search.
#
# The original cell fitted a GradientBoostingRegressor with default settings
# and passed its continuous predictions to accuracy_score, which only accepts
# class labels. The target is a label-encoded class, so a classifier is used,
# and the regression metrics (MSE / R2 / MAE) are replaced by a confusion matrix.

if results_df.empty:
    raise RuntimeError("results_df is empty. Run the hyperparameter search first.")

# Row with the highest recorded score
best_row = results_df.loc[results_df['mean_test_score'].astype(float).idxmax()]

# A stored None may come back as a missing value; map it back to None
best_max_features = best_row['max_features']
if pd.isna(best_max_features):
    best_max_features = None

best_params = {
    'n_estimators': int(best_row['n_estimators']),
    'learning_rate': float(best_row['learning_rate']),
    'max_depth': int(best_row['max_depth']),
    'max_features': best_max_features,
}
print(f"Best Parameters: {best_params}")

# NOTE(review): the search scored models trained on the top-k features only,
# whereas this model is trained on all scaled features (as the original cell
# did) - confirm which feature set is intended.
gradient_boosting_classifier = GradientBoostingClassifier(random_state=42, **best_params)

# Train the model
gradient_boosting_classifier.fit(X_train_scaled, y_train)

# Predict the encoded class labels
y_pred = gradient_boosting_classifier.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion matrix: rows are true classes, columns are predicted classes.
# The label indices follow the order of encoder.classes_.
class_names = encoder.classes_
class_indices = np.arange(len(class_names))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_indices)
print(f"Confusion Matrix:\n{conf_matrix}")

# Visualize the predictions vs true values
fig, ax = plt.subplots(figsize=(10, 6))
image = ax.imshow(conf_matrix, cmap='viridis')
ax.set_xticks(class_indices)
ax.set_xticklabels(class_names)
ax.set_yticks(class_indices)
ax.set_yticklabels(class_names)
# Write the count into every cell; the text colour flips for readability
threshold = conf_matrix.max() / 2 if conf_matrix.size else 0
for row in class_indices:
    for col in class_indices:
        ax.text(
            col, row, conf_matrix[row, col],
            ha='center', va='center',
            color='black' if conf_matrix[row, col] > threshold else 'white',
        )
ax.set_xlabel('Predictions')
ax.set_ylabel('True Values')
ax.set_title('Predictions vs True Values')
fig.colorbar(image, ax=ax, label='Count')
plt.show()


