# In [1]:
import random
import warnings
import numpy as np
import matplotlib.pyplot as plt
from deap import base, creator, tools, algorithms
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
import pandas as pd
import numpy as np

# Load the wine data. The code below addresses columns by the lowercase
# names 'alcohol', 'ash' and 'proline'.
df = pd.read_csv('wine.csv')

# The target is the 'alcohol' column truncated to an integer class label.
y = df['alcohol'].astype(int)

# Features: drop the two excluded columns AND the target column itself.
# Leaving 'alcohol' in X would hand the label to the model (target leakage)
# and make every accuracy score meaningless.
X = df.drop(columns=['ash', 'proline', 'alcohol'])

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the evaluation function
def evaluate(individual):
    """Train an XGBoost classifier for one candidate and score it.

    Args:
        individual: Sequence of three numbers
            ``(num_boost_round, max_depth, min_child_weight)``. After blend
            crossover and Gaussian mutation these can be non-integer or out
            of range, so they are coerced to valid XGBoost settings here.

    Returns:
        Tuple ``(accuracy, macro_f1)`` computed on the module-level
        ``X_test`` / ``y_test`` split after training on ``X_train`` /
        ``y_train``.
    """
    num_boost_round, max_depth, min_child_weight = individual

    # Clamp to at least one round / one level: 0 or negative rounds would
    # train nothing, and max_depth=0 means "no depth limit" to XGBoost, which
    # would map the *smallest* gene values to the *deepest* trees.
    num_boost_round = max(1, int(num_boost_round))
    max_depth = max(1, int(max_depth))
    # min_child_weight is a continuous, non-negative parameter; keep it as a
    # float so the search is not silently truncated to whole numbers.
    min_child_weight = max(0.0, float(min_child_weight))

    # multi:softmax requires labels in [0, num_class). Derive the bound from
    # the labels instead of hard-coding it.
    num_class = int(max(y_train.max(), y_test.max())) + 1

    # Define XGBoost hyperparameters. The number of boosting rounds is NOT a
    # booster parameter; it is passed to xgb.train() below.
    params = {
        'objective': 'multi:softmax',  # Predict the class label directly
        'num_class': num_class,  # Upper bound (exclusive) on label values
        'booster': 'gbtree',  # 'gbtree' or 'gblinear'
        'lambda': 1,  # Regularization term
        'min_child_weight': min_child_weight,  # Minimum sum of instance weight (hessian) needed in a child
        'subsample': 0.8,  # Subsample ratio of the training instance
        'colsample_bylevel': 0.8,  # Subsample ratio of columns for each level
        'colsample_bytree': 0.8,  # Subsample ratio of columns for each tree
        'learning_rate': 0.1,  # Step size shrinkage to prevent overfitting
        'max_depth': max_depth,  # Maximum depth of a tree
    }

    # Train the XGBoost model
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

    # Make predictions on the test set. multi:softmax returns class labels as
    # floats; cast them so they compare cleanly with the integer targets.
    y_pred = model.predict(dtest).astype(int)
    y_true = np.asarray(y_test)

    # Calculate accuracy
    accuracy = np.mean(y_pred == y_true)

    # Calculate macro-averaged F1
    report = classification_report(y_true, y_pred, output_dict=True)
    macro_avg = report['macro avg']['f1-score']

    return accuracy, macro_avg

# Define the optimization problem: two objectives, both with positive weight
# (the accuracy and macro-average values returned by evaluate()).
creator.create("FitnessMulti", base.Fitness, weights=(1.0, 1.0))
creator.create("Individual", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()

# Evaluation and variation/selection operators
toolbox.register("evaluate", evaluate)
toolbox.register("select", tools.selNSGA2)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=1, indpb=0.2)

# Random generators for the three genes of an individual
toolbox.register("num_boost_round", random.randint, 1, 100)
toolbox.register("max_depth", random.randint, 1, 10)
toolbox.register("min_child_weight", random.uniform, 1, 10)

# An individual is one pass over the three generators, in this order;
# a population is a plain list of such individuals.
toolbox.register(
    "individual",
    tools.initCycle,
    creator.Individual,
    (
        toolbox.num_boost_round,
        toolbox.max_depth,
        toolbox.min_child_weight,
    ),
    n=1,
)
toolbox.register(
    "population",
    tools.initRepeat,
    list,
    toolbox.individual,
)

# Run the optimization algorithm (NSGA-II)
def main():
    """Run the evolutionary search, report the best candidate, and plot results.

    Builds a population of 14 individuals, evolves it for 5 generations with
    ``eaMuPlusLambda`` (selection is whatever is registered as
    ``toolbox.select``), then prints and plots the first non-dominated front.
    Returns nothing; all output goes to stdout and matplotlib figures.
    """
    # Suppress runtime warnings and user warnings
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    pop = toolbox.population(n=14)
    # The return value is ignored; the code below reads the evolved
    # individuals from `pop` itself.
    algorithms.eaMuPlusLambda(pop, toolbox, mu=14, lambda_=4, cxpb=0.7, mutpb=0.2, ngen=5, stats=None, halloffame=None, verbose=True)

    # Get the non-dominated (Pareto-optimal) individuals found by NSGA-II.
    # Only the first front is needed, so stop sorting after it.
    best_individuals = tools.sortNondominated(pop, len(pop), first_front_only=True)[0]
    print('best set of hyperparameters',best_individuals)

    # Every individual already carries the fitness computed during the run,
    # so read it back instead of retraining a model per individual.
    num_boost_round_values = [individual[0] for individual in best_individuals]
    min_child_weight_values = [individual[2] for individual in best_individuals]
    accuracy_values = [individual.fitness.values[0] for individual in best_individuals]

    # The front can hold several individuals; report the one with the highest
    # accuracy (ties broken by macro average) rather than an arbitrary member.
    best_hyperparameters = max(best_individuals, key=lambda individual: individual.fitness.values)
    print('best hyperparameters',best_hyperparameters)
    best_accuracy, best_macro_avg = best_hyperparameters.fitness.values

    # Print the best hyperparameters and their performance
    print("Best Hyperparameters:")
    print("Num Boost Round:", best_hyperparameters[0])
    print("Max Depth:", best_hyperparameters[1])
    print("Min Child Weight:", best_hyperparameters[2])
    print("Best Accuracy:", best_accuracy)
    print("Best Macro Average:", best_macro_avg)

    # Plot the Pareto front in objective space
    pareto_front = np.array([indiv.fitness.values for indiv in best_individuals])

    plt.scatter(pareto_front[:, 0], pareto_front[:,1])
    # Leave a small margin above 1.0 so perfect scores are not clipped.
    plt.xlim(0, 1.05)
    plt.ylim(0, 1.05)
    plt.xlabel("Accuracy")
    plt.ylabel("Macro Average")
    plt.title("Pareto Optimal Parameters")
    plt.show()

    # Plot num_boost_round, min_child_weight, and accuracy for each individual
    # on the Pareto front (the x-axis is the position in the front, not a
    # generation number).
    plt.figure(figsize=(10, 6))
    plt.plot(num_boost_round_values, label='Num Boost Round')
    plt.plot(min_child_weight_values, label='Min Child Weight')
    plt.plot(accuracy_values, label='Accuracy')
    plt.xlabel('Pareto-front individual')
    plt.ylabel('Value')
    plt.title('Num Boost Round, Min Child Weight, and Accuracy across the Pareto front')
    plt.legend()
    plt.show()

# Start the search only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


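# Recorded cell output: ModuleNotFoundError: No module named 'deap'
# (the DEAP package must be installed, e.g. `pip install deap`, before running)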