In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder  # Change this import
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
from sklearn.model_selection import train_test_split

# Show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Import the data
df = pd.read_csv('diabetes_prediction_dataset.csv')

count_0 = df['diabetes'].value_counts()[0]
count_1 = df['diabetes'].value_counts()[1]

# Randomly sample rows with 'diabetes' value 0 to match the count of 1s
if count_0 > count_1:
    df = pd.concat([df[df['diabetes'] == 0].sample(n=count_1, random_state=42), df[df['diabetes'] == 1]])
elif count_1 > count_0:
    df = pd.concat([df[df['diabetes'] == 1].sample(n=count_0, random_state=42), df[df['diabetes'] == 0]])

# randomly delete columns with bmi level of 27.32 until 250 samples left
while len(df[df['bmi'] == 27.32]) > 200:
    df = df.drop(df[df['bmi'] == 27.32].sample(n=1, random_state=42).index)

# Shuffle the DataFrame to mix the rows
df = df.sample(frac=1, random_state=42)

# df = df.drop(df.columns[-1], axis=1)

label_encoder = LabelEncoder()
df['gender_encoded'] = label_encoder.fit_transform(df['gender'])
df['smoking_history_encoded'] = label_encoder.fit_transform(df['smoking_history'])

df.head()


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_encoded,smoking_history_encoded
115,Male,28.0,0,0,never,27.63,6.2,130,0,1,4
98190,Female,43.0,0,0,never,39.61,5.8,155,1,0,4
69762,Male,0.32,0,0,No Info,16.2,6.6,80,0,1,0
65292,Female,45.0,0,0,never,24.1,5.8,90,0,0,4
6874,Male,70.0,0,1,former,28.87,6.0,155,1,1,3


In [2]:
# Split data into test and train
train, test = train_test_split(df, test_size=0.3, random_state=42)


# Split data into X and y

X_train = train[['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'gender_encoded', 'smoking_history_encoded']]
y_train = train['diabetes']

X_test = test[['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'gender_encoded', 'smoking_history_encoded']]
y_test = test['diabetes']

In [16]:
import random
from deap import base, creator, tools, algorithms
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Define the Fitness and Individual classes
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Define the genetic algorithm parameters
population_size = 2
num_generations = 2
mutation_rate = 0.2

# Define the hyperparameter search space for the SVM
hyperparameter_space = {
    'C': [0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf', 'poly'],
}

# Define the evaluation function (fitness function)
def evaluate_individual(individual):
    # Decode the hyperparameters from the individual's genes
    C, kernel_index = individual
    kernel = hyperparameter_space['kernel'][kernel_index]
    
    # Create an SVM model with the decoded hyperparameters
    model = SVC(C=C, kernel=kernel)
    
    # Perform cross-validation to evaluate the model
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    accuracy = scores.mean()
    
    return accuracy,

# Function to create a random individual
def create_individual():
    C = random.choice(hyperparameter_space['C'])
    kernel = random.choice(range(len(hyperparameter_space['kernel'])))  # Use the index
    return [C, kernel]


# Create a toolbox for the genetic algorithm
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual, n=population_size)
toolbox.register("evaluate", evaluate_individual)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutUniformInt, low=0, up=len(hyperparameter_space['C']) - 1, indpb=mutation_rate)
toolbox.register("select", tools.selTournament, tournsize=3)

# Create the initial population
population = toolbox.population()

# Define a function to print progress
def print_progress(gen, evals, best):
    print(f"Generation {gen}, Evaluations {evals}, Best Fitness {best.fitness.values[0]}")

# Set up statistics and verbose
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("min", np.min)
stats.register("max", np.max)


In [17]:

# Run the genetic algorithm with verbosity (verbose=True)
best_individual, best_fitness = algorithms.eaMuPlusLambda(population, toolbox, mu=population_size, lambda_=2*population_size, cxpb=0.7, mutpb=0.3, ngen=num_generations, stats=stats, verbose=True)

best_C, best_kernel = best_individual

# Ensure 'C' is a float
best_C_value = float(best_C[0])  # Assuming that the 'C' value is stored at index 0 of the list

# Create the final SVM model with the best hyperparameters
final_model = SVC(C=best_C_value, kernel=best_kernel)
final_model.fit(X_train, y_train)


gen	nevals	avg     	min     	max     
0  	2     	0.888813	0.888653	0.888974
