In [None]:
from Metaheuristicas.fitness_functions import *
import json

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Necessary functions

### Fitness Functions

In [None]:

from skrebate import ReliefF


def mutual_information_eval(solution, data, labels):
    """Score a feature subset by its summed mutual information with the labels.

    Parameters:
    - solution: Binary mask with one entry per column of ``data``; a column is
      selected where the entry equals 1. NumPy arrays, lists and tuples are accepted.
    - data: Feature matrix (DataFrame or 2-D array-like), one row per sample.
    - labels: Class label for each row of ``data``.

    Returns:
    - Sum of the ``mutual_info_classif`` scores of the selected columns, or
      ``-np.inf`` when the mask selects no column.
    """
    # Coerce first: on a plain list ``solution == 1`` collapses to the scalar
    # ``False`` instead of an element-wise mask, and ``iloc`` then raises.
    mask = np.asarray(solution) == 1
    selected_data = pd.DataFrame(data).iloc[:, mask]
    if selected_data.shape[1] == 0:
        # An empty subset can never win the search.
        return -np.inf
    # NOTE(review): no random_state is passed; per the scikit-learn docs the
    # estimator adds random noise to continuous features, so repeated calls on
    # the same subset may differ slightly — confirm this is acceptable.
    mi_scores = mutual_info_classif(selected_data, labels)
    return np.sum(mi_scores)


def chi2_eval(solution, data, labels):
    """Score a feature subset by the mean chi-squared statistic of its columns.

    Parameters:
    - solution: Binary mask with one entry per column of ``data``; a column is
      selected where the entry equals 1. NumPy arrays, lists and tuples are accepted.
    - data: Feature matrix (DataFrame or 2-D array-like), one row per sample.
    - labels: Class label for each row of ``data``.

    Returns:
    - Mean of the ``chi2`` statistics of the selected columns, or ``-np.inf``
      when the mask selects no column.
    """
    # Coerce first: on a plain list ``solution == 1`` collapses to the scalar
    # ``False`` instead of an element-wise mask, and ``iloc`` then raises.
    mask = np.asarray(solution) == 1
    selected_data = pd.DataFrame(data).iloc[:, mask]
    if selected_data.shape[1] == 0:
        # An empty subset can never win the search.
        return -np.inf
    # NOTE(review): scikit-learn's chi2 expects non-negative feature values —
    # confirm callers always pass scaled (e.g. min-max) data.
    chi2_scores, _ = chi2(selected_data, labels)
    return np.mean(chi2_scores)





def relieff_eval(solution, data, labels, n_neighbors=10):
    """Score a feature subset by the mean ReliefF importance of its columns.

    Parameters:
    - solution: Mask with one entry per column of ``data``; truthy entries
      select the column. NumPy arrays, lists and tuples are accepted.
    - data: Feature matrix (DataFrame or 2-D array-like), one row per sample.
    - labels: Class label for each row of ``data``.
    - n_neighbors: Neighbour count forwarded to ``ReliefF``.

    Returns:
    - Mean of ``feature_importances_`` after fitting ReliefF on the selected
      columns, or ``-np.inf`` when the mask selects no column.
    """
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    # Coerce first so list/tuple masks work; a bare list has no ``astype``.
    mask = np.asarray(solution).astype(bool)
    selected_features = data.iloc[:, mask]

    if selected_features.shape[1] == 0:
        # An empty subset can never win the search.
        return -np.inf

    relief = ReliefF(n_neighbors=n_neighbors)
    # ReliefF is fitted on plain arrays, so both inputs are converted.
    relief.fit(selected_features.values, np.array(labels))

    return relief.feature_importances_.mean()

def load_and_preprocess_data(filename='Resources/SeisBenchV1_v1_1.json'):
    """
    Load and preprocess data from a JSON file.

    Rows with missing values are dropped, as are rows whose ``Type`` is
    'REGIONAL', 'HB', 'ICEQUAKE' or the empty string. ``Type`` is then
    label-encoded and the feature columns are min-max scaled.

    Parameters:
    - filename: Path to the JSON file.

    Returns:
    - X_scaled: Scaled feature matrix (DataFrame).
    - y: Target variable (encoded ``Type``), sharing X_scaled's 0..n-1 index.
    """
    # Only the read needs the file handle; everything else runs on the parsed data.
    with open(filename) as file:
        records = json.load(file)

    excluded_types = ['REGIONAL', 'HB', 'ICEQUAKE', '']
    data = pd.DataFrame(records).dropna()
    # Reset the index after filtering: X is rebuilt below with a fresh
    # RangeIndex, and y must carry the same labels or any index-aligned
    # operation on the pair would silently misalign rows.
    data = data[~data['Type'].isin(excluded_types)].reset_index(drop=True)

    label_encoder = LabelEncoder()
    data['Type'] = label_encoder.fit_transform(data['Type'])

    # NOTE(review): every column except the first is treated as a feature. This
    # assumes the first column is 'Type'; otherwise the label leaks into X —
    # confirm against the JSON schema.
    X = data.iloc[:, 1:]
    y = data['Type']

    # NOTE(review): the scaler is fitted on all rows, i.e. before any
    # train/test split performed by the caller.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    return pd.DataFrame(X_scaled, columns=X.columns), y







In [None]:
def add_result(classifier, fitness_function, accuracy, precision, recall, f1_score, auc):
    """Write one row of metrics into the results table of the given classifier.

    Parameters:
    - classifier: "Naive Bayes", "Random Forest" or "Neural Network"; any other
      value leaves all tables untouched.
    - fitness_function: Row label under which the metrics are stored.
    - accuracy, precision, recall, f1_score, auc: Metric values for that row.

    The target tables are the module-level ``naive_bayes_df``,
    ``random_forest_df`` and ``neural_network_df``.
    """
    global naive_bayes_df, random_forest_df, neural_network_df

    # Inside this function ``f1_score`` is the metric value passed in, not a function.
    row = dict(zip(
        ("Accuracy", "Precision", "Recall", "F1 Score", "AUC"),
        (accuracy, precision, recall, f1_score, auc),
    ))

    if classifier == "Naive Bayes":
        naive_bayes_df.loc[fitness_function] = row
        return
    if classifier == "Random Forest":
        random_forest_df.loc[fitness_function] = row
        return
    if classifier == "Neural Network":
        neural_network_df.loc[fitness_function] = row

### Save Results

In [None]:
from IPython.display import display, clear_output

# One empty results table per classifier: columns are the metrics, rows are
# the fitness functions used for feature selection.
metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "AUC"]

naive_bayes_df, random_forest_df, neural_network_df = (
    pd.DataFrame(columns=metrics, index=["Mutual Information", "X2", "Relief"])
    for _ in range(3)
)

# Display all tables function
def display_tables():
    """Clear the cell output, then show the three results tables with a title each."""
    clear_output(wait=True)
    titled_tables = (
        ("Naive Bayes Results", naive_bayes_df),
        ("Random Forest Results", random_forest_df),
        ("Neural Network Results", neural_network_df),
    )
    for title, table in titled_tables:
        print(title)
        display(table)

# Genetic algorithm implementation

In [None]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif
import random
from Metaheuristicas.fitness_functions import mutual_information_eval, load_and_preprocess_data



# Genetic algorithm that searches for the feature subset maximising a fitness function


def _selection_weights(fitness_scores):
    """Convert raw fitness values into valid roulette-wheel weights.

    Non-finite scores (e.g. ``-inf`` for an empty subset) get weight 0. If any
    finite score is negative, all scores are shifted so the lowest becomes 0.
    When no positive weight remains, every individual gets the same weight.
    """
    scores = np.asarray(fitness_scores, dtype=float)
    finite = np.isfinite(scores)
    if not finite.any():
        return [1.0] * len(scores)
    lowest = scores[finite].min()
    if lowest < 0:
        scores = scores - lowest
    weights = np.where(finite, scores, 0.0)
    if weights.sum() <= 0:
        return [1.0] * len(scores)
    return weights.tolist()


def genetic_algorithm(X, y, population_size=42, num_parents=28, generations=100, mutation_rate=0.1, crossover_rate=0.8, fitness_function=mutual_information_eval):
    """Search for a binary feature mask that maximises ``fitness_function``.

    Parameters:
    - X: Feature matrix; only ``X.shape[1]`` is used here, the matrix itself is
      passed through to ``fitness_function``.
    - y: Labels, passed through to ``fitness_function``.
    - population_size: Number of individuals kept in every generation.
    - num_parents: Number of parents drawn (with replacement) per generation.
    - generations: Number of evaluate/select/breed rounds.
    - mutation_rate: Per-gene probability of flipping a bit in a child.
    - crossover_rate: Probability that a parent pair is recombined rather than copied.
    - fitness_function: Callable ``(mask, X, y) -> float``; higher is better.

    Returns:
    - best_solution: Best mask evaluated in any generation, or ``None`` if no
      individual ever scored above ``-inf``.
    - best_fitness: Fitness of ``best_solution``.
    """
    n_features = X.shape[1]

    # Initialize a random population of individuals (feature subsets)
    population = [np.random.choice([0, 1], size=n_features) for _ in range(population_size)]
    best_solution = None
    best_fitness = -float('inf')

    for generation in range(generations):
        print(f"Generation {generation + 1}/{generations}")
        # Evaluate the fitness of each individual in the population
        fitness_scores = []
        for individual in population:
            fitness = fitness_function(individual, X, y)
            fitness_scores.append(fitness)

            # Update the best solution found
            if fitness > best_fitness:
                best_fitness = fitness
                best_solution = individual.copy()

        # Selection: roulette wheel. Raw fitness cannot be used directly as
        # weights because it may be negative or -inf, which yields invalid
        # (negative / NaN) probabilities.
        selected_population = random.choices(
            population, weights=_selection_weights(fitness_scores), k=num_parents
        )

        # Crossover: Create new population using crossover
        new_population = []
        for i in range(0, population_size, 2):
            parent1 = selected_population[i % num_parents]
            parent2 = selected_population[(i + 1) % num_parents]

            # Single-point crossover needs at least two genes to have a valid cut point.
            if n_features > 1 and random.random() < crossover_rate:
                crossover_point = random.randint(1, n_features - 1)
                child1 = np.concatenate((parent1[:crossover_point], parent2[crossover_point:]))
                child2 = np.concatenate((parent2[:crossover_point], parent1[crossover_point:]))
            else:
                child1, child2 = parent1.copy(), parent2.copy()

            new_population.append(child1)
            new_population.append(child2)

        # Children are produced in pairs; with an odd population_size drop the
        # surplus child so the population does not grow by one every generation.
        del new_population[population_size:]

        # Mutation: flip each bit of each child independently with probability mutation_rate
        for individual in new_population:
            for feature in range(n_features):
                if random.random() < mutation_rate:
                    individual[feature] = 1 - individual[feature]

        # Replace the old population with the new population
        population = new_population

    return best_solution, best_fitness








In [None]:
# Load the preprocessed feature matrix X and target y used by every cell below.
X, y = load_and_preprocess_data(filename='Resources/SeisBenchV1_v1_1.json')


# Optimization

In [None]:
# Logs of the generation sweeps, one list per fitness function. Each entry
# appended later is [selected feature names, best fitness, number of generations].
Mutual_Information, Chi2, ReliefFList = [], [], []

In [None]:

# Genetic-algorithm operator rates shared by every run below.
# (The generation count is not fixed here; it is swept in the following cells.)
mutation, crossover = 0.1, 0.9

In [None]:
from Metaheuristicas.Genetico import genetic_algorithm
# Sweep the generation budget (500, 550, ..., 950) with the mutual-information
# fitness and log the best subset found by each run.
for n_generations in range(500, 1000, 50):
    best_solution, best_Gfitness = genetic_algorithm(
        X,
        y,
        mutation_rate=mutation,
        crossover_rate=crossover,
        fitness_function=mutual_information_eval,
        generations=n_generations,
    )
    # Translate the winning bit mask into column names.
    selected_features = X.columns[best_solution.astype(bool)].tolist()
    Mutual_Information.append([selected_features, best_Gfitness, n_generations])

    

In [None]:
# Sweep the generation budget (500, 550, ..., 950) with the chi-squared
# fitness and log the best subset found by each run.
for n_generations in range(500, 1000, 50):
    best_solution, best_Gfitness = genetic_algorithm(
        X,
        y,
        mutation_rate=mutation,
        crossover_rate=crossover,
        fitness_function=chi2_eval,
        generations=n_generations,
    )
    # Translate the winning bit mask into column names.
    selected_features = X.columns[best_solution.astype(bool)].tolist()
    Chi2.append([selected_features, best_Gfitness, n_generations])

In [None]:
# Sweep the generation budget (500, 550, ..., 950) with the ReliefF fitness
# and log the best subset found by each run.
for n_generations in range(500, 1000, 50):
    best_solution, best_Gfitness = genetic_algorithm(
        X,
        y,
        mutation_rate=mutation,
        crossover_rate=crossover,
        fitness_function=relieff_eval,
        generations=n_generations,
    )
    # Translate the winning bit mask into column names.
    selected_features = X.columns[best_solution.astype(bool)].tolist()
    ReliefFList.append([selected_features, best_Gfitness, n_generations])

In [None]:
import pickle
# with open('Mutual_Information.pkl', 'wb') as f:
#     pickle.dump(Mutual_Information, f)
# with open('Chi2.pkl', 'wb') as f:
#     pickle.dump(Chi2, f)
# with open('ReliefF.pkl', 'wb') as f:
#     pickle.dump(ReliefFList, f)

In [None]:
# Order every sweep log in place, best fitness first, so index 0 is the winning run.
for sweep_log in (Mutual_Information, Chi2, ReliefFList):
    sweep_log.sort(key=lambda run: run[1], reverse=True)

In [None]:
# Best fitness and generation budget of every mutual-information run.
for _features, fitness, n_gens in Mutual_Information:
    print("Mutual Information: ", fitness, n_gens)

In [None]:
# Best fitness and generation budget of every chi-squared run.
for _features, fitness, n_gens in Chi2:
    print("Chi2: ", fitness, n_gens)

In [None]:
# Best fitness and generation budget of every ReliefF run.
for _features, fitness, n_gens in ReliefFList:
    print("ReliefF List: ", fitness, n_gens)

In [None]:
# Prints the NUMBER of features in the first (best-ranked, after the sort above) mutual-information run.
print("Mutual Information selected Features: \n",len(Mutual_Information[0][0]))


In [None]:
# Prints the NUMBER of features in the first (best-ranked, after the sort above) chi-squared run.
print("Chi2 selected features: \n",len(Chi2[0][0]))

In [None]:
# Prints the NUMBER of features in the first (best-ranked, after the sort above) ReliefF run.
print("ReliefF selected features: \n",len(ReliefFList[0][0]))


## Base Results

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [None]:
# Hold out 25% of the samples as the test set; the fixed seed keeps the split
# identical across runs so every classifier below is scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Naive Bayes

In [None]:


# Baseline: Gaussian Naive Bayes trained on the full feature set.
cmodel = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = cmodel.predict(X_test)
# Column 1 of predict_proba is used as the score for the AUC.
y_pred_proba = cmodel.predict_proba(X_test)[:, 1]

# Label-based metrics use the hard predictions; AUC uses the probability.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_pred_proba)

In [None]:
# Report the baseline metrics computed in the previous cell.
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
    ("AUC: ", auc),
):
    print(label, value)


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline: seeded 100-tree random forest trained on the full feature set.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)
# Column 1 of predict_proba is used as the score for the AUC.
y_pred_proba = random_forest.predict_proba(X_test)[:, 1]

# Label-based metrics use the hard predictions; AUC uses the probability.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_pred_proba)


In [None]:
# Report the baseline metrics computed in the previous cell.
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
    ("AUC: ", auc),
):
    print(label, value)


### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

In [None]:


# Baseline neural network: a small fully connected binary classifier trained
# on all features.
input_dim = X_train.shape[1]  # one input per feature column

RN = Sequential([
    Dense(units=64, activation='relu', input_dim=input_dim),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid'),  # single sigmoid unit -> probability
])
RN.compile(
    optimizer=Adam(learning_rate=0.001),
    loss=BinaryCrossentropy(),
    metrics=['accuracy'],
)

# Train; 20% of the training rows are held back by Keras for validation.
history = RN.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate on the test split (note: this rebinds the module-level `accuracy`).
loss, accuracy = RN.evaluate(X_test, y_test)

print(f'Test Accuracy: {accuracy}')

# Predicted probabilities, then hard 0/1 labels at the 0.5 threshold.
y_pred_prob = RN.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)




In [None]:
# Compute and show the metrics of the baseline (all-features) neural network.
# They are only printed. The results tables hold one row per fitness function,
# and this run used no feature selection, so storing it under
# "Mutual Information" (as was done before) would mislabel it.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_pred_prob))  # probabilities are used for the AUC

In [None]:
# Empty score containers, one per fitness function.
MI_Scores, Chi_Scores, ReliefF_Scores = [], [], []

# Genetic algorithm with optimized parameters (best fitness)

In [None]:
# Results of the final GA runs, one list per fitness function; each entry
# appended below is [selected feature names, best fitness].
MIOpt, ChiOpt, ReliefOpt = [], [], []

In [None]:
# Final mutual-information run, using the generation budget of the
# best-ranked sweep entry (index 0 after the descending sort).
best_solution, best_Gfitness = genetic_algorithm(
    X,
    y,
    mutation_rate=mutation,
    crossover_rate=crossover,
    fitness_function=mutual_information_eval,
    generations=Mutual_Information[0][2],
)

# Column names picked by the winning mask.
GAMIFeatures = X.columns[best_solution.astype(bool)].tolist()
MIOpt.append([GAMIFeatures, best_Gfitness])

In [None]:
# Show the [feature names, fitness] entry recorded for the mutual-information run.
print(MIOpt)

In [None]:
# List the column names selected by the final mutual-information run.
print("Features selected by MI: ", GAMIFeatures)


In [None]:
# Final chi-squared run, using the generation budget of the best-ranked
# sweep entry (index 0 after the descending sort).
best_solution, best_Gfitness = genetic_algorithm(
    X,
    y,
    mutation_rate=mutation,
    crossover_rate=crossover,
    fitness_function=chi2_eval,
    generations=Chi2[0][2],
)

# Column names picked by the winning mask.
GAX2Features = X.columns[best_solution.astype(bool)].tolist()
ChiOpt.append([GAX2Features, best_Gfitness])

In [None]:
# Show the [feature names, fitness] entries recorded in ChiOpt.
print(ChiOpt)

In [None]:
# List the column names selected by the final chi-squared run.
print("Features selected by Chi2: ", GAX2Features)

In [None]:
# Final ReliefF run, using the generation budget of the best-ranked sweep
# entry (index 0 after the descending sort).
# The fitness must be relieff_eval and the result must go into ReliefOpt.
# This cell previously reused chi2_eval/ChiOpt, so ReliefOpt stayed empty and
# GARFFeatures was really a second chi-squared subset.
best_solution, best_Gfitness = genetic_algorithm(
    X,
    y,
    mutation_rate=mutation,
    crossover_rate=crossover,
    fitness_function=relieff_eval,
    generations=ReliefFList[0][2],
)

# Column names picked by the winning mask.
GARFFeatures = X.columns[best_solution.astype(bool)].tolist()
ReliefOpt.append([GARFFeatures, best_Gfitness])


In [None]:
# Show the contents of ReliefOpt, the list reserved for the ReliefF run's [feature names, fitness] entry.
print(ReliefOpt)

In [None]:
# List the column names stored in GARFFeatures by the previous GA run.
print("Features selected by ReliefF: ", GARFFeatures)

### Mutual Information Classifiers

#### Naive Bayes with Mutual Information

In [None]:
# Gaussian Naive Bayes instance; it is re-fitted below for each selected feature subset.
NB = GaussianNB()

In [None]:
# Fit Naive Bayes on the mutual-information subset and predict the test split.
# fit() returns the estimator, so the two steps are chained.
y_pred = NB.fit(X_train[GAMIFeatures], y_train).predict(X_test[GAMIFeatures])

In [None]:

# Score Naive Bayes on the mutual-information subset. NB must be the model
# fitted on GAMIFeatures in the previous cell.
# AUC is computed from the positive-class probability, as in the all-features
# baseline; hard 0/1 predictions would reduce the ROC curve to one threshold.
y_score = NB.predict_proba(X_test[GAMIFeatures])[:, 1]
nb_mi_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_score),
}
for label, value in nb_mi_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Naive Bayes", "Mutual Information", *nb_mi_scores.values())

#### Random Forest with Mutual Information

In [None]:
# Random forest re-fitted below for each selected feature subset. It uses the
# same settings as the all-features baseline (100 trees, seed 42), so the
# comparison is reproducible and differs only in the features used.
DT = RandomForestClassifier(n_estimators=100, random_state=42)


In [None]:
# Fit the random forest on the mutual-information subset and predict the test split.
# fit() returns the estimator, so the two steps are chained.
y_pred = DT.fit(X_train[GAMIFeatures], y_train).predict(X_test[GAMIFeatures])


In [None]:
# Score the random forest on the mutual-information subset. DT must be the
# model fitted on GAMIFeatures in the previous cell.
# AUC is computed from the positive-class probability, as in the all-features
# baseline; hard 0/1 predictions would reduce the ROC curve to one threshold.
y_score = DT.predict_proba(X_test[GAMIFeatures])[:, 1]
rf_mi_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_score),
}
for label, value in rf_mi_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Random Forest", "Mutual Information", *rf_mi_scores.values())

#### Neural Network with Mutual Information

In [None]:
# Build a fresh network sized for the mutual-information subset. The baseline
# RN was created with input_dim equal to the full feature count, so it cannot
# take the reduced matrix, and reusing it would start from already-trained weights.
RN = Sequential()
RN.add(Dense(units=64, activation='relu', input_dim=len(GAMIFeatures)))
RN.add(Dense(units=32, activation='relu'))
RN.add(Dense(units=1, activation='sigmoid'))
RN.compile(optimizer=Adam(learning_rate=0.001),
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

History = RN.fit(X_train[GAMIFeatures], y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model on the test split
loss, accuracy = RN.evaluate(X_test[GAMIFeatures], y_test)

print(f'Test Accuracy: {accuracy}')
# Predict on the test split
y_pred_prob = RN.predict(X_test[GAMIFeatures])  # predictions as probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  # threshold probabilities into 0/1 labels

In [None]:
# Compute each metric once for the network trained on the mutual-information
# subset, print it, and store the row in the results table.
nn_mi_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_pred_prob),  # probabilities are used for the AUC
}
for label, value in nn_mi_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Neural Network", "Mutual Information", *nn_mi_scores.values())

### Chi2 Classifiers

#### Naive Bayes with chi2

In [None]:
# Re-fit Naive Bayes on the chi-squared subset and predict the test split.
# fit() returns the estimator, so the two steps are chained.
y_pred = NB.fit(X_train[GAX2Features], y_train).predict(X_test[GAX2Features])


In [None]:

# Score Naive Bayes on the chi-squared subset. NB must be the model fitted on
# GAX2Features in the previous cell.
# AUC is computed from the positive-class probability, as in the all-features
# baseline; hard 0/1 predictions would reduce the ROC curve to one threshold.
y_score = NB.predict_proba(X_test[GAX2Features])[:, 1]
nb_chi2_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_score),
}
for label, value in nb_chi2_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Naive Bayes", "X2", *nb_chi2_scores.values())

#### Random Forest with chi2

In [None]:
# Re-fit the random forest on the chi-squared subset and predict the test split.
# fit() returns the estimator, so the two steps are chained.
y_pred = DT.fit(X_train[GAX2Features], y_train).predict(X_test[GAX2Features])

In [None]:

# Score the random forest on the chi-squared subset. DT must be the model
# fitted on GAX2Features in the previous cell.
# AUC is computed from the positive-class probability, as in the all-features
# baseline; hard 0/1 predictions would reduce the ROC curve to one threshold.
y_score = DT.predict_proba(X_test[GAX2Features])[:, 1]
rf_chi2_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_score),
}
for label, value in rf_chi2_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Random Forest", "X2", *rf_chi2_scores.values())

#### Neural Network with chi2

In [None]:
# Build a fresh network sized for the chi-squared subset. A model built for a
# different feature count cannot take this matrix, and reusing a trained model
# would start from weights learned on other features.
RN = Sequential()
RN.add(Dense(units=64, activation='relu', input_dim=len(GAX2Features)))
RN.add(Dense(units=32, activation='relu'))
RN.add(Dense(units=1, activation='sigmoid'))
RN.compile(optimizer=Adam(learning_rate=0.001),
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

history = RN.fit(X_train[GAX2Features], y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate on the test split
loss, accuracy = RN.evaluate(X_test[GAX2Features], y_test)

print(f'Test Accuracy: {accuracy}')
y_pred_prob = RN.predict(X_test[GAX2Features])  # predictions as probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  # threshold probabilities into 0/1 labels

In [None]:
# Compute each metric once for the network trained on the chi-squared subset,
# print it, and store the row in the results table.
nn_chi2_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_pred_prob),  # probabilities are used for the AUC
}
for label, value in nn_chi2_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Neural Network", "X2", *nn_chi2_scores.values())

### ReliefF Classifiers

#### Naive Bayes with ReliefF

In [None]:
# Re-fit Naive Bayes on the GARFFeatures subset and predict the test split.
# fit() returns the estimator, so the two steps are chained.
y_pred = NB.fit(X_train[GARFFeatures], y_train).predict(X_test[GARFFeatures])


In [None]:

# Score Naive Bayes on the GARFFeatures subset. NB must be the model fitted on
# GARFFeatures in the previous cell.
# AUC is computed from the positive-class probability, as in the all-features
# baseline; hard 0/1 predictions would reduce the ROC curve to one threshold.
y_score = NB.predict_proba(X_test[GARFFeatures])[:, 1]
nb_relief_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_score),
}
for label, value in nb_relief_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Naive Bayes", "Relief", *nb_relief_scores.values())

#### Random Forest with ReliefF


In [None]:
# Re-fit the random forest on the GARFFeatures subset and predict the test split.
# fit() returns the estimator, so the two steps are chained.
y_pred = DT.fit(X_train[GARFFeatures], y_train).predict(X_test[GARFFeatures])


In [None]:
# Score the random forest on the GARFFeatures subset. DT must be the model
# fitted on GARFFeatures in the previous cell.
# AUC is computed from the positive-class probability, as in the all-features
# baseline; hard 0/1 predictions would reduce the ROC curve to one threshold.
y_score = DT.predict_proba(X_test[GARFFeatures])[:, 1]
rf_relief_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_score),
}
for label, value in rf_relief_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Random Forest", "Relief", *rf_relief_scores.values())

#### Neural Network with ReliefF

In [None]:
# Build a fresh network sized for the GARFFeatures subset. A model built for a
# different feature count cannot take this matrix, and reusing a trained model
# would start from weights learned on other features.
RN = Sequential()
RN.add(Dense(units=64, activation='relu', input_dim=len(GARFFeatures)))
RN.add(Dense(units=32, activation='relu'))
RN.add(Dense(units=1, activation='sigmoid'))
RN.compile(optimizer=Adam(learning_rate=0.001),
              loss=BinaryCrossentropy(),
              metrics=['accuracy'])

history = RN.fit(X_train[GARFFeatures], y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate on the test split
loss, accuracy = RN.evaluate(X_test[GARFFeatures], y_test)

print(f'Test Accuracy: {accuracy}')
y_pred_prob = RN.predict(X_test[GARFFeatures])  # predictions as probabilities
y_pred = (y_pred_prob > 0.5).astype(int)  # threshold probabilities into 0/1 labels

# Compute each metric once, print it, and store the row in the results table.
nn_relief_scores = {
    "Accuracy:": accuracy_score(y_test, y_pred),
    "Precision:": precision_score(y_test, y_pred),
    "Recall:": recall_score(y_test, y_pred),
    "F1:": f1_score(y_test, y_pred),
    "AUC:": roc_auc_score(y_test, y_pred_prob),  # probabilities are used for the AUC
}
for label, value in nn_relief_scores.items():
    print(label, value)
# The dict keeps insertion order, which matches add_result's metric parameters.
add_result("Neural Network", "Relief", *nn_relief_scores.values())

# Display results

In [None]:
# Render the three per-classifier results tables filled in by the add_result calls above.
display_tables()