<a href="https://colab.research.google.com/github/Woocash371/GAA/blob/main/Wyb%C3%B3r_cech_GA_algorytm_wy%C5%9Bciowy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install deap


Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.1


In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
import random
from multiprocessing import Pool

# Definicja funkcji oceny na poziomie globalnym
def evaluate(individual, X_train, X_test, y_train, y_test, model_type, error_metric):
    """Score one feature mask by the test error of a regressor trained on it.

    Args:
        individual: Sequence of 0/1 genes; a truthy gene keeps the column at
            the same position of ``X_train`` / ``X_test``.
        X_train, X_test: 2-D arrays indexed as ``X[:, mask]``.
        y_train, y_test: Targets passed to ``fit`` and to the error metric.
        model_type: ``'LinearRegression'``, ``'RandomForest'`` or ``'SVR'``.
        error_metric: ``'mse'`` or ``'mae'``.

    Returns:
        A one-element tuple holding the error (the fitness format DEAP
        expects).  A mask that selects no feature yields ``(inf,)``.

    Raises:
        ValueError: If ``model_type`` or ``error_metric`` is not recognised.
            The metric is only checked after the model has been trained.
    """
    selected = np.array(individual, dtype=bool)

    # Penalise an empty selection instead of trying to fit on zero columns.
    if not selected.any():
        return (float('inf'),)

    train_subset = X_train[:, selected]
    test_subset = X_test[:, selected]

    # Factories are lazy so that only the requested estimator is constructed.
    model_factories = {
        'LinearRegression': lambda: LinearRegression(),
        'RandomForest': lambda: RandomForestRegressor(random_state=42),
        'SVR': lambda: SVR(),
    }
    make_model = model_factories.get(model_type)
    if make_model is None:
        raise ValueError("Nieznany typ modelu: {}".format(model_type))

    regressor = make_model()
    regressor.fit(train_subset, y_train)
    predictions = regressor.predict(test_subset)

    if error_metric == 'mse':
        return (mean_squared_error(y_test, predictions),)
    if error_metric == 'mae':
        return (mean_absolute_error(y_test, predictions),)
    raise ValueError("Nieznana metryka błędu: {}".format(error_metric))

def _cx_k_random(ind1, ind2, k):
    """Apply k-point crossover in place and return both individuals.

    ``k`` distinct cut points are drawn from the interior positions of
    ``ind1``.  The segments between the 1st and 2nd cut, the 3rd and 4th cut,
    and so on are exchanged; for an odd ``k`` the last exchanged segment runs
    to the end of the individual.
    """
    size = len(ind1)
    cuts = sorted(random.sample(range(1, size), k))
    if len(cuts) % 2:
        # Odd number of cuts: the final segment extends to the end.
        cuts.append(size)
    for start, end in zip(cuts[::2], cuts[1::2]):
        ind1[start:end], ind2[start:end] = ind2[start:end], ind1[start:end]
    return ind1, ind2


def _load_dataframe(file_path):
    """Read an Excel (first sheet) or CSV file into a DataFrame."""
    lowered = file_path.lower()
    if lowered.endswith(('.xlsx', '.xls')):
        # Passing the path lets pandas open and close the workbook itself.
        return pd.read_excel(file_path, sheet_name=0)
    if lowered.endswith('.csv'):
        return pd.read_csv(file_path)
    raise ValueError("Nieobsługiwany format pliku: {}".format(file_path))


def genetic_algorithm(file_path, target_column, model_type='LinearRegression', error_metric='mse', population_size=50, num_generations=20, crossover_prob=0.7, mutation_prob=0.2,
                      keep_best_idividual= False,amount_of_best_individual=1,crossover_type='TwoPoint', k_random_points = 1):
    """Select regression features with a genetic algorithm and print the result.

    Each individual is a 0/1 mask over the feature columns; its fitness is the
    test-set error returned by ``evaluate`` (lower is better).

    Args:
        file_path: Path to a ``.xlsx``/``.xls`` (first sheet) or ``.csv`` file.
        target_column: Name of the column to predict; all other columns whose
            name does not start with ``Unnamed`` are candidate features.
        model_type: Forwarded to ``evaluate``.
        error_metric: Forwarded to ``evaluate``.
        population_size: Number of individuals in the population.
        num_generations: Number of generations run by ``eaSimple``.
        crossover_prob: Probability of mating two individuals.
        mutation_prob: Probability of mutating an individual.
        keep_best_idividual: When true, track and report
            ``amount_of_best_individual`` best individuals; otherwise only
            the single best one.  This only affects tracking and reporting.
        amount_of_best_individual: Hall-of-fame size used when
            ``keep_best_idividual`` is true.
        crossover_type: ``'OnePoint'``, ``'TwoPoint'``, ``'PMX'``,
            ``'Uniform'`` or ``'KRandom'``.
        k_random_points: Number of cut points for ``'KRandom'`` crossover.

    Returns:
        None.  Progress and results are printed.

    Raises:
        ValueError: On an unsupported file extension, an unknown
            ``crossover_type``, or a ``k_random_points`` outside
            ``1 .. n_features - 1`` when ``'KRandom'`` is requested.
    """
    print(f'Analiza cech dla {target_column}')
    print(f'Typ modelu: {model_type}')
    print(f'Metryka błędu: {error_metric}')
    print(f'Rozmiar populacji: {population_size}')
    print(f'Liczba generacji: {num_generations}')
    print(f'Prawdopodobieństwo krzyżowania: {crossover_prob}')
    print(f'Prawdopodobieństwo mutacji: {mutation_prob}')
    print(f'Typ krzyżowania: {crossover_type}')
    if crossover_type == 'KRandom':
        print(f'Liczba punktów krzyżowania: {k_random_points}')

    # Load the data and drop unnamed columns.
    df = _load_dataframe(file_path)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Separate features (X) from the target (y).
    X = df.drop(columns=[target_column])
    y = df[target_column]
    n_features = X.shape[1]

    # Split first and fit the scaler on the training part only, so that no
    # statistics of the test rows leak into the training data.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Create the DEAP classes only once; calling creator.create again on a
    # later run would overwrite the existing classes.
    if not hasattr(creator, "FitnessMin"):
        creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMin)

    # Individuals are random 0/1 masks with one gene per feature column.
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    # Register the requested crossover operator.
    if crossover_type == 'OnePoint':
        toolbox.register("mate", tools.cxOnePoint)
    elif crossover_type == 'TwoPoint':
        toolbox.register("mate", tools.cxTwoPoint)
    elif crossover_type == 'PMX':
        # NOTE(review): DEAP documents cxPartialyMatched for individuals that
        # are sequences of indices; confirm it is meaningful for 0/1 masks.
        toolbox.register("mate", tools.cxPartialyMatched)
    elif crossover_type == 'Uniform':
        toolbox.register("mate", tools.cxUniform, indpb=0.5)
    elif crossover_type == 'KRandom':
        max_points = n_features - 1
        # Fail early with a clear message instead of deep inside the
        # evolution loop when random.sample cannot draw k cut points.
        if not 1 <= k_random_points <= max_points:
            raise ValueError(
                "Liczba punktów krzyżowania musi należeć do przedziału [1, {}], otrzymano: {}".format(
                    max_points, k_random_points))
        toolbox.register("mate", _cx_k_random, k=k_random_points)
    else:
        raise ValueError("Nieznany typ krzyżowania: {}".format(crossover_type))

    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)
    toolbox.register("evaluate", evaluate, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, model_type=model_type, error_metric=error_metric)

    # Per-generation fitness statistics.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    stats.register("median", np.median)
    stats.register("var", np.var)

    # Always keep at least the single best individual.
    halloffame = tools.HallOfFame(amount_of_best_individual if keep_best_idividual else 1)

    # Initial population.
    population = toolbox.population(n=population_size)

    # Evaluate individuals in parallel worker processes.
    with Pool() as pool:
        toolbox.register("map", pool.map)
        algorithms.eaSimple(population, toolbox, cxpb=crossover_prob, mutpb=mutation_prob, ngen=num_generations,
                            stats=stats, halloffame=halloffame, verbose=True)

    if len(halloffame) == 0:
        print("Nie znaleziono najlepszych osobników do wyświetlenia.")
        return

    if keep_best_idividual:
        print("Najlepsze osobniki:")
        for i, individual in enumerate(halloffame):
            print(f'Osobnik {i + 1}:')
            features = np.array(individual, dtype=bool)
            selected_features = X.columns[features]
            print("  - Wybrane cechy:", selected_features.tolist())
            fitness = individual.fitness.values[0]
            print(f"  - Błąd ({error_metric}): {fitness}\n")
    else:
        best_individual = halloffame[0]
        best_features = np.array(best_individual, dtype=bool)
        print(f"Najlepszy osobnik: {best_individual}")
        selected_features = X.columns[best_features]
        print("Najbardziej wpływowe cechy:", selected_features.tolist())

In [39]:
# Run the genetic feature selection on the Excel workbook below.
file_path = "/content/sample_data/Excel_do_Ga_Nasze_stanowisko.xlsx"
genetic_algorithm(
    file_path,
    target_column='(mi-m_osp)/m_złoża',
    model_type='LinearRegression',
    error_metric='mse',
    population_size=50,
    num_generations=50,
    crossover_prob=0.8,
    mutation_prob=0.1,
    keep_best_idividual=True,       # report the hall of fame, not just one winner
    amount_of_best_individual=2,
    crossover_type='TwoPoint',
    k_random_points=2,              # only read when crossover_type == 'KRandom'
)

Analiza cech dla (mi-m_osp)/m_złoża
Typ modelu: LinearRegression
Metryka błędu: mse
Rozmiar populacji: 50
Liczba generacji: 50
Prawdopodobieństwo krzyżowania: 0.8
Prawdopodobieństwo mutacji: 0.1
Typ krzyżowania: TwoPoint




gen	nevals	avg     	std     	min      	max     	median  	var       
0  	50    	0.126062	0.083308	0.0411866	0.430237	0.110013	0.00694022
1  	44    	0.0675574	0.0295751	0.0409711	0.155584	0.0541085	0.000874688
2  	45    	0.0493441	0.0106015	0.0409383	0.109809	0.0453792	0.000112391
3  	43    	0.0483595	0.0129958	0.0384056	0.109632	0.0445601	0.00016889 
4  	41    	0.043887 	0.00577934	0.0381661	0.0663376	0.0414775	3.34008e-05
5  	41    	0.0455195	0.0289257 	0.0381614	0.245225 	0.0410598	0.000836697
6  	40    	0.0395078	0.00175251	0.0381614	0.0460982	0.0384056	3.0713e-06 
7  	38    	0.0384838	0.000801893	0.0381614	0.0411667	0.0381686	6.43032e-07
8  	42    	0.038814 	0.00454051 	0.0381614	0.0705975	0.0381661	2.06162e-05
9  	40    	0.0385589	0.00235011 	0.0381614	0.0547036	0.0381614	5.52301e-06
10 	35    	0.038377 	0.00114415 	0.0381614	0.0457741	0.0381614	1.30907e-06
11 	37    	0.038494 	0.00231567 	0.0381614	0.0547036	0.0381614	5.36234e-06
12 	44    	0.0385523	0.00234535 	0.0381614	0.054703

Analiza cech dla median_house_value
Typ modelu: LinearRegression
Metryka błędu: mse
Rozmiar populacji: 100
Liczba generacji: 100
Prawdopodobieństwo krzyżowania: 0.8
Prawdopodobieństwo mutacji: 0.1
Typ krzyżowania: TwoPoint




Pokolenie 100: {'avg': inf, 'std': nan, 'min': 4651428230.885948, 'max': inf, 'median': 9340055590.733479, 'var': nan}
Nie znaleziono najlepszych osobników do wyświetlenia.


  x = asanyarray(arr - arrmean)
