<h1 style="color: brown;">1. Importing Libraries</h1> 

In [69]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from deap import base, creator, tools, algorithms
from collections import Counter

This notebook makes use of the DEAP library, which is essential for evolutionary algorithms. If it is not already installed, it can be added by running the following command:

In [1]:
#!pip install deap

<h1 style="color: brown;">2. California Housing Prices</h1> 

<h2 style="color: brown;">2.1 Fetching and Preparing the California Housing Dataset</h2> 

In [2]:
# Fetching the California housing dataset from the source
california_housing = fetch_california_housing()

# Converting the dataset into a Pandas DataFrame for easier handling
data = pd.DataFrame(data=california_housing.data, columns=california_housing.feature_names)

# Adding a new column for house prices (target values) to the DataFrame
data['Target'] = california_housing.target

# Checking if there are any null values in the DataFrame
data.isnull().values.any()

False

<h2 style="color: brown;">2.2 Preparing the Feature Set and Target Variable for Model Training</h2> 

In [3]:
# Dropping the 'Target' column from the DataFrame to create the feature set (X)
X = data.drop('Target', axis=1)

# Extracting the 'Target' column from the DataFrame to create the target variable (y)
y = data['Target']

# Storing the names of the features (column names) from the original dataset
feature_names = california_housing.feature_names

# Creating an instance of StandardScaler to standardize the features
scaler = StandardScaler()

# Fitting the scaler to the feature data and transforming the data to have a mean of 0 and a standard deviation of 1
X = scaler.fit_transform(X)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

<h2 style="color: brown;">2.3 Evaluating Regression Models on the California Housing Dataset</h2> 

In [4]:
# Creating a dictionary of different regression models to evaluate
regressors = {
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(),
}

# Initializing an empty dictionary to store Mean Squared Errors (MSE) for each model
mse_all_features = {}

# Looping through each regressor in the dictionary to evaluate its performance
for name, model in regressors.items():
    # Fitting the model with the training data
    model.fit(X_train, y_train)
    
    # Predicting values using the test data
    predictions = model.predict(X_test)
    
    # Calculating the Mean Squared Error (MSE) of the predictions
    mse = mean_squared_error(y_test, predictions)
    
    # Storing the MSE for the current model in the dictionary
    mse_all_features[name] = mse
    
    # Printing the MSE of the current model
    print(f"{name} MSE with All Features: {mse:.6f}")

KNN MSE with All Features: 0.423345
Decision Tree MSE with All Features: 0.531282


<h2 style="color: brown;">2.4 Setting Up a Genetic Algorithm for Feature Selection with Regression Models</h2> 

In [5]:
# Defining the fitness function for the genetic algorithm using a given regressor
def evaluate(individual, model):
    # Selecting features based on the individual’s binary representation
    selected_features = [index for index, bit in enumerate(individual) if bit]
    
    # Handling the case where no features are selected to avoid division by zero
    if len(selected_features) == 0:
        return (float('inf'),)  # Returning a high error value to indicate poor fitness
    
    # Creating a subset of features based on the selected features
    X_subset = X[:, selected_features]
    
    # Splitting the feature subset and target data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.3, random_state=42)
    
    # Fitting the model with the training data
    model.fit(X_train, y_train)
    
    # Making predictions using the test data
    predictions = model.predict(X_test)
    
    # Calculating the Mean Squared Error (MSE) of the predictions
    mse = mean_squared_error(y_test, predictions)
    
    # Returning the MSE as the fitness value
    return (mse,)

# Setting up the Genetic Algorithm components

# Creating the fitness function class with minimization (lower MSE is better)
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))

# Creating the individual class with a fitness attribute
creator.create("Individual", list, fitness=creator.FitnessMin)

# Initializing the genetic algorithm toolbox
toolbox = base.Toolbox()

# Registering the attribute generator for individuals (binary values)
toolbox.register("attr_bool", np.random.randint, 2)

# Registering the individual generator (a list of binary values)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])

# Registering the population generator (a list of individuals)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Registering the mating function (uniform crossover)
toolbox.register("mate", tools.cxUniform, indpb=0.5)

# Registering the mutation function (flip bit mutation)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)

# Registering the selection function (tournament selection)
toolbox.register("select", tools.selTournament, tournsize=3)

<h2 style="color: brown;"> 2.5 Running the Genetic Algorithm for Feature Selection with Different Regression Models</h2> 

In [6]:
# Defining the function to run the Genetic Algorithm (GA) for a specific regressor
def run_ga_for_regressor(regressor_name, model):
    # Initializing the population of individuals for the GA
    population = toolbox.population(n=100)

    # Setting up a custom stopping condition
    max_stable_generations = 20
    stable_generations = 0
    previous_best_individual = None
    best_individual = None
    best_mse = float('inf')

    generation = 0
    while stable_generations < max_stable_generations:
        generation += 1

        # Registering the evaluation function with the chosen regressor
        toolbox.register("evaluate", evaluate, model=model)

        # Running one generation of the genetic algorithm
        population, _ = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=1, verbose=False)

        # Extracting the best individual from the current generation
        current_best_individual = tools.selBest(population, 1)[0]
        current_best_mse = evaluate(current_best_individual, model)[0]  # Unpacking the tuple correctly

        # Printing the results of the current best individual (binary vector)
        print(f"Generation {generation}: MSE: {current_best_mse:.6f}, Feature Subset: {current_best_individual}")

        # Checking if the current best individual is the same as the previous best
        if previous_best_individual is not None and current_best_individual == previous_best_individual:
            stable_generations += 1
        else:
            stable_generations = 0
            previous_best_individual = current_best_individual

        # Updating the best individual if necessary
        if current_best_mse < best_mse:
            best_individual = current_best_individual
            best_mse = current_best_mse

    # Determining the feature names of the best individual from the final generation
    final_selected_features = [i for i, bit in enumerate(best_individual) if bit]
    final_selected_feature_names = [feature_names[i] for i in final_selected_features]
    final_not_selected_feature_names = [feature_names[i] for i in range(len(feature_names)) if i not in final_selected_features]

    return best_individual, best_mse, final_selected_features, final_selected_feature_names, final_not_selected_feature_names

# Initializing dictionaries to store results for GA-selected features
mse_ga_features = {}
selected_features_ga = {}
selected_features_names_ga = {}
not_selected_features_names_ga = {}

# Running the GA for each regressor in the regressors dictionary
for name, model in regressors.items():
    print(f"\nRunning GA for {name}...")
    best_individual, best_mse, selected_features, selected_feature_names, not_selected_feature_names = run_ga_for_regressor(name, model)
    
    # Storing the results for each regressor
    mse_ga_features[name] = best_mse
    selected_features_ga[name] = selected_features
    selected_features_names_ga[name] = selected_feature_names
    not_selected_features_names_ga[name] = not_selected_feature_names


Running GA for KNN...
Generation 1: MSE: 0.339875, Feature Subset: [0, 0, 1, 0, 0, 0, 1, 1]
Generation 2: MSE: 0.302857, Feature Subset: [0, 0, 1, 0, 0, 1, 1, 1]
Generation 3: MSE: 0.296418, Feature Subset: [1, 0, 0, 0, 0, 1, 1, 1]
Generation 4: MSE: 0.296418, Feature Subset: [1, 0, 0, 0, 0, 1, 1, 1]
Generation 5: MSE: 0.296418, Feature Subset: [1, 0, 0, 0, 0, 1, 1, 1]
Generation 6: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 7: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 8: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 9: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 10: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 11: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 12: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 13: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0, 1, 1]
Generation 14: MSE: 0.282219, Feature Subset: [0, 0, 0, 0, 0, 0

<h2 style="color: brown;">2.6 Final Comparison of MSEs with All Features vs. GA-Selected Features</h2> 

In [7]:
# Printing the final comparison of Mean Squared Errors (MSEs) with all features versus GA-selected features
print("\nComparison of MSE with All Features vs GA-Selected Features:")

# Looping through each regressor to compare the MSEs
for name in regressors.keys():
    print(f"{name}: MSE with All Features: {mse_all_features[name]:.6f} | MSE with GA-Selected Features: {mse_ga_features[name]:.6f}")


Comparison of MSE with All Features vs GA-Selected Features:
KNN: MSE with All Features: 0.423345 | MSE with GA-Selected Features: 0.282219
Decision Tree: MSE with All Features: 0.531282 | MSE with GA-Selected Features: 0.376411


<h2 style="color: brown;">2.7 Displaying and Tracking Feature Selections After GA Optimization</h2> 

In [12]:
# Initializing counters to keep track of selected and not selected features
selected_feature_counter = Counter()
not_selected_feature_counter = Counter()

# Printing the final feature selections and non-selections after GA optimization
print("\nSelected Features After GA Optimization:")
for name in regressors.keys():
    print(f"{name}:")
    print(f"  Features to Select: {selected_features_ga[name]}")
    print(f"  Feature names: {selected_features_names_ga[name]}")
    print(f"  Not Selected Features: {not_selected_features_names_ga[name]}")

    # Updating the counters with the selected and not selected feature names
    selected_feature_counter.update(selected_features_names_ga[name])
    not_selected_feature_counter.update(not_selected_features_names_ga[name])


Selected Features After GA Optimization:
KNN:
  Features to Select: [6, 7]
  Feature names: ['Latitude', 'Longitude']
  Not Selected Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
Decision Tree:
  Features to Select: [6, 7]
  Feature names: ['Latitude', 'Longitude']
  Not Selected Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']


<h2 style="color: brown;">2.8 Analyzing Feature Selection Frequency After GA Optimization</h2> 

In [13]:
# Categorizing the selected features based on how often they are selected
selected_2_times = [feature for feature, count in selected_feature_counter.items() if count == 2]
selected_1_time = [feature for feature, count in selected_feature_counter.items() if count == 1]

# Categorizing the not selected features based on how often they are not selected
not_selected_2_times = [feature for feature, count in not_selected_feature_counter.items() if count == 2]
not_selected_1_time = [feature for feature, count in not_selected_feature_counter.items() if count == 1]

# Printing the categorized results of feature selection frequency
print("\nFeature Selection Frequency:")
print(f"Selected 2 times: {selected_2_times}")
print(f"Selected 1 time: {selected_1_time}")

# Printing the categorized results of feature non-selection frequency
print("\nFeature Non-Selection Frequency:")
print(f"Not Selected 2 times: {not_selected_2_times}")
print(f"Not Selected 1 time: {not_selected_1_time}")


Feature Selection Frequency:
Selected 2 times: ['Latitude', 'Longitude']
Selected 1 time: []

Feature Non-Selection Frequency:
Not Selected 2 times: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
Not Selected 1 time: []


<h1 style="color: brown;">3. Student Performance</h1> 

<h2 style="color: brown;">3.1 Analyzing and Displaying Column Types in the Student Dataset </h2> 

In [22]:
# Reading the CSV file into a DataFrame
data = pd.read_csv("student.csv")

# Selecting columns with numerical data types
numerical_cols = data.select_dtypes(include=['number']).columns

# Selecting columns with categorical data types
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

# Counting the number of numerical columns
num_numerical = len(numerical_cols)

# Counting the number of categorical columns
num_categorical = len(categorical_cols)

# Printing the number of numerical columns and listing them
print(f'Number of numerical columns: {num_numerical}')
print('Numerical columns:', list(numerical_cols))
print()

# Printing the number of categorical columns and listing them
print(f'Number of categorical columns: {num_categorical}')
print('Categorical columns:', list(categorical_cols))

Number of numerical columns: 16
Numerical columns: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']

Number of categorical columns: 17
Categorical columns: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


<h2 style="color: brown;">3.2 Dropping Specific Columns from the Dataset</h2> 

In [23]:
# Dropping the 'G1' and 'G2' columns from the DataFrame
data = data.drop(columns=['G1', 'G2'])

<h2 style="color: brown;">3.3 Encoding Categorical Features</h2> 

In [24]:
# Initializing dictionaries and lists to store encoding information
encoding_info = {}
label_encode_cols = []
one_hot_encode_cols = []

# Looping through each categorical column to determine the encoding method
for col in categorical_cols:
    # Counting the number of unique values in the column
    num_unique = data[col].nunique()
    
    # Choosing encoding method based on the number of unique values
    if num_unique <= 2:
        label_encode_cols.append(col)  # Adding column to label encoding list
        encoding_info[col] = "Label Encoding"  # Recording encoding method
    else:
        one_hot_encode_cols.append(col)  # Adding column to one-hot encoding list
        encoding_info[col] = "One-Hot Encoding"  # Recording encoding method

# Applying Label Encoding to the appropriate columns
le = LabelEncoder()
for col in label_encode_cols:
    data[col] = le.fit_transform(data[col])  # Transforming the column values

# Applying One-Hot Encoding to the appropriate columns
data = pd.get_dummies(data, columns=one_hot_encode_cols)

# Printing the encoding methods applied to each column
print("\nEncoding Applied:")
for col, encoding in encoding_info.items():
    print(f"Column '{col}': {encoding}")


Encoding Applied:
Column 'school': Label Encoding
Column 'sex': Label Encoding
Column 'address': Label Encoding
Column 'famsize': Label Encoding
Column 'Pstatus': Label Encoding
Column 'Mjob': One-Hot Encoding
Column 'Fjob': One-Hot Encoding
Column 'reason': One-Hot Encoding
Column 'guardian': One-Hot Encoding
Column 'schoolsup': Label Encoding
Column 'famsup': Label Encoding
Column 'paid': Label Encoding
Column 'activities': Label Encoding
Column 'nursery': Label Encoding
Column 'higher': Label Encoding
Column 'internet': Label Encoding
Column 'romantic': Label Encoding


<h2 style="color: brown;">3.4 Splitting the Student Dataset into Features and Target Variable</h2> 

In [25]:
# Separating the features and the target variable from the DataFrame
X = data.drop('G3', axis=1)  # Features: All columns except 'G3'
y = data['G3']  # Target variable: 'G3'

# Getting the list of feature names from the DataFrame
feature_names = X.columns.tolist()

# Splitting the feature data (X) and target data (y) into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

<h2 style="color: brown;">3.5 Evaluating Regression Models on the Student Performance Dataset</h2> 

In [26]:
# Creating a list of regressors to evaluate
regressors = {
    "KNN": KNeighborsRegressor(n_neighbors=5),  # K-Nearest Neighbors Regressor
    "Decision Tree": DecisionTreeRegressor(),  # Decision Tree Regressor
}

# Initializing a dictionary to store Mean Squared Errors (MSE) for each regressor
mse_all_features = {}

# Looping through each regressor to evaluate its performance
for name, model in regressors.items():
    # Fitting the model with the training data
    model.fit(X_train, y_train)
    
    # Making predictions using the test data
    predictions = model.predict(X_test)
    
    # Calculating the Mean Squared Error (MSE) of the predictions
    mse = mean_squared_error(y_test, predictions)
    
    # Storing the MSE for the current model in the dictionary
    mse_all_features[name] = mse
    
    # Printing the MSE of the current model with all features
    print(f"{name} MSE with All Features: {mse:.6f}")

KNN MSE with All Features: 19.972773
Decision Tree MSE with All Features: 32.344538


<h2 style="color: brown;">3.6 Setting Up the Genetic Algorithm for Feature Selection</h2> 

In [29]:
# Defining the function to evaluate the performance of an individual (feature subset) using a given model
def evaluate(individual, model):
    # Selecting features based on the binary representation of the individual
    selected_features = [i for i, bit in enumerate(individual) if bit]
    
    # Handling the case where no features are selected to avoid division by zero
    if len(selected_features) == 0:
        return (float('inf'),)  # Returning a high error value to indicate poor fitness

    # Creating a subset of the features based on the selected features
    X_subset = X.iloc[:, selected_features]

    # Splitting the feature subset and target data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.3, random_state=42)
    
    # Fitting the model with the training data
    model.fit(X_train, y_train)
    
    # Making predictions using the test data
    y_pred = model.predict(X_test)
    
    # Calculating the Mean Squared Error (MSE) of the predictions
    mse = mean_squared_error(y_test, y_pred)

    # Returning the MSE as the fitness value
    return (mse,)

# Setting up the Genetic Algorithm (GA) components

# Creating a fitness function class with minimization (lower MSE is better)
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))

# Creating the individual class with a fitness attribute
creator.create("Individual", list, fitness=creator.FitnessMin)

# Initializing the GA toolbox
toolbox = base.Toolbox()

# Registering the attribute generator for individuals (binary values)
toolbox.register("attr_bool", np.random.randint, 2)

# Registering the individual generator (a list of binary values)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])

# Registering the population generator (a list of individuals)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Registering the mating function (uniform crossover)
toolbox.register("mate", tools.cxUniform, indpb=0.5)

# Registering the mutation function (flip bit mutation)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)

# Registering the selection function (tournament selection)
toolbox.register("select", tools.selTournament, tournsize=3)


<h2 style="color: brown;">3.7 Running Genetic Algorithm for Feature Selection and Storing Results</h2> 

In [30]:
# Defining the function to run the Genetic Algorithm (GA) for a specific regressor
def run_ga_for_regressor(regressor_name, model):
    # Initializing the population of individuals for the GA
    population = toolbox.population(n=100)

    # Setting up a custom stopping condition
    max_stable_generations = 20
    stable_generations = 0
    previous_best_individual = None
    best_individual = None
    best_mse = float('inf')

    generation = 0
    while stable_generations < max_stable_generations:
        generation += 1

        # Registering the evaluation function with the chosen regressor
        toolbox.register("evaluate", evaluate, model=model)

        # Running one generation of the genetic algorithm
        population, _ = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=1, verbose=False)

        # Extracting the best individual from the current generation
        current_best_individual = tools.selBest(population, 1)[0]
        current_best_mse = evaluate(current_best_individual, model)[0]  # Unpacking the tuple correctly

        # Printing the results of the current best individual (binary vector)
        print(f"Generation {generation}: MSE: {current_best_mse:.6f}, Feature Subset: {current_best_individual}")

        # Checking if the current best individual is the same as the previous best
        if previous_best_individual is not None and current_best_individual == previous_best_individual:
            stable_generations += 1
        else:
            stable_generations = 0
            previous_best_individual = current_best_individual

        # Updating the best individual if necessary
        if current_best_mse < best_mse:
            best_individual = current_best_individual
            best_mse = current_best_mse

    # Determining the feature names of the best individual from the final generation
    final_selected_features = [i for i, bit in enumerate(best_individual) if bit]
    final_selected_feature_names = [feature_names[i] for i in final_selected_features]
    final_not_selected_feature_names = [feature_names[i] for i in range(len(feature_names)) if i not in final_selected_features]

    return best_individual, best_mse, final_selected_features, final_selected_feature_names, final_not_selected_feature_names

# Initializing dictionaries to store results for GA-selected features
mse_ga_features = {}
selected_features_ga = {}
selected_features_names_ga = {}
not_selected_features_names_ga = {}

# Running the GA for each regressor in the regressors dictionary
for name, model in regressors.items():
    print(f"\nRunning GA for {name}...")
    best_individual, best_mse, selected_features, selected_feature_names, not_selected_feature_names = run_ga_for_regressor(name, model)
    
    # Storing the results for each regressor
    mse_ga_features[name] = best_mse
    selected_features_ga[name] = selected_features
    selected_features_names_ga[name] = selected_feature_names
    not_selected_features_names_ga[name] = not_selected_feature_names


Running GA for KNN...
Generation 1: MSE: 17.785210, Feature Subset: [0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1]
Generation 2: MSE: 16.245378, Feature Subset: [0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1]
Generation 3: MSE: 16.564034, Feature Subset: [0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1]
Generation 4: MSE: 16.468235, Feature Subset: [0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1]
Generation 5: MSE: 15.796975, Feature Subset: [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1]
Generation 6: MSE: 15.796975, Feature Subset: [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,

<h2 style="color: brown;">3.8 Final Comparison of MSEs with All Features vs. GA-Selected Features</h2> 

In [31]:
# Printing the final comparison of Mean Squared Errors (MSEs) with all features versus GA-selected features
print("\nComparison of MSE with All Features vs GA-Selected Features:")

# Looping through each regressor to compare the MSEs
for name in regressors.keys():
    print(f"{name}: MSE with All Features: {mse_all_features[name]:.6f} | MSE with GA-Selected Features: {mse_ga_features[name]:.6f}")


Comparison of MSE with All Features vs GA-Selected Features:
KNN: MSE with All Features: 19.972773 | MSE with GA-Selected Features: 14.351261
Decision Tree: MSE with All Features: 32.344538 | MSE with GA-Selected Features: 12.659664


<h2 style="color: brown;">3.9 Displaying and Tracking Feature Selections After GA Optimization</h2> 

In [32]:
# Initializing counters to keep track of selected and not selected features
selected_feature_counter = Counter()
not_selected_feature_counter = Counter()

# Printing the final feature selections and non-selections after GA optimization
print("\nSelected Features After GA Optimization:")
for name in regressors.keys():
    print(f"{name}:")
    print(f"  Features to Select: {selected_features_ga[name]}")
    print(f"  Feature names: {selected_features_names_ga[name]}")
    print(f"  Not Selected Features: {not_selected_features_names_ga[name]}")

    # Updating the counters with the selected and not selected feature names
    selected_feature_counter.update(selected_features_names_ga[name])
    not_selected_feature_counter.update(not_selected_features_names_ga[name])


Selected Features After GA Optimization:
KNN:
  Features to Select: [1, 2, 4, 5, 6, 10, 11, 12, 15, 20, 21, 23, 24, 25, 26, 30, 31, 33, 35, 36, 40, 41, 42]
  Feature names: ['sex', 'age', 'famsize', 'Pstatus', 'Medu', 'failures', 'schoolsup', 'famsup', 'nursery', 'freetime', 'goout', 'Walc', 'health', 'absences', 'Mjob_at_home', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_other', 'Fjob_teacher', 'reason_course', 'guardian_father', 'guardian_mother', 'guardian_other']
  Not Selected Features: ['school', 'address', 'Fedu', 'traveltime', 'studytime', 'paid', 'activities', 'higher', 'internet', 'romantic', 'famrel', 'Dalc', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Fjob_health', 'Fjob_services', 'reason_home', 'reason_other', 'reason_reputation']
Decision Tree:
  Features to Select: [0, 1, 3, 5, 8, 9, 10, 11, 13, 14, 15, 17, 21, 23, 25, 27, 28, 30, 33, 34, 38, 39, 40, 42]
  Feature names: ['school', 'sex', 'address', 'Pstatus', 'traveltime', 'studytime', 'failures', 'schoolsup', 'paid', 'a

<h2 style="color: brown;">3.10 Analyzing Feature Selection Frequency After GA Optimization</h2> 

In [34]:
# Categorizing the selected features based on how often they are selected
selected_2_times = [feature for feature, count in selected_feature_counter.items() if count == 2]
selected_1_time = [feature for feature, count in selected_feature_counter.items() if count == 1]

# Categorizing the not selected features based on how often they are not selected
not_selected_2_times = [feature for feature, count in not_selected_feature_counter.items() if count == 2]
not_selected_1_time = [feature for feature, count in not_selected_feature_counter.items() if count == 1]

# Printing the categorized results of feature selection frequency
print("\nFeature Selection Frequency:")
print(f"Selected 2 times: {selected_2_times}")
print(f"Selected 1 time: {selected_1_time}")

# Printing the categorized results of feature non-selection frequency
print("\nFeature Non-Selection Frequency:")
print(f"Not Selected 2 times: {not_selected_2_times}")
print(f"Not Selected 1 time: {not_selected_1_time}")


Feature Selection Frequency:
Selected 2 times: ['sex', 'Pstatus', 'failures', 'schoolsup', 'nursery', 'goout', 'Walc', 'absences', 'Mjob_teacher', 'Fjob_other', 'guardian_father', 'guardian_other']
Selected 1 time: ['age', 'famsize', 'Medu', 'famsup', 'freetime', 'health', 'Mjob_at_home', 'Fjob_at_home', 'Fjob_teacher', 'reason_course', 'guardian_mother', 'school', 'address', 'traveltime', 'studytime', 'paid', 'activities', 'internet', 'Mjob_health', 'Mjob_other', 'Fjob_services', 'reason_other', 'reason_reputation']

Feature Non-Selection Frequency:
Not Selected 2 times: ['Fedu', 'higher', 'romantic', 'famrel', 'Dalc', 'Mjob_services', 'Fjob_health', 'reason_home']
Not Selected 1 time: ['school', 'address', 'traveltime', 'studytime', 'paid', 'activities', 'internet', 'Mjob_health', 'Mjob_other', 'Fjob_services', 'reason_other', 'reason_reputation', 'age', 'famsize', 'Medu', 'famsup', 'freetime', 'health', 'Mjob_at_home', 'Fjob_at_home', 'Fjob_teacher', 'reason_course', 'guardian_moth

<h1 style="color: brown;">4. Student Dropout and Academic Success</h1> 

<h2 style="color: brown;">4.1 Loading Dataset and Analyzing Column Types</h2> 

In [75]:
# Reading the CSV file into a DataFrame
data = pd.read_csv("data.csv")

# Selecting columns with numerical data types
numerical_cols = data.select_dtypes(include=['number']).columns

# Selecting columns with categorical data types
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

# Counting the number of numerical columns
num_numerical = len(numerical_cols)

# Counting the number of categorical columns
num_categorical = len(categorical_cols)

# Printing the number of numerical columns and listing them
print(f'Number of numerical columns: {num_numerical}')
print('Numerical columns:', list(numerical_cols))
print()

# Printing the number of categorical columns and listing them
print(f'Number of categorical columns: {num_categorical}')
print('Categorical columns:', list(categorical_cols))

Number of numerical columns: 36
Numerical columns: ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment ra

<h2 style="color: brown;">4.2 Label Encoding the 'Target' Column in the Dataset</h2> 

In [76]:
# Initializing a LabelEncoder for encoding the 'Target' column
label_encoder = LabelEncoder()

# Fitting and transforming the 'Target' column with the LabelEncoder
data['Target'] = label_encoder.fit_transform(data['Target'])

# Creating a mapping of original values to encoded values for the 'Target' column
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Printing the encoding mappings for the 'Target' column
print('Column "Target" encoding:')
for original, encoded in mapping.items():
    print(f'  Value: {original} -> Encoded: {encoded}')

Column "Target" encoding:
  Value: Dropout -> Encoded: 0
  Value: Enrolled -> Encoded: 1
  Value: Graduate -> Encoded: 2


<h2 style="color: brown;">4.3 Preparing the Dataset for Model Training and Testing</h2> 

In [77]:
# Separating the features and the target variable from the DataFrame
X = data.drop('Target', axis=1)  # Features: All columns except 'Target'
Y = data['Target']  # Target variable: 'Target'

# Getting the list of feature names from the DataFrame
feature_names = X.columns.tolist()

# Standardizing the features to have mean=0 and variance=1
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Splitting the feature data (X) and target data (Y) into training and testing sets
# 30% of the data is used for testing, and the random state ensures reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

<h2 style="color: brown;">4.4 Evaluating Classifier Models on the Dataset Using Accuracy</h2> 

In [78]:
# Creating a dictionary of classifiers to evaluate
classifiers = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier()
}

# Initializing a dictionary to store the accuracy of each classifier
accuracy_all_features = {}

# Looping through each classifier to evaluate its performance
for name, model in classifiers.items():
    # Fitting the model with the training data
    model.fit(X_train, y_train)
    
    # Making predictions using the test data
    predictions = model.predict(X_test)
    
    # Calculating the accuracy of the predictions
    accuracy = accuracy_score(y_test, predictions)
    
    # Storing the accuracy in the dictionary
    accuracy_all_features[name] = accuracy
    
    # Printing the accuracy of the current classifier with all features
    print(f"{name} Accuracy with All Features: {accuracy:.6f}")

KNN Accuracy with All Features: 0.691265
Decision Tree Accuracy with All Features: 0.663404


<h2 style="color: brown;">4.5 Setting Up the Genetic Algorithm for Feature Selection with Classifier Models</h2> 

In [79]:
# Defining the fitness function for the genetic algorithm using a given classifier
def evaluate(individual, model):
    # Selecting features based on the binary representation of the individual
    selected_features = [index for index, bit in enumerate(individual) if bit]
    
    # Handling the case where no features are selected by returning a low accuracy value
    if len(selected_features) == 0:
        return (0.0,)  # Returning a low accuracy value to indicate poor fitness
    
    # Creating a subset of the features based on the selected features
    X_subset = X[:, selected_features]
    
    # Splitting the feature subset and target data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_subset, Y, test_size=0.3, random_state=42)
    
    # Fitting the model with the training data
    model.fit(X_train, y_train)
    
    # Making predictions using the test data
    predictions = model.predict(X_test)
    
    # Calculating the accuracy of the predictions
    accuracy = accuracy_score(y_test, predictions)
    
    # Returning the accuracy as the fitness value
    return (accuracy,)

# Setting up the Genetic Algorithm (GA) components

# Creating a fitness function class with maximization (higher accuracy is better)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))

# Creating the individual class with a fitness attribute
creator.create("Individual", list, fitness=creator.FitnessMax)

# Initializing the GA toolbox
toolbox = base.Toolbox()

# Registering the attribute generator for individuals (binary values)
toolbox.register("attr_bool", np.random.randint, 2)

# Registering the individual generator (a list of binary values)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X.shape[1])

# Registering the population generator (a list of individuals)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Registering the mating function (uniform crossover)
toolbox.register("mate", tools.cxUniform, indpb=0.5)

# Registering the mutation function (flip bit mutation)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.2)

# Registering the selection function (tournament selection)
toolbox.register("select", tools.selTournament, tournsize=3)



<h2 style="color: brown;">4.6 Running Genetic Algorithm for Feature Selection and Storing Results for Classifiers</h2> 

In [80]:
# Defining a function to run the genetic algorithm (GA) for a specific classifier
def run_ga_for_classifier(classifier_name, model):
    # Initializing the population of individuals for the GA
    population = toolbox.population(n=100)  # Adjusted to match regressor's population size

    # Setting up a custom stopping condition
    max_stable_generations = 20
    stable_generations = 0
    previous_best_individual = None
    best_individual = None
    best_accuracy = 0.0

    generation = 0
    while stable_generations < max_stable_generations:
        generation += 1

        # Registering the evaluation function with the chosen classifier
        toolbox.register("evaluate", evaluate, model=model)

        # Running one generation of the genetic algorithm
        population, _ = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=1, verbose=False)

        # Extracting the best individual of the current generation
        current_best_individual = tools.selBest(population, 1)[0]
        current_best_accuracy = evaluate(current_best_individual, model)[0]  # Assuming evaluate returns a tuple

        # Printing the results of the current best individual
        print(f"Generation {generation}: Accuracy: {current_best_accuracy:.6f}, Feature Subset: {current_best_individual}")

        # Checking if the current best individual is the same as the previous best
        if previous_best_individual is not None and current_best_individual == previous_best_individual:
            stable_generations += 1
        else:
            stable_generations = 0
            previous_best_individual = current_best_individual

        # Updating the best individual if the current one has better accuracy
        if current_best_accuracy > best_accuracy:
            best_individual = current_best_individual
            best_accuracy = current_best_accuracy

    # Determining the feature names of the best individual from the final generation
    final_selected_features = [i for i, bit in enumerate(best_individual) if bit]
    final_selected_feature_names = [feature_names[i] for i in final_selected_features]
    final_not_selected_feature_names = [feature_names[i] for i in range(len(feature_names)) if i not in final_selected_features]

    return best_individual, best_accuracy, final_selected_features, final_selected_feature_names, final_not_selected_feature_names

# Initializing dictionaries to store results for GA-selected features
accuracy_ga_features = {}
selected_features_ga = {}
selected_features_names_ga = {}
not_selected_features_names_ga = {}

# Running the GA for each classifier and storing results
for name, model in classifiers.items():
    print(f"\nRunning GA for {name}...")
    best_individual, best_accuracy, selected_features, selected_feature_names, not_selected_feature_names = run_ga_for_classifier(name, model)
    
    # Storing the results for each classifier
    accuracy_ga_features[name] = best_accuracy
    selected_features_ga[name] = selected_features
    selected_features_names_ga[name] = selected_feature_names
    not_selected_features_names_ga[name] = not_selected_feature_names



Running GA for KNN...
Generation 1: Accuracy: 0.741717, Feature Subset: [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Generation 2: Accuracy: 0.741717, Feature Subset: [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Generation 3: Accuracy: 0.741717, Feature Subset: [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Generation 4: Accuracy: 0.743223, Feature Subset: [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Generation 5: Accuracy: 0.743223, Feature Subset: [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Generation 6: Accuracy: 0.743976, Feature Subset: [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Generation 7: Accuracy:

<h2 style="color: brown;">4.7 Final Comparison of MSEs with All Features vs. GA-Selected Features</h2> 

In [84]:
# Print final comparison of accuracies
print("\nComparison of Accuracy with All Features vs GA-Selected Features:")
for name, model in classifiers.items():
    # Evaluate model with all features
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    all_features_accuracy = accuracy_score(y_test, predictions)
    
    print(f"{name}: Accuracy with All Features: {all_features_accuracy:.6f} | Accuracy with GA-Selected Features: {accuracy_ga_features[name]:.6f}")


Comparison of Accuracy with All Features vs GA-Selected Features:
KNN: Accuracy with All Features: 0.691265 | Accuracy with GA-Selected Features: 0.762048
Decision Tree: Accuracy with All Features: 0.660392 | Accuracy with GA-Selected Features: 0.736446


<h2 style="color: brown;">Displaying and Tracking Feature Selections After GA Optimization</h2> 

In [82]:
# Initializing counters to keep track of selected and not selected features
selected_feature_counter = Counter()
not_selected_feature_counter = Counter()

# Printing the final feature selections and non-selections after GA optimization
print("\nSelected Features After GA Optimization:")
for name in regressors.keys():
    print(f"{name}:")
    print(f"  Features to Select: {selected_features_ga[name]}")
    print(f"  Feature names: {selected_features_names_ga[name]}")
    print(f"  Not Selected Features: {not_selected_features_names_ga[name]}")

    # Updating the counters with the selected and not selected feature names
    selected_feature_counter.update(selected_features_names_ga[name])
    not_selected_feature_counter.update(not_selected_features_names_ga[name])


Selected Features After GA Optimization:
KNN:
  Features to Select: [3, 8, 11, 16, 18, 19, 21, 22, 23, 24, 27, 30, 31, 32]
  Feature names: ['Course', "Mother's qualification", "Father's occupation", 'Tuition fees up to date', 'Scholarship holder', 'Age at enrollment', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)']
  Not Selected Features: ['Marital status', 'Application mode', 'Application order', 'Daytime/evening attendance\t', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Father's qualification", "Mother's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Gender', 'International', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (with

<h2 style="color: brown;">Analyzing Feature Selection Frequency After GA Optimization</h2> 

In [83]:
# Categorizing the selected features based on how often they are selected
selected_2_times = [feature for feature, count in selected_feature_counter.items() if count == 2]
selected_1_time = [feature for feature, count in selected_feature_counter.items() if count == 1]

# Categorizing the not selected features based on how often they are not selected
not_selected_2_times = [feature for feature, count in not_selected_feature_counter.items() if count == 2]
not_selected_1_time = [feature for feature, count in not_selected_feature_counter.items() if count == 1]

# Printing the categorized results of feature selection frequency
print("\nFeature Selection Frequency:")
print(f"Selected 2 times: {selected_2_times}")
print(f"Selected 1 time: {selected_1_time}")

# Printing the categorized results of feature non-selection frequency
print("\nFeature Non-Selection Frequency:")
print(f"Not Selected 2 times: {not_selected_2_times}")
print(f"Not Selected 1 time: {not_selected_1_time}")


Feature Selection Frequency:
Selected 2 times: ['Course', 'Tuition fees up to date', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (approved)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)']
Selected 1 time: ["Mother's qualification", "Father's occupation", 'Scholarship holder', 'Age at enrollment', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Application order', 'Daytime/evening attendance\t', "Mother's occupation", 'Displaced', 'Educational special needs', 'Debtor', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Inflation rate']

Feature Non-Selection Frequency:
Not Selected 2 times: ['Marital status', 'Application mode', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Father's qualification", 'Admissi