In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, hamming_loss, accuracy_score
from deap import base, creator, tools, algorithms

# **Load Dataset**
df = pd.read_csv('Data/New-Class-smell.csv')
print("Shape of original DataFrame: ", df.shape)

# **Drop Address Column**
df = df.drop(columns=['Address','Hierarchy Duplication','Futile Abstract Pipeline','Futile Hierarchy'])

# **Check for Missing Values**
missing_values = df.isnull().sum()
print("\nMissing Values:\n", missing_values[missing_values > 0])

# Drop rows where any label in smell_columns is NaN
smell_columns = [
    'Brain Class', 'Data Class', 
    'God Class', 'Schizofrenic Class', 'Model Class'
]

print("\nDropping rows where any label is NaN...")
initial_shape = df.shape
df = df.dropna(subset=smell_columns)
print(f"Rows before dropping: {initial_shape[0]}, Rows after dropping: {df.shape[0]}")

# **Check for Duplicate Rows**
duplicates = df[df.duplicated()]
print(f"\nDuplicate Rows Found: {duplicates.shape[0]}")
df = df.drop_duplicates()
print("Shape after removing duplicate rows: ", df.shape)

# **Split Dataset**
X = df.drop(columns=smell_columns)  # Features
y = df[smell_columns]  # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nShape of Training Features: ", X_train.shape)
print("Shape of Training Labels: ", y_train.shape)
print("Shape of Test Features: ", X_test.shape)
print("Shape of Test Labels: ", y_test.shape)

# **Genetic Algorithm for Feature Selection**

# Create a fitness function to evaluate feature subsets
def evaluate_feature_subset(individual):
    """
    Evaluates a feature subset using the classification performance on training data.
    """
    selected_features = [index for index, include in enumerate(individual) if include == 1]
    if len(selected_features) == 0:  # Avoid empty feature subsets
        return float('inf'),
    
    # Subset the data based on selected features
    X_train_subset = X_train.iloc[:, selected_features]
    X_test_subset = X_test.iloc[:, selected_features]
    
    # Train and evaluate the model
    multi_label_model = MultiOutputClassifier(RandomForestClassifier(class_weight="balanced", random_state=42))
    multi_label_model.fit(X_train_subset, y_train)
    y_test_pred = multi_label_model.predict(X_test_subset)
    
    # Use Hamming loss as the fitness metric (lower is better)
    return hamming_loss(y_test, y_test_pred),

# Setup DEAP framework for GA
num_features = X_train.shape[1]

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # Minimize hamming_loss
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)  # Binary genes: 0 or 1
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", tools.cxTwoPoint)  # Two-point crossover
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Flip bit mutation
toolbox.register("select", tools.selTournament, tournsize=3)  # Tournament selection
toolbox.register("evaluate", evaluate_feature_subset)

# GA Parameters
population_size = 20
num_generations = 10
crossover_probability = 0.8
mutation_probability = 0.2

# Initialize population
population = toolbox.population(n=population_size)

# Run GA
best_individuals = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=crossover_probability,
    mutpb=mutation_probability,
    ngen=num_generations,
    verbose=True
)

# Get the best feature subset
best_individual = tools.selBest(population, k=1)[0]
selected_features = [index for index, include in enumerate(best_individual) if include == 1]
print(f"\nBest Feature Subset: {selected_features}")
# Map indices to column names
selected_columns = X_train.columns[selected_features]
print("Selected Columns:", selected_columns)


# Subset data based on selected features
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

# Train and evaluate final model on selected features
final_model = MultiOutputClassifier(RandomForestClassifier(class_weight="balanced", random_state=42))
final_model.fit(X_train_selected, y_train)

y_test_pred = final_model.predict(X_test_selected)
print("\n--- Final Model Performance on Test Data ---")
print(classification_report(y_test, y_test_pred, target_names=smell_columns))

# Overall Hamming Loss
final_hamming_loss = hamming_loss(y_test, y_test_pred)
print(f"\nFinal Hamming Loss: {final_hamming_loss:.4f}")

# **Accuracy Score**
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


Shape of original DataFrame:  (373400, 50)

Missing Values:
 Series([], dtype: int64)

Dropping rows where any label is NaN...
Rows before dropping: 373400, Rows after dropping: 373400

Duplicate Rows Found: 134040
Shape after removing duplicate rows:  (239360, 46)

Shape of Training Features:  (191488, 41)
Shape of Training Labels:  (191488, 5)
Shape of Test Features:  (47872, 41)
Shape of Test Labels:  (47872, 5)




gen	nevals
0  	20    
1  	14    
2  	19    
3  	12    
4  	17    
5  	19    
6  	15    
7  	17    
8  	16    
9  	16    
10 	18    

Best Feature Subset: [0, 1, 2, 3, 4, 6, 8, 9, 10, 14, 15, 18, 19, 21, 23, 24, 25, 26, 27, 29, 31, 36, 37, 38, 39, 40]
Selected Columns: Index(['ABUSEINH', 'AMW', 'ATFD', 'BOvM', 'BUR', 'CC', 'CRIX', 'DAC', 'DIT',
       'GREEDY', 'HDUPCLS', 'LOCC', 'NAS', 'NDU', 'NOAM', 'NOD', 'NODD', 'NOM',
       'NOPA', 'NSPECM', 'NrBM', 'PNAS', 'SCHIZO', 'TCC', 'WMC', 'WOC'],
      dtype='object')

--- Final Model Performance on Test Data ---
                    precision    recall  f1-score   support

       Brain Class       0.98      1.00      0.99       182
        Data Class       1.00      1.00      1.00      3680
         God Class       1.00      1.00      1.00       712
Schizofrenic Class       1.00      1.00      1.00      3873
       Model Class       1.00      1.00      1.00     47795

         micro avg       1.00      1.00      1.00     56242
         ma

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
