In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from deap import base, creator, tools, algorithms
import random
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Load dataset
df = pd.read_csv('clean_property_data.csv')

# Define target and features
target = 'TOTAL_TAX'
X = df.drop(columns=[target])
y = df[target]

# Handle categorical variables
X = pd.get_dummies(X)

# Ensure no NaNs
X.fillna(0, inplace=True)

In [4]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GA parameters
POP_SIZE = 30
GENS = 15
CXPB = 0.5
MUTPB = 0.2

In [5]:
# GA setup
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # Minimize MAE
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
n_features = X_train.shape[1]

# Attribute generator: binary inclusion for each feature
toolbox.register("attr_bool", lambda: random.randint(0, 1))
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=n_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Fitness function
def eval_individual(individual):
    selected = [bool(gene) for gene in individual]
    if sum(selected) == 0:
        return (float('inf'),)  # Penalize empty selection

    model = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
    model.fit(X_train.loc[:, selected], y_train)
    preds = model.predict(X_test.loc[:, selected])
    mae = mean_absolute_error(y_test, preds)
    return (mae,)

# Operators
toolbox.register("evaluate", eval_individual)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)

In [6]:
# GA main loop
def run_ga():
    pop = toolbox.population(n=POP_SIZE)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=GENS,
                                   stats=stats, halloffame=hof, verbose=True)

    return hof[0]

In [7]:
# Run GA
best_ind = run_ga()

# Final model with best features
selected_features = [col for col, sel in zip(X_train.columns, best_ind) if sel == 1]
print("Selected Features:", selected_features)

final_model = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
final_model.fit(X_train[selected_features], y_train)
preds = final_model.predict(X_test[selected_features])
mae = mean_absolute_error(y_test, preds)

print(f"Final MAE on Test Set: {mae:.4f}")


gen	nevals	avg        	min        
0  	30    	2.90577e+06	1.77075e+06
1  	16    	2.10451e+06	1.77075e+06
2  	18    	1.96525e+06	1.77075e+06
3  	18    	1.83972e+06	1.77075e+06
4  	19    	1.96326e+06	1.73846e+06
5  	16    	1.91288e+06	1.73846e+06
6  	19    	1.77218e+06	1.73846e+06
7  	12    	1.9114e+06 	1.65504e+06
8  	21    	1.72949e+06	1.65504e+06
9  	18    	1.71621e+06	1.65504e+06
10 	20    	1.6932e+06 	1.65504e+06
11 	20    	1.71518e+06	1.65504e+06
12 	18    	1.88939e+06	1.65504e+06
13 	18    	1.7167e+06 	1.65504e+06
14 	13    	1.81615e+06	1.65504e+06
15 	21    	1.94396e+06	1.65504e+06
Selected Features: ['Unnamed: 0', 'YEAR', 'NUM_PROP_WARDNO', 'NUM_PROP_RATE', 'NUM_PROP_AREA', 'NUM_PROP_ANNUALRENT']
Final MAE on Test Set: 1655043.6950
