In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from random import randint
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer


def initilization_of_population(size, template):
    population = copy.deepcopy(template)
    for _ in range(size):
        population['LotArea'].append(randint(3330, 16278))
        population['LotFrontage'].append(randint(34, 107))
        population['OverallQual'].append(randint(4, 9))
        population['OverallCond'].append(randint(4, 8))
        population['YearBuilt'].append(randint(1916, 2007))
        population['TotalBsmtSF'].append(randint(539, 1810)) 
        population['1stFlrSF'].append(randint(682, 1844)) 
        population['GrLivArea'].append(randint(864, 2480)) 
        population['TotRmsAbvGrd'].append(randint(4, 10)) 
        population['GarageYrBlt'].append(randint(1926, 2007))
        population['GarageArea'].append(randint(240, 864))
        population['YrSold'].append(randint(2006, 2010))
        
    population = pd.DataFrame(population)
    return population

def fitness_score(population, target_price, target_lotarea, w_price, w_area):
    scores = []
    # display(population)
    XGB_GA_predictions = XGBoost.predict(population)
    for price_predictions, lotareas in zip(XGB_GA_predictions, population['LotArea']):
        err_price = abs(target_price - price_predictions)
        err_area = abs(target_lotarea - lotareas)
        norm_price = err_price / target_price
        norm_area = err_area / target_lotarea
        fitness = w_price * norm_price + w_area * norm_area
        scores.append(fitness)
    # display(scores)
    inds = np.argsort(scores)

    return list(np.array(scores)[inds]), dict(population.iloc[inds].reset_index(drop=True)), list(XGB_GA_predictions[inds])

def selection(pop_after_fit,elitism,template):
    population_nextgen = copy.deepcopy(template)
    for i in range(elitism):
        for key in pop_after_fit:
            population_nextgen[key].append(pop_after_fit[key][i])
    return population_nextgen

def crossover(pop_after_sel, size, pop_after_fit, mutation_rate, factor_space, template, pct):
    pop_nextgen = pop_after_sel
    while len(pop_nextgen['LotArea']) < round(size*(1-pct)):
        parents = pd.DataFrame(pop_after_fit).iloc[:5].sample(n=2)
        parent_1 = dict(parents.iloc[0])
        parent_2 = dict(parents.iloc[1])
        child = copy.deepcopy(template)
        for key in parent_1:
            child[key] = parent_1[key] if np.random.rand() < 0.5 else parent_2[key]
        if np.random.rand() < mutation_rate:
            child = mutation(child, factor_space)
        for key in pop_nextgen:
            pop_nextgen[key].append(child[key])
    return pd.DataFrame(pop_nextgen)

def mutation(child, factor_space):
    key = np.random.choice(['LotArea', 'LotFrontage', 'OverallQual', 'OverallCond', 'YearBuilt', 'TotalBsmtSF', 
                            '1stFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'YrSold'])
    child[key] = randint(*factor_space[key])
    return child

def reinit_worst(pop_after_crossover, size, factor_space):
    worst_n = size - len(pop_after_crossover['LotArea'])
    if worst_n == 0:
        pass
    else:
        for i in range(worst_n):
            pop_after_crossover.loc[size - worst_n + i] = [randint(low, high) for low, high in factor_space.values()]
    return pop_after_crossover

def generations(size, mutation_rate, n_gen, elitism, target_price, target_lotarea, w_price, w_area, template, pct):
    best_chromo = copy.deepcopy(template)
    best_score = []
    best_predictions = []
    factor_space = {
        'LotArea': (3330, 16278),
        'LotFrontage': (34, 107),
        'OverallQual': (4, 9),
        'OverallCond': (4, 8),
        'YearBuilt': (1916, 2007),
        'TotalBsmtSF': (539, 1810),
        '1stFlrSF': (682, 1844),
        'GrLivArea': (864, 2480), 
        'TotRmsAbvGrd': (4, 10), 
        'GarageYrBlt': (1926, 2007),
        'GarageArea': (240, 864),
        'YrSold': (2006, 2010)
    }
    population_nextgen = initilization_of_population(size, template)
    for i in range(n_gen):
        scores, pop_after_fit, predictions_list = fitness_score(population_nextgen, target_price, target_lotarea, w_price, w_area)
        print('Best score in generation',i+1,':',scores[0])  #2
        pop_after_sel = selection(pop_after_fit,elitism, template)
        pop_after_crossover = crossover(pop_after_sel, size, pop_after_fit, mutation_rate, factor_space, template, pct)
        population_nextgen = reinit_worst(pop_after_crossover, size, factor_space)
        for key in best_chromo:
            best_chromo[key].append(pop_after_fit[key][0])
        best_score.append(scores[0])
        best_predictions.append(predictions_list[0])
    return best_chromo,best_score, best_predictions

In [None]:
# # LotArea, LotFrontage, OverallQual, OverallCond, YearBuilt, TotalBsmtSF, 1stFlrSF, GrLivArea, TotRmsAbvGrd, GarageYrBlt, GarageArea, YrSold, (comeout)SalePrice
# # pd.set_option('display.max_rows', None)       # 顯示所有列
# # pd.set_option('display.max_columns', None)    # 顯示所有欄
# # pd.set_option('display.width', None)          # 自動調整寬度
# # pd.set_option('display.max_colwidth', None)   # 顯示欄位中所有內容

train_home_data = pd.read_csv('home-data-for-ml-course/train.csv')
home_data = train_home_data[['LotArea', 'LotFrontage', 'OverallQual', 'OverallCond', 'YearBuilt', 'TotalBsmtSF', 
                           '1stFlrSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'YrSold', 'SalePrice']]
# home_data.dropna(axis=0, inplace=True)
# display(home_data)
Y_full = home_data["SalePrice"]
X_full = home_data
X_full.drop(['SalePrice'],axis = 1,inplace = True)

imputer = SimpleImputer(strategy='median')
X_full = pd.DataFrame(imputer.fit_transform(X_full), columns=X_full.columns)


X_full['HouseAge'] = X_full['YrSold'] - X_full['YearBuilt']
# X_full['GarageAge'] = X_full['YrSold'] - X_full['GarageYrBlt']
# X_full['oterGrLivArea'] = X_full['GrLivArea'] - X_full['1stFlrSF']

X_full['TotalArea'] = X_full['TotalBsmtSF'] + X_full['GrLivArea']
X_full.drop(['YrSold', 'TotalBsmtSF', '1stFlrSF'],axis = 1,inplace = True)
# Y_full = np.log1p(Y_full)  # also apply to target


# X_train, X_test, Y_train, Y_test = train_test_split(X_full, Y_full, test_size=0.2, random_state=42)
# display(X_test)
print("Home dataset:\n",X_full.shape[0],"Records\n", X_full.shape[1],"Features")

Home dataset:
 1460 Records
 11 Features


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_full.drop(['SalePrice'],axis = 1,inplace = True)


In [29]:
# XGBoost = XGBRegressor(objective='reg:squarederror', learning_rate=0.05, max_depth=4, subsample=0.8, colsample_bytree=0.8, 
                    #    n_estimators=500, reg_alpha=1, reg_lambda=5, random_state=42)
# XGBoost = XGBRegressor(n_estimators=1000, eta=0.01, max_depth=4, subsample=0.6, colsample_bytree=0.9)
# xgb_base = XGBRegressor(objective='reg:squarederror', random_state=42)

# 1: add TotalBsmtSF and GrLivArea (Original)
# 18467
# 2: add TotalBsmtSF, GrLivArea and 
# 
# 3: add GarageAge
# 

# XGBoost = XGBRegressor(n_estimators=1000, eta=0.01, max_depth=4, subsample=0.6, colsample_bytree=0.9, random_state=42)

params = {
    'n_estimators': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.7, 0.9],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'random_state': [42]
}

XGBs = XGBRegressor()
search = RandomizedSearchCV(XGBs, param_distributions=params, scoring='neg_mean_absolute_error', cv=5, n_iter=20, random_state=42)
search.fit(X_full, Y_full)
print(search.best_params_)

XGBoost = XGBRegressor(**search.best_params_)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

neg_XGB_score = cross_val_score(XGBoost, X_full, Y_full, scoring='neg_mean_absolute_error', cv=kf)
print("XGB_score:", -neg_XGB_score.mean())


XGBoost.fit(X_full, Y_full)
importances = XGBoost.feature_importances_
print(importances)
# XGB_predictions = XGBoost.predict(X_test)
# XGB_score = mean_absolute_error(Y_test, XGB_predictions)

template = {
    'LotArea': [], 
    'LotFrontage': [], 
    'OverallQual': [],
    'OverallCond': [],
    'YearBuilt': [],
    'TotalBsmtSF': [],
    '1stFlrSF': [],
    'GrLivArea': [], 
    'TotRmsAbvGrd': [], 
    'GarageYrBlt': [],
    'GarageArea': [],
    'YrSold': []
}
# chromo_df_bc,score_bc, predictions_list = generations(size=30,mutation_rate=0.25,n_gen=30, elitism=3,target_price=183000, 
                                                    #   target_lotarea=9000, w_price=0.7, w_area=0.3, template=template, pct=0)
# G_best_index = score_bc.index(min(score_bc))
# G_best_score, G_best_chromo, G_best_predictions = score_bc[G_best_index], pd.DataFrame(chromo_df_bc).iloc[G_best_index], predictions_list[G_best_index]
# print("XGB_GA_score:", G_best_score)
# display(G_best_chromo)
# print('Sale Price：', G_best_predictions)


{'subsample': 0.5, 'random_state': 42, 'n_estimators': 1500, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
XGB_score: 17797.799534460617
[0.03195796 0.02496193 0.39537162 0.0282692  0.04943192 0.07461473
 0.02576673 0.02967937 0.05537464 0.06940906 0.2151629 ]


In [30]:
quantity = np.arange(1, len(XGB_predictions)+1)
fig = plt.figure(figsize=(14, 5)) 
plt.plot(quantity, XGB_predictions, 'r')
plt.plot(quantity, Y_test, 'g')
plt.title('Compare Predictions with Target Values')
plt.xlabel('Index')
plt.ylabel('SalePrice')
plt.legend(['XGB Predictions', 'Target Values'])
plt.show()
fig2 = plt.figure(figsize=(14, 5)) 
quantity2 = np.arange(1, len(predictions_list)+1)
desired_price = [183000 for _ in range(len(quantity2))]
plt.plot(quantity2, predictions_list, 'r')
plt.plot(quantity2, desired_price, 'g')
plt.title('Compare Predictions with Target Values')
plt.xlabel('Index')
plt.ylabel('Desired Price')
plt.legend(['XGB_GA Predictions', 'Target Values'])
plt.show()

NameError: name 'XGB_predictions' is not defined