In [None]:
#%pip install ISLP

In [42]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
import pandas as pd

import xgboost as xgb


from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR)
from ISLP.bart import BART

In [43]:
# normalize function 
def normalize_single (x_array, lower, upper):
    min_x = x_array.min()*np.ones(len(x_array))
    max_x = x_array.max()*np.ones(len(x_array))
    lower_bound = lower * np.ones(len(x_array))
    upper_bound = upper * np.ones(len(x_array))
    x_array = lower_bound + (x_array - min_x) * (upper_bound - lower_bound) / (max_x - min_x)
    return x_array

In [50]:
# load the preprocessed training data test.csv
preprocessed_data = pd.read_csv('train_processed.csv')

ms_model = MS(preprocessed_data, intercept=False)
D = ms_model.fit_transform(preprocessed_data)
feature_names = list(D.columns)
X = np.asarray(D)

# Drop the ID column since it is not a feature
X = np.delete(X, 0, axis=1)

# split the data into training data and testing data
(X_train,
 X_test,
 y_train,
 y_test) = skm.train_test_split(X,
                                preprocessed_data['Price'],
                                test_size=0.2,
                                random_state=0)

log_y_train = np.log1p(y_train) # log(1 + y)

print(X_train.shape) # for checking the dimension
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(X_train)

(3576, 25)
(894, 25)
(3576,)
(894,)
[[2008 48233 13.4 ... 0 1 0]
 [2014 49800 23.1 ... 0 1 0]
 [2011 75000 16.73 ... 0 0 0]
 ...
 [2008 95291 17.92 ... 0 1 0]
 [2016 61578 18.9 ... 0 1 0]
 [2013 32953 18.6 ... 1 0 0]]


In [51]:
# Fit a regression tree to the training data
# reg = DTR(max_depth=10,criterion="absolute_error")
# reg.fit(X_train, y_train)


# train with log_y_train
reg = DTR(max_depth=10,criterion="squared_error")
reg.fit(X_train, log_y_train)

# evaluate  Mean Absolute Percentage Error (MAPE) on test data
y_pred = np.expm1(reg.predict(X_test))  # inverse of log1p
mape = np.mean(np.abs((y_test - y_pred) / y_test))
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}%')


Mean Absolute Percentage Error (MAPE): 0.00%


In [52]:
dtrain = xgb.DMatrix(X_train, label=y_train)
def mape_obj(preds, dtrain):
    y = dtrain.get_label()
    
    # Avoid division by zero
    eps = 1e-7
    y_safe = np.where(y == 0, eps, y)

    # Gradient of |y - yhat| / |y|
    grad = np.sign(preds - y) / y_safe
    
    # Hessian (approximate, constant)
    hess = 1.0 / (y_safe)   # positive => required
    
    return grad, hess

params = {
    'max_depth': 10,
    'eta': 0.1,
    # objective replaced by mape_obj, so we use "reg:squarederror" only for initialization
    'objective': 'reg:squarederror'
}

xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=300,
    obj=mape_obj  
)

# ---------------------------------------------
# 5. Predict
# ---------------------------------------------
dtest = xgb.DMatrix(X_test)
y_pred = xgb_model.predict(dtest)

In [53]:
# evaluate  Mean Absolute Percentage Error (MAPE) on test data
mape = np.mean(np.abs((y_test - y_pred) / y_test))
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}')

Mean Absolute Percentage Error (MAPE): 0.09


In [49]:
# Generate a prediction with the test_processed.csv data
# load the preprocessed training data test.csv
preprocessed_test_data = pd.read_csv('test_processed.csv')

ms_test_model = MS(preprocessed_test_data.columns, intercept=False)
D = ms_test_model.fit_transform(preprocessed_test_data)

X_test_submission = np.asarray(ms_test_model.transform(preprocessed_test_data))

ID_column = X_test_submission[:, 0]  # Save the ID column
X_test_submission = np.delete(X_test_submission, 0, axis=1)  # Remove the ID column for prediction
print(X_test_submission.shape)

dtest = xgb.DMatrix(X_test_submission)
y_pred = xgb_model.predict(dtest)

output = pd.DataFrame({'ID': ID_column, 'Price': y_pred})
print(output.shape)
output.to_csv('submission.csv', index=False)

(1491, 23)
(1491, 2)
