In [1]:
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

# Load the credit dataset from its CSV file and preview the first five rows.
DATA_PATH = "G:\\Assignment\\DT-Credit.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,No,No,Yes,South,333
1,106.025,6645,483,3,82,15,Yes,Yes,Yes,West,903
2,104.593,7075,514,4,71,11,No,No,No,West,580
3,148.924,9504,681,3,36,11,Yes,No,No,West,964
4,55.882,4897,357,2,68,16,No,No,Yes,South,331


In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(400, 11)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,45.218885,4735.6,354.94,2.9575,55.6675,13.45,520.015
std,35.244273,2308.198848,154.724143,1.371275,17.249807,3.125207,459.758877
min,10.354,855.0,93.0,1.0,23.0,5.0,0.0
25%,21.00725,3088.0,247.25,2.0,41.75,11.0,68.75
50%,33.1155,4622.5,344.0,3.0,56.0,14.0,459.5
75%,57.47075,5872.75,437.25,4.0,70.0,16.0,863.0
max,186.634,13913.0,982.0,9.0,98.0,20.0,1999.0


In [4]:
# Summary of the non-numeric columns: count, number of unique values,
# most frequent value and how often it occurs.
df.describe(exclude=np.number)

Unnamed: 0,Own,Student,Married,Region
count,400,400,400,400
unique,2,2,2,3
top,Yes,No,Yes,South
freq,207,360,245,199


In [6]:
from sklearn.model_selection import train_test_split
# Features: every column except the 'Balance' target and the 'Own' column.
X = df.drop(['Balance', 'Own'], axis=1)
# Target: the 'Balance' column.
y = df.loc[:, 'Balance']


In [7]:
# Names of every non-numeric feature column.
cats = list(X.select_dtypes(exclude=np.number).columns)

# Re-type all of them as pandas categoricals in one astype call.
X = X.astype({name: 'category' for name in cats})

In [8]:
# Inspect the column dtypes of the feature frame after the category conversion.
X.dtypes

Income        float64
Limit           int64
Rating          int64
Cards           int64
Age             int64
Education       int64
Student      category
Married      category
Region       category
dtype: object

In [9]:
# Hold out 30% of the rows, then cut that hold-out in half: the result is a
# 70% train / 15% validation / 15% test partition. The same fixed seed is used
# for both cuts so the partition is identical on every run.
SPLIT_SEED = 1
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)


In [10]:
# Report how many rows ended up in each partition.
for label, part in (
    ("Train Size: ", X_train),
    ("Test Size: ", X_test),
    ("Valid Size: ", X_valid),
):
    print(label, len(part))

Train Size:  280
Test Size:  60
Valid Size:  60


In [12]:
import xgboost as xgb

# Package each partition (features plus target) as an xgboost DMatrix.
# enable_categorical=True is passed because X carries pandas `category` columns.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)
dval_reg = xgb.DMatrix(data=X_valid, label=y_valid, enable_categorical=True)

In [13]:
# Settings shared by every model that gets trained.
# NOTE(review): "gpu_hist" needs a CUDA-enabled XGBoost build and is reported as
# deprecated from XGBoost 2.0 in favour of tree_method="hist" with
# device="cuda" -- confirm against the installed version before changing it.
params = {
    "objective": "reg:squarederror",
    "tree_method": "gpu_hist",
}

# Candidate values explored by the grid search; the row- and column-sampling
# fractions share the same three candidates.
sampling_fractions = [0.7, 0.8, 0.9]
hyperparams = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.1, 0.01, 0.001],
    "subsample": list(sampling_fractions),
    "colsample_bytree": list(sampling_fractions),
}

In [14]:
def calculate_mse(y, y_pred):
    """Return the mean squared error between targets and predictions.

    Both inputs are converted to plain float arrays before subtracting, so the
    values are always paired by position. Without this, passing two pandas
    Series whose indexes differ would make the subtraction align on index
    labels and yield NaN (or a silently wrong value) instead of the MSE.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.square(actual - predicted).mean()

In [15]:
# Perform grid search for hyperparameter tuning.
#
# Every combination in `hyperparams` is trained for 100 boosting rounds on the
# training matrix and scored on the validation matrix only. The test matrix is
# scored once, after selection, with the winning booster.
best_mse = float("inf")
best_params = None
best_model = None
best_test_mse = float("inf")

# itertools.product walks the grid with the first key varying slowest, i.e. in
# the same order as nesting one loop per hyperparameter.
grid_names = list(hyperparams)
for grid_values in itertools.product(*(hyperparams[name] for name in grid_names)):
    # Build a fresh dict per candidate so the shared `params` base is not left
    # holding whichever combination happened to run last.
    candidate_params = {**params, **dict(zip(grid_names, grid_values))}

    model = xgb.train(params=candidate_params, dtrain=dtrain_reg, num_boost_round=100)

    preds_valid = model.predict(dval_reg)
    val_mse = calculate_mse(y_valid, preds_valid)

    # Strict "<" keeps the earliest candidate when validation scores tie.
    if val_mse < best_mse:
        best_mse = val_mse
        best_params = candidate_params
        best_model = model

# A NaN validation score never compares as smaller, so nothing may be selected.
if best_params is None:
    raise RuntimeError(
        "Grid search selected no model: no candidate produced a finite validation MSE."
    )

# Score the held-out test set only with the model chosen on validation data.
preds_test = best_model.predict(dtest_reg)
best_test_mse = calculate_mse(y_test, preds_test)

print("Best Parameters: Maximum Depth is ", best_params["max_depth"], ", Learning Rate is ", best_params["learning_rate"], ", Subsample is ", best_params["subsample"], ", Colsample by tree is ", best_params["colsample_bytree"])
print(f"Best Validation MSE: {best_mse}")
print(f"Mean Squared Error on Test Set: {best_test_mse}")

Best Parameters: Maximum Depth is  5 , Learning Rate is  0.1 , Subsample is  0.7 , Colsample by tree is  0.9
Best Validation MSE: 8650.988788028697
Mean Squared Error on Test Set: 10286.082291848074
