In [1]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")

df = pd.read_csv("G:\\Assignment\\DT-Wage.csv")
df.head()

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154


In [2]:
df.shape

(3000, 11)

In [3]:
df.describe()

Unnamed: 0,year,age,logwage,wage
count,3000.0,3000.0,3000.0,3000.0
mean,2005.791,42.414667,4.653905,111.703608
std,2.026167,11.542406,0.351753,41.728595
min,2003.0,18.0,3.0,20.085537
25%,2004.0,33.75,4.447158,85.38394
50%,2006.0,42.0,4.653213,104.921507
75%,2008.0,51.0,4.857332,128.680488
max,2009.0,80.0,5.763128,318.34243


In [4]:
df.describe(exclude=np.number)

Unnamed: 0,maritl,race,education,region,jobclass,health,health_ins
count,3000,3000,3000,3000,3000,3000,3000
unique,5,4,5,1,2,2,2
top,2. Married,1. White,2. HS Grad,2. Middle Atlantic,1. Industrial,2. >=Very Good,1. Yes
freq,2074,2480,971,3000,1544,2142,2083


In [5]:
from sklearn.model_selection import train_test_split
# Extract feature and target arrays
X = df.drop('wage', axis=1)
y = df['wage']


In [6]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
    X[col] = X[col].astype('category')

In [7]:
X.dtypes

year             int64
age              int64
maritl        category
race          category
education     category
region        category
jobclass      category
health        category
health_ins    category
logwage        float64
dtype: object

In [10]:
# Split the data into 70% train, 15% validation, and 15% test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)


In [11]:
print("Train Size: ",len(X_train))
print("Test Size: ",len(X_test))
print("Valid Size: ",len(X_valid))

Train Size:  2100
Test Size:  450
Valid Size:  450


In [23]:
import xgboost as xgb

# Creating regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)
dval_reg = xgb.DMatrix(X_valid, y_val, enable_categorical=True)

In [20]:
# Define hyperparameters
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}

# Define a range of hyperparameters for tuning
hyperparams = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.1, 0.01, 0.001],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9]
}

In [21]:
def calculate_mse(y, y_pred):
  mse = np.square(np.subtract(y, y_pred)).mean() 
  return mse

In [24]:
# Perform grid search for hyperparameter tuning
best_mse = float("inf")
best_params = None
best_test_mse = float("inf")
for max_depth in hyperparams["max_depth"]:
    for learning_rate in hyperparams["learning_rate"]:
        for subsample in hyperparams["subsample"]:
            for colsample_bytree in hyperparams["colsample_bytree"]:
                params["max_depth"] = max_depth
                params["learning_rate"] = learning_rate
                params["subsample"] = subsample
                params["colsample_bytree"] = colsample_bytree

                model = xgb.train(params=params, dtrain=dtrain_reg, num_boost_round=100)
                
                preds_valid = model.predict(dval_reg)
                val_mse = calculate_mse(y_valid, preds_valid)
                preds_test = model.predict(dtest_reg)
                test_mse = calculate_mse(y_test, preds_test)
                
                if val_mse < best_mse:
                    best_mse = val_mse
                    best_params = params.copy()
                    best_test_mse = test_mse
print("Best Parameters: Maximum Depth is ", best_params["max_depth"], ", Learning Rate is ", best_params["learning_rate"], ", Subsample is ", best_params["subsample"], ", Colsample by tree is ", best_params["colsample_bytree"])
print(f"Best Validation MSE: {best_mse}")
print(f"Mean Squared Error on Test Set: {best_test_mse}")

Best Parameters: Maximum Depth is  5 , Learning Rate is  0.1 , Subsample is  0.9 , Colsample by tree is  0.9
Best Validation MSE: 1.599237268037541
Mean Squared Error on Test Set: 6.588283050277561
