In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [3]:

# Step 1: Load the dataset (make sure the CSV is in your working directory)
churn_data = pd.read_csv('Telco-Customer-Churn.csv')

# Step 2: Drop unnecessary column (customerID is not useful for prediction)
churn_data = churn_data.drop('customerID', axis=1)

# Step 3: Convert TotalCharges to numeric (it's stored as string; some values are blank)
churn_data['TotalCharges'] = pd.to_numeric(churn_data['TotalCharges'], errors='coerce')
# Fill NaN in TotalCharges (only 11 rows) with median
churn_data['TotalCharges'].fillna(churn_data['TotalCharges'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  churn_data['TotalCharges'].fillna(churn_data['TotalCharges'].median(), inplace=True)


In [4]:
# Step 4: Encode categorical variables
label_encoders = {}
for column in churn_data.columns:
    if churn_data[column].dtype == 'object':
        le = LabelEncoder()
        churn_data[column] = le.fit_transform(churn_data[column])
        label_encoders[column] = le

# Now all columns are numeric
X = churn_data.iloc[:, :-1]  # All features except last column
y = churn_data.iloc[:, -1]   # Last column is 'Churn' (0 = No, 1 = Yes)

# Step 5: Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Step 6: Train XGBoost classifier
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,  # Increased for better performance
    seed=123,
    use_label_encoder=False,  # Suppress warning
    eval_metric='logloss'
)

xg_cl.fit(X_train, y_train)

# Step 7: Predict and evaluate
preds = xg_cl.predict(X_test)
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
print("Accuracy: %f" % (accuracy))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.784244


In [5]:
# Import necessary modules
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the DecisionTreeClassifier with max_depth=4
dt_clf_4 = DecisionTreeClassifier(max_depth=4, random_state=123)

# Fit the classifier to the training data
dt_clf_4.fit(X_train, y_train)

# Predict labels for the test set
y_pred_4 = dt_clf_4.predict(X_test)

# Compute accuracy
accuracy = float(np.sum(y_pred_4 == y_test)) / y_test.shape[0]
print(f"Accuracy: {accuracy:.4f}")



Accuracy: 0.9649


In [6]:
# Import necessary libraries
import xgboost as xgb
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Create the DMatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary
params = {"objective": "binary:logistic", "max_depth": 3}

# Perform 3-fold cross-validation
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, 
                    metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy
print(f"Accuracy: {((1 - cv_results['test-error-mean']).iloc[-1]):.4f}")

   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.055358         0.014118         0.098459        0.016569
1          0.026367         0.003758         0.061524        0.013876
2          0.012302         0.001236         0.061533        0.013930
3          0.011427         0.002497         0.052752        0.015618
4          0.008790         0.002494         0.052752        0.015618
Accuracy: 0.9472


In [7]:
# Import necessary libraries
import xgboost as xgb
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Create the DMatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary
params = {"objective": "binary:logistic", "max_depth": 3}

# Perform 3-fold cross-validation with AUC metric
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, 
                    metrics="auc", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the AUC
print(f"AUC: {(cv_results['test-auc-mean']).iloc[-1]:.4f}")




   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.989659       0.005199       0.959695      0.025255
1        0.995100       0.003751       0.972271      0.023940
2        0.997122       0.002032       0.973122      0.025047
3        0.997103       0.002030       0.982087      0.013069
4        0.997832       0.001851       0.982567      0.013554
AUC: 0.9826


In [8]:
# Import necessary libraries
import xgboost as xgb
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Generate synthetic regression dataset
X, y = make_regression(n_samples=1000, n_features=4, noise=0.1, random_state=123)

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters
params = {"objective": "reg:squarederror", "max_depth": 3, "eta": 0.1}

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=10)

# Predict on test set
y_pred = xgb_model.predict(dtest)

# Compute RMSE and MAE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 45.1356
MAE: 35.6638


In [9]:
# Import necessary libraries
import xgboost as xgb                 # XGBoost library for gradient boosting
import pandas as pd                   # For data manipulation
import numpy as np                    # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # For evaluation

In [10]:
# Load the dataset
boston_data = pd.read_csv("boston_housing.csv")

# Separate features (X) and target (y)
X = boston_data.iloc[:, :-1]  # All columns except the last
y = boston_data.iloc[:, -1]   # Last column as target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Initialize the XGBoost regressor
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)

# Train the model
xg_reg.fit(X_train, y_train)

# Make predictions on the test set
preds = xg_reg.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Print accuracy metrics
print("Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Model Performance Metrics:
Mean Absolute Error (MAE): 2.85
Mean Squared Error (MSE): 21.20
R² Score: 0.74


In [11]:

# Load the dataset
boston_data = pd.read_csv("boston_housing.csv")

# Separate features (X) and target (y)
X = boston_data.iloc[:, :-1]  # All columns except the last
y = boston_data.iloc[:, -1]   # Last column as target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

DM_train = xgb.DMatrix(data=X_train, label=y_train)

DM_test = xgb.DMatrix(data=X_test, label=y_test)  # Use y_test here instead of y_train

params= {"booster":"gblinear", "objective":"reg:squarederror"}
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)

preds =xg_reg.predict(DM_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Print accuracy metrics
print("Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Model Performance Metrics:
Mean Absolute Error (MAE): 4.19
Mean Squared Error (MSE): 37.06
R² Score: 0.55


In [12]:
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)

# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_reg.predict(X_test)

# Compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 4.604776


In [13]:
# Convert the training and testing sets into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test) 

# Create the parameter dictionary
params = {"booster": "gblinear", "objective": "reg:squarederror"} 

# Train the model
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=5)

# Predict the labels of the test set
preds = xg_reg.predict(DM_test) #

# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))


RMSE: 6.454039


In [14]:
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:squarederror", "max_depth":4}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=4, num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Extract and print final boosting round metric
print((cv_results["test-rmse-mean"]).tail(1))

   train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0         6.989180        0.032758        7.188751       0.188563
1         5.459640        0.032484        5.959252       0.216005
2         4.369413        0.028176        5.150507       0.258300
3         3.612622        0.043512        4.570613       0.344081
4         3.071021        0.044801        4.296591       0.447778
4    4.296591
Name: test-rmse-mean, dtype: float64


In [15]:
import xgboost as xgb
import pandas as pd

# Load the dataset
boston_data = pd.read_csv("boston_housing.csv")

# Separate features and target
X = boston_data.iloc[:, :-1]
y = boston_data.iloc[:, -1]

# Create DMatrix
boston_dmatrix = xgb.DMatrix(data=X, label=y)

# Set base parameters
params = {"objective": "reg:squarederror", "max_depth": 4}

# L1 regularization values to test
l1_params = [1, 10, 100]
rmses_l1 = []

# Loop through each alpha value
for reg in l1_params:
    params["alpha"] = reg  # Set L1 regularization
    cv_results = xgb.cv(
        dtrain=boston_dmatrix,
        params=params,
        nfold=4,
        num_boost_round=10,
        metrics="rmse",
        as_pandas=True,
        seed=123
    )
    # Extract final RMSE
    final_rmse = cv_results["test-rmse-mean"].tail(1).values[0]
    rmses_l1.append(final_rmse)

# Display results
print("Best RMSE as a function of alpha:")
print(pd.DataFrame(list(zip(l1_params, rmses_l1)), columns=["alpha", "rmse"]))


Best RMSE as a function of alpha:
   alpha      rmse
0      1  3.685441
1     10  3.761246
2    100  4.461392


In [None]:

from matplotlib import pyplot as plt
from matplotlib import image

# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:squarederror", "max_depth":2}

# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# Plot all trees in a loop
for i in range(10):
    xgb.plot_tree(xg_reg, num_trees=i)
    plt.title(f"Tree {i}")
    

In [None]:
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:squarederror", "max_depth":4}

# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# Plot the feature importances
xgb.plot_importance(xg_reg)
plt.show()

# Day 2

## tuning parameters manually (tuning boosting rounds)

In [None]:
# Tuning the number of boosting rounds

# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary for each tree: params 
params = {"objective": "reg:squarederror", "max_depth": 3}

# Create list of number of boosting rounds
num_rounds = [5, 10, 15]

# Empty list to store final round rmse per XGBoost model
final_rmse_per_round = []

# Iterate over num_rounds and build one model per num_boost_round parameter
for curr_num_rounds in num_rounds:
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=curr_num_rounds, metrics="rmse", as_pandas=True, seed=123)
    # Append final round RMSE
    final_rmse_per_round.append(cv_results["test-rmse-mean"].tail().values[-1])

# Print the resultant DataFrame
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"]))

## # Automated boosting round selection using early stopping

In [None]:
# Automated boosting round selection using early stopping

# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary for each tree: params
params = {"objective": "reg:squarederror", "max_depth": 4}

# Perform cross-validation with early stopping: cv_results
cv_results = xgb.cv(
    dtrain=housing_dmatrix,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123
)

# Print cv_results
print(cv_results)

## ETA (shrinkage ) 

## tuning max depth of tree

In [None]:
# Tuning max_depth

# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary
params = {"objective": "reg:squarederror"}

# Create list of max_depth values
max_depths = [2, 5, 10, 20]
best_rmse = []

# Systematically vary the max_depth
for curr_val in max_depths:
    params["max_depth"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=2,
        num_boost_round=10,
        early_stopping_rounds=5,
        metrics="rmse",
        seed=123,
        as_pandas=True
    )
    # Append the final round RMSE to best_rmse 
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(max_depths, best_rmse)), columns=["max_depth", "best_rmse"]))

## tuning colsample by tree

In [None]:
# Tuning colsample_bytree

# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary
params = {"objective": "reg:squarederror", "max_depth": 3}

# Create list of hyperparameter values: colsample_bytree_vals
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

# Systematically vary the hyperparameter value 
for curr_val in colsample_bytree_vals:
    params["colsample_bytree"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(
        dtrain=housing_dmatrix, 
        params=params, 
        nfold=2,
        num_boost_round=10, 
        early_stopping_rounds=5,
        metrics="rmse", 
        as_pandas=True, 
        seed=123
    )
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])

# Print the resultant DataFrame
print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree", "best_rmse"]))

## girdsearch cv

In [1]:
# Grid Search with XGBoost

import pandas as pd               # Load pandas for data manipulation
import xgboost as xgb             # Load XGBoost library
import numpy as np                # Load NumPy for numerical operations
from sklearn.model_selection import GridSearchCV  # Import GridSearchCV for hyperparameter tuning

housing_data = pd.read_csv("AmesHousing.csv")  
# Load the preprocessed Ames housing dataset from CSV into a DataFrame

X, y = housing_data[housing_data.columns.tolist()[:-1]], housing_data[housing_data.columns.tolist()[-1]]  
# Split the dataset into features (X) and target (y). Assumes last column is the target variable.

housing_dmatrix = xgb.DMatrix(data=X, label=y)  
# Convert the data into XGBoost’s optimized DMatrix format (optional for scikit-learn interface, but useful for native XGBoost)

gbm_param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 0.9],   # Try 4 different learning rates (eta)
    'n_estimators': [200],                   # Fix number of boosting rounds to 200
    'subsample': [0.3, 0.5, 0.9]             # Try 3 different subsample ratios
}
# Define the grid of hyperparameters to search over. Total combinations = 4 × 1 × 3 = 12

gbm = xgb.XGBRegressor()  
# Create an XGBoost regressor object using scikit-learn API

grid_mse = GridSearchCV(
    estimator=gbm,                   # The model to tune
    param_grid=gbm_param_grid,       # The hyperparameter grid
    scoring='neg_mean_squared_error',# Use negative MSE as scoring (scikit-learn convention)
    cv=4,                            # Use 4-fold cross-validation
    verbose=1                        # Print progress during training
)
# Set up the grid search with cross-validation and scoring metric
grid_mse.fit(X, y)  
# Fit the grid search object to the data. Trains 12 models (one for each parameter combo) using 4-fold CV.

print("Best parameters found: ", grid_mse.best_params_)  
# Print the best combination of hyperparameters found during grid search

print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))  
# Convert the best negative MSE score to RMSE and print it

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:MS Zoning: object, Street: object, Alley: object, Lot Shape: object, Land Contour: object, Utilities: object, Lot Config: object, Land Slope: object, Neighborhood: object, Condition 1: object, Condition 2: object, Bldg Type: object, House Style: object, Roof Style: object, Roof Matl: object, Exterior 1st: object, Exterior 2nd: object, Mas Vnr Type: object, Exter Qual: object, Exter Cond: object, Foundation: object, Bsmt Qual: object, Bsmt Cond: object, Bsmt Exposure: object, BsmtFin Type 1: object, BsmtFin Type 2: object, Heating: object, Heating QC: object, Central Air: object, Electrical: object, Kitchen Qual: object, Functional: object, Fireplace Qu: object, Garage Type: object, Garage Finish: object, Garage Qual: object, Garage Cond: object, Paved Drive: object, Pool QC: object, Fence: object, Misc Feature: object, Sale Type: object, Sale Condition: object

## Day 3

In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Load data
housing_data = pd.read_csv("AmesHousing.csv")
X = housing_data[housing_data.columns.tolist()[:-1]]
y = housing_data[housing_data.columns.tolist()[-1]]

# Define parameter grid
gbm_param_grid = {
    'learning_rate': np.arange(0.05, 1.05, 0.05),
    'n_estimators': [200],
    'subsample': np.arange(0.05, 1.05, 0.05)
}

# Initialize model
gbm = XGBRegressor(objective='reg:squarederror')

# Setup RandomizedSearchCV
randomized_mse = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=gbm_param_grid,
    n_iter=25,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1
)

# Fit model
randomized_mse.fit(X, y)

# Output results
print("Best parameters found:", randomized_mse.best_params_)
print("Lowest RMSE found:", np.sqrt(np.abs(randomized_mse.best_score_)))


In [None]:
gbm_param_grid = {
'colsample_bytree': [0.3, 0.7],
'n_estimators': [50],
'max_depth': [2, 5]
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

# Perform grid search: grid_mse
grid_mse = GridSearchCV(estimator= gbm,
param_grid=gbm_param_grid,
scoring= 'neg_mean_squared_error',
cv =4,
verbose = 1)

# Fit grid_mse to the data
grid_mse.fit(X,y)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

In [None]:
gbm_param_grid = {
'n_estimators': [25],
'max_depth': range(2,12)
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(n_estimators=10)

# Perform random search: grid_mse
randomized_mse = RandomizedSearchCV(estimator =gbm,
param_distributions = gbm_param_grid,
scoring='neg_mean_squared_error',
cv =4, 
n_iter = 5,
verbose =1 )

# Fit randomized_mse to the data
randomized_mse.fit(X,y)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

In [11]:
import pandas as pd
from sklearn. ensemble import RandomForestRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score


In [12]:
names = ["crime", "zone", "industry", "charles", "no", "rooms",
"age", "distance", "radial", "tax", "pupil", "aam", "lower", "med_price"]
data = pd.read_csv("boston_housing.csv", names=names)
X, y = data.iloc[:,:-1], data.iloc[:, -1]


In [None]:
X_encoded = pd.get_dummies(X)

X, y = data.iloc[:, :-1], data.iloc[:, -1]
X_encoded = pd.get_dummies(X)

rf_pipeline = Pipeline([
    ("st_scaler", StandardScaler()),
    ("rf_model", RandomForestRegressor())
])

scores = cross_val_score(rf_pipeline, X_encoded, y, scoring="neg_mean_squared_error", cv=10)
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final Average RMSE:", round(final_avg_rmse, 2))



In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor


In [7]:

# Load dataset
df = pd.read_csv("AmesHousing.csv")

# Separate features and target
X = df.drop(columns="SalePrice")
y = df["SalePrice"]



In [8]:
# Fill missing values
X = X.fillna("Missing")  # for categorical
X = X.fillna(X.mean(numeric_only=True))  # for numeric

# Convert to dictionary format
df_dict = X.to_dict(orient="records")



In [9]:
# Apply DictVectorizer
dv = DictVectorizer(sparse=False)
X_encoded = dv.fit_transform(df_dict)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)



In [10]:
# Initialize and train model
model = XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.1, subsample=0.8)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Final Test RMSE:", round(rmse, 2))
print("Final Test R²:", round(r2, 4))

Final Test RMSE: 24966.9
Final Test R²: 0.9223


# Day 4

In [18]:
import pandas as pd
from scipy.io import arff

# Load ARFF file
data, meta = arff.loadarff("chronic_kidney_disease.arff")

# Convert to DataFrame
kidney_data = pd.DataFrame(data)

# Decode byte strings to regular strings
for col in kidney_data.select_dtypes([object]):
    kidney_data[col] = kidney_data[col].str.decode("utf-8")

# Target and features
kidney_target_name = "class"
kidney_feature_names = [col for col in kidney_data.columns if col != kidney_target_name]
X = kidney_data[kidney_feature_names]
y = kidney_data[kidney_target_name].apply(lambda x: 1 if x == "ckd" else 0)


In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

# Identify categorical and numeric columns
categorical_columns = X.select_dtypes(include="object").columns.tolist()
non_categorical_columns = X.select_dtypes(exclude="object").columns.tolist()

# Numeric imputer
numeric_imputation_mapper = DataFrameMapper(
    [([col], SimpleImputer(strategy="median")) for col in non_categorical_columns],
    input_df=True,
    df_out=True
)

# Categorical imputer
categorical_imputation_mapper = DataFrameMapper(
    [(col, SimpleImputer(strategy="most_frequent")) for col in categorical_columns],
    input_df=True,
    df_out=True
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import numpy as np

pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier(max_depth=3))
])

# 3-fold cross-validation
cross_val_scores = cross_val_score(
    pipeline,
    kidney_data[kidney_feature_names],
    y,
    scoring="roc_auc",
    cv=3
)

print("3-fold AUC: ", np.mean(cross_val_scores))


In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper)
])


# above code will return error as the sklearn_pandas is obsolete and not available now
# below code is full implementation of xgboost with hyperparam tuning and for xgb full code all in pipeline

In [50]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.impute import SimpleImputer


In [51]:

# Load the Boston Housing dataset
df = pd.read_csv("boston_housing.csv")

# Split into features and target
X = df.drop("MEDV", axis=1)
y = df["MEDV"]



In [52]:
# XGBoost Pipeline and Tuning
xgb_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("xgb_model", xgb.XGBRegressor(objective="reg:squarederror"))
])

xgb_param_grid = {
    "xgb_model__subsample": [0.5, 0.7, 1.0],
    "xgb_model__max_depth": [3, 5, 7],
    "xgb_model__colsample_bytree": [0.5, 0.7, 1.0],
    "xgb_model__n_estimators": [100, 200],
    "xgb_model__learning_rate": [0.01, 0.1, 0.2]
}

xgb_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=xgb_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_iter=10,
    random_state=42
)



In [53]:
xgb_search.fit(X, y)
xgb_rmse = np.sqrt(-xgb_search.best_score_)
print(" XGBoost Best RMSE:", xgb_rmse)
print("  XGBoost Best Estimator:\n", xgb_search.best_estimator_)


# GBM Pipeline and Tuning
gbm_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("gbm_model", GradientBoostingRegressor())
])

gbm_param_grid = {
    "gbm_model__subsample": [0.5, 0.7, 1.0],
    "gbm_model__max_depth": [3, 5, 7],
    "gbm_model__n_estimators": [100, 200],
    "gbm_model__learning_rate": [0.01, 0.1, 0.2],
    "gbm_model__min_samples_split": [2, 5, 10]
}

gbm_search = RandomizedSearchCV(
    estimator=gbm_pipeline,
    param_distributions=gbm_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_iter=10,
    random_state=42
)


 XGBoost Best RMSE: 4.3590699644111135
  XGBoost Best Estimator:
 Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb_model',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=1.0, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, feature_weights=None,
                              gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.2,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=3, max_leaves=None,
                              min_child_w

In [54]:
gbm_search.fit(X, y)
gbm_rmse = np.sqrt(-gbm_search.best_score_)
print("GBM Best RMSE:", gbm_rmse)
print("GBM Best Estimator:\n", gbm_search.best_estimator_)


GBM Best RMSE: 4.070759769939876
GBM Best Estimator:
 Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('gbm_model', GradientBoostingRegressor(subsample=0.7))])


# Termino de informe

In [None]:
# finished practice