In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [3]:

# Step 1: Load the dataset (make sure the CSV is in your working directory)
churn_data = pd.read_csv('Telco-Customer-Churn.csv')

# Step 2: Drop unnecessary column (customerID is not useful for prediction)
churn_data = churn_data.drop('customerID', axis=1)

# Step 3: Convert TotalCharges to numeric (it's stored as string; some values are blank)
churn_data['TotalCharges'] = pd.to_numeric(churn_data['TotalCharges'], errors='coerce')
# Fill NaN in TotalCharges (only 11 rows) with median
churn_data['TotalCharges'].fillna(churn_data['TotalCharges'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  churn_data['TotalCharges'].fillna(churn_data['TotalCharges'].median(), inplace=True)


In [4]:
# Step 4: Encode categorical variables
label_encoders = {}
for column in churn_data.columns:
    if churn_data[column].dtype == 'object':
        le = LabelEncoder()
        churn_data[column] = le.fit_transform(churn_data[column])
        label_encoders[column] = le

# Now all columns are numeric
X = churn_data.iloc[:, :-1]  # All features except last column
y = churn_data.iloc[:, -1]   # Last column is 'Churn' (0 = No, 1 = Yes)

# Step 5: Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Step 6: Train XGBoost classifier
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,  # Increased for better performance
    seed=123,
    use_label_encoder=False,  # Suppress warning
    eval_metric='logloss'
)

xg_cl.fit(X_train, y_train)

# Step 7: Predict and evaluate
preds = xg_cl.predict(X_test)
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
print("Accuracy: %f" % (accuracy))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.784244


In [5]:
# Import necessary modules
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the DecisionTreeClassifier with max_depth=4
dt_clf_4 = DecisionTreeClassifier(max_depth=4, random_state=123)

# Fit the classifier to the training data
dt_clf_4.fit(X_train, y_train)

# Predict labels for the test set
y_pred_4 = dt_clf_4.predict(X_test)

# Compute accuracy
accuracy = float(np.sum(y_pred_4 == y_test)) / y_test.shape[0]
print(f"Accuracy: {accuracy:.4f}")



Accuracy: 0.9649


In [6]:
# Import necessary libraries
import xgboost as xgb
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Create the DMatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary
params = {"objective": "binary:logistic", "max_depth": 3}

# Perform 3-fold cross-validation
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, 
                    metrics="error", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy
print(f"Accuracy: {((1 - cv_results['test-error-mean']).iloc[-1]):.4f}")

   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.055358         0.014118         0.098459        0.016569
1          0.026367         0.003758         0.061524        0.013876
2          0.012302         0.001236         0.061533        0.013930
3          0.011427         0.002497         0.052752        0.015618
4          0.008790         0.002494         0.052752        0.015618
Accuracy: 0.9472


In [7]:
# Import necessary libraries
import xgboost as xgb
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Create the DMatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary
params = {"objective": "binary:logistic", "max_depth": 3}

# Perform 3-fold cross-validation with AUC metric
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, 
                    metrics="auc", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Print the AUC
print(f"AUC: {(cv_results['test-auc-mean']).iloc[-1]:.4f}")




   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.989659       0.005199       0.959695      0.025255
1        0.995100       0.003751       0.972271      0.023940
2        0.997122       0.002032       0.973122      0.025047
3        0.997103       0.002030       0.982087      0.013069
4        0.997832       0.001851       0.982567      0.013554
AUC: 0.9826


In [8]:
# Import necessary libraries
import xgboost as xgb
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Generate synthetic regression dataset
X, y = make_regression(n_samples=1000, n_features=4, noise=0.1, random_state=123)

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters
params = {"objective": "reg:squarederror", "max_depth": 3, "eta": 0.1}

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=10)

# Predict on test set
y_pred = xgb_model.predict(dtest)

# Compute RMSE and MAE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 45.1356
MAE: 35.6638


In [9]:
# Import necessary libraries
import xgboost as xgb                 # XGBoost library for gradient boosting
import pandas as pd                   # For data manipulation
import numpy as np                    # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # For evaluation

In [10]:
# Load the dataset
boston_data = pd.read_csv("boston_housing.csv")

# Separate features (X) and target (y)
X = boston_data.iloc[:, :-1]  # All columns except the last
y = boston_data.iloc[:, -1]   # Last column as target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Initialize the XGBoost regressor
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)

# Train the model
xg_reg.fit(X_train, y_train)

# Make predictions on the test set
preds = xg_reg.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Print accuracy metrics
print("Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Model Performance Metrics:
Mean Absolute Error (MAE): 2.85
Mean Squared Error (MSE): 21.20
R² Score: 0.74


In [11]:

# Load the dataset
boston_data = pd.read_csv("boston_housing.csv")

# Separate features (X) and target (y)
X = boston_data.iloc[:, :-1]  # All columns except the last
y = boston_data.iloc[:, -1]   # Last column as target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

DM_train = xgb.DMatrix(data=X_train, label=y_train)

DM_test = xgb.DMatrix(data=X_test, label=y_test)  # Use y_test here instead of y_train

params= {"booster":"gblinear", "objective":"reg:squarederror"}
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=10)

preds =xg_reg.predict(DM_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

# Print accuracy metrics
print("Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Model Performance Metrics:
Mean Absolute Error (MAE): 4.19
Mean Squared Error (MSE): 37.06
R² Score: 0.55


In [12]:
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)

# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_reg.predict(X_test)

# Compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 4.604776


In [13]:
# Convert the training and testing sets into DMatrixes
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test) 

# Create the parameter dictionary
params = {"booster": "gblinear", "objective": "reg:squarederror"} 

# Train the model
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=5)

# Predict the labels of the test set
preds = xg_reg.predict(DM_test) #

# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))


RMSE: 6.454039


In [14]:
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:squarederror", "max_depth":4}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=4, num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)

# Print cv_results
print(cv_results)

# Extract and print final boosting round metric
print((cv_results["test-rmse-mean"]).tail(1))

   train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0         6.989180        0.032758        7.188751       0.188563
1         5.459640        0.032484        5.959252       0.216005
2         4.369413        0.028176        5.150507       0.258300
3         3.612622        0.043512        4.570613       0.344081
4         3.071021        0.044801        4.296591       0.447778
4    4.296591
Name: test-rmse-mean, dtype: float64


In [15]:
import xgboost as xgb
import pandas as pd

# Load the dataset
boston_data = pd.read_csv("boston_housing.csv")

# Separate features and target
X = boston_data.iloc[:, :-1]
y = boston_data.iloc[:, -1]

# Create DMatrix
boston_dmatrix = xgb.DMatrix(data=X, label=y)

# Set base parameters
params = {"objective": "reg:squarederror", "max_depth": 4}

# L1 regularization values to test
l1_params = [1, 10, 100]
rmses_l1 = []

# Loop through each alpha value
for reg in l1_params:
    params["alpha"] = reg  # Set L1 regularization
    cv_results = xgb.cv(
        dtrain=boston_dmatrix,
        params=params,
        nfold=4,
        num_boost_round=10,
        metrics="rmse",
        as_pandas=True,
        seed=123
    )
    # Extract final RMSE
    final_rmse = cv_results["test-rmse-mean"].tail(1).values[0]
    rmses_l1.append(final_rmse)

# Display results
print("Best RMSE as a function of alpha:")
print(pd.DataFrame(list(zip(l1_params, rmses_l1)), columns=["alpha", "rmse"]))


Best RMSE as a function of alpha:
   alpha      rmse
0      1  3.685441
1     10  3.761246
2    100  4.461392


In [None]:

from matplotlib import pyplot as plt
from matplotlib import image

# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:squarederror", "max_depth":2}

# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# Plot all trees in a loop
for i in range(10):
    xgb.plot_tree(xg_reg, num_trees=i)
    plt.title(f"Tree {i}")
    

In [None]:
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:squarederror", "max_depth":4}

# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# Plot the feature importances
xgb.plot_importance(xg_reg)
plt.show()

# Day 7