In [1]:
# for structured data manupulation, data exploration 
import pandas as pd

# for numerical computin in python
import numpy as np

# for data visulisation and graph ploting 
import matplotlib.pyplot as plt

# for  scatter plots, heatmaps, distribution plots
import seaborn as sns

# for spliting data in train and test 
from sklearn.model_selection import train_test_split

# for ploting the data in decision tree import decision tree classifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# for data preprocessing 
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

## Data Exploration
#### - Analyze the dataset's structure and characteristics.
#### - Explore features' distributions and relationships with the target variable.
#### - Gain insights into potential feature engineering opportunities.

In [2]:
# Load the Titanic_Train dataset
df= pd.read_csv('Assignmet _Data/Irish,,/Iris.csv', index_col=0)
df.head(8)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa
6,5.4,3.9,1.7,0.4,Iris-setosa
7,4.6,3.4,1.4,0.3,Iris-setosa
8,5.0,3.4,1.5,0.2,Iris-setosa


In [3]:
df.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [4]:
df.shape

(150, 5)

In [5]:
df.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [6]:
df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [7]:
# use label encoder to convert species into numbers

label_encoder = preprocessing.LabelEncoder()
df['Species'] = label_encoder.fit_transform(df['Species'])

In [8]:
df['Species'].value_counts()

Species
0    50
1    50
2    50
Name: count, dtype: int64

In [9]:
df.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0


In [10]:
x = df.iloc[:,0:4]
y = df.iloc[:,4]

In [11]:
x

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.1,3.5,1.4,0.2
2,4.9,3.0,1.4,0.2
3,4.7,3.2,1.3,0.2
4,4.6,3.1,1.5,0.2
5,5.0,3.6,1.4,0.2
...,...,...,...,...
146,6.7,3.0,5.2,2.3
147,6.3,2.5,5.0,1.9
148,6.5,3.0,5.2,2.0
149,6.2,3.4,5.4,2.3


In [12]:
y

Id
1      0
2      0
3      0
4      0
5      0
      ..
146    2
147    2
148    2
149    2
150    2
Name: Species, Length: 150, dtype: int64

##  Model Building and Hyper-Parameter Tuning
- Split the dataset into training and testing sets.
- Initialize an XGBoost classifier model.
- Tune hyper-parameters using techniques like grid search or random search.
- Utilize cross-validation for robust parameter selection.

In [13]:
# Splitting the dataset into train and test 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=44)

In [14]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [15]:
# Define the XGBoost classifier
xgb_model = XGBClassifier()
xgb_model

#### Let’s look at the overview of the most frequently tuned hyperparameters:

1) learning_rate: also called eta, it specifies how quickly the model fits the residual errors by using additional base learners.(typical values: 0.01–0.2 )

2) gamma, reg_alpha, reg_lambda: these 3 parameters specify the values for 3 types of regularization done by XGBoost - minimum loss reduction to create a new split, L1 reg on leaf weights, L2 reg leaf weights respectively

3)  typical values for gamma: 0 - 0.5 but highly dependent on the data typical values for reg_alpha and reg_lambda: 0 - 1 is a good starting point but again, depends on the data

4) max_depth - how deep the tree's decision nodes can go. Must be a positive integer, typical values: 1–10

5) subsample - fraction of the training set that can be used to train each tree. If this value is low, it may lead to underfitting or if it is too high, it may lead to overfitting , typical values: 0.5–0.9

6) colsample_bytree- fraction of the features that can be used to train each tree. A large value means almost all features can be used to build the decision tree , typical values: 0.5–0.9

#### The above are the main hyperparameters people often tune.

In [16]:
# Define the hyperparameter grid to search
param = {
    'learning_rate': [0.1 , 0.2],
    'n_estimators': [50, 100],
    'max_depth': [5, 10],  # Adjusted based on typical values
    'subsample': [0.5, 0.9],  # Adjusted based on typical values
    'colsample_bytree': [0.5, 0.6],  # Adjusted based on typical values
    'gamma': [0.2, 0.3 ],  # Adjusted based on typical values
    'reg_alpha': [0, 0.5],  # Adjusted based on typical values
    'reg_lambda': [0.5 , 1 ],  # Adjusted based on typical values
}

### Use GridSearchCV to find the best hyperparameters

In [17]:

grid_search = GridSearchCV(
    xgb_model,
    param_grid=param,
    scoring='accuracy',  # Use an appropriate scoring metric for your problem
    cv=5,  # Number of cross-validation folds
    verbose=2,
    n_jobs=-1,  # Use -1 to use all available CPU cores
)

In [18]:
# Fit the grid search to the data
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=50, reg_alpha=0, reg_lambda=0.5, subsample=0.5; total time=   0.1s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=50, reg_alpha=0, reg_lambda=0.5, subsample=0.5; total time=   0.1s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=50, reg_alpha=0, reg_lambda=0.5, subsample=0.5; total time=   0.1s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=50, reg_alpha=0, reg_lambda=0.5, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=50, reg_alpha=0, reg_lambda=0.5, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_depth=5, n_estimators=50, reg_alpha=0, reg_lambda=0.5, subsample=0.9; total time=   0.1s
[CV] END col

### Model Training and Evaluation
- Train the model using the training set with the tuned hyper-parameters.
- Evaluate the model's performance using metrics such as accuracy, precision, recall, F1-score, and ROC-AUC.
- Visualize evaluation metrics and ROC curve.

In [19]:
# Get the best hyperparameters
best_params1 = grid_search.best_params_
print("Best Hyperparameters:", best_params1)

Best Hyperparameters: {'colsample_bytree': 0.5, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 0.9}


In [20]:
# Evaluate the model with the best hyperparameters on the test set
best_model1 = grid_search.best_estimator_

In [21]:
y_pred1 = best_model1.predict(x_test)

In [22]:
Gcv_accuracy = accuracy_score(y_test, y_pred1)*100
print("Test Accuracy:", Gcv_accuracy)

Test Accuracy: 93.33333333333333


In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt

# Evaluate the model's performance
precision = precision_score(y_test, y_pred1, average='weighted')  # Adjust 'average' parameter accordingly
print("Precision:", precision)

Precision: 0.9366666666666668


In [24]:
recall = recall_score(y_test, y_pred1, average='weighted')  # Adjust 'average' parameter accordingly
print("Recall:", recall)

Recall: 0.9333333333333333


In [25]:
f1 = f1_score(y_test, y_pred1, average='weighted')  # Adjust 'average' parameter accordingly
print("F1 Score:", f1)

F1 Score: 0.9334858886346301


In [31]:
y_proba = best_model1.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')  # Use 'ovr' for multiclass
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9867724867724869


In [36]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred1)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 9  0  0]
 [ 1  8  0]
 [ 0  1 11]]
