# 🌟 Exercise 1 : Exploratory Data Analysis
Instructions

- Load the data from CSV files
- Remove target column from the training data
- Split the data into train/test sets
- Understand the data

In [1]:
# import libraries for data manipulation
import pandas as pd
import numpy as np

# import libraries for machine learning
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the heart-disease data from the CSV file
data = pd.read_csv('dataset_heart.csv')

# Preview the first rows. Jupyter only renders the LAST expression of a cell,
# so a bare `data.head()` in the middle of the cell is silently discarded;
# display() makes the preview actually appear.
display(data.head())

# Separate the target column from the features
y = data['heart disease']
# Shift the labels down by one.
# NOTE(review): presumably the raw labels are 1/2 and this maps them to 0/1 — confirm against the CSV.
y = y - 1
X = data.drop(columns=['heart disease'])


# Split the data into train/test sets (30% held out for testing);
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Understand the data: summary statistics of the features in each split
display(X_train.describe())
display(X_test.describe())

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic results,max heart rate,exercise induced angina,oldpeak,ST segment,major vessels,thal
count,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0
mean,54.751323,0.693122,3.21164,131.809524,251.529101,0.137566,1.042328,149.31746,0.37037,1.155556,1.608466,0.698413,4.783069
std,9.132393,0.462423,0.966278,17.952233,52.55489,0.345359,0.999099,22.877443,0.484186,1.195707,0.614655,0.967035,1.948957
min,29.0,0.0,1.0,94.0,141.0,0.0,0.0,88.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,215.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,3.0
50%,56.0,1.0,4.0,130.0,244.0,0.0,2.0,154.0,0.0,1.0,2.0,0.0,3.0
75%,62.0,1.0,4.0,140.0,282.0,0.0,2.0,166.0,1.0,1.8,2.0,1.0,7.0
max,74.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic results,max heart rate,exercise induced angina,oldpeak,ST segment,major vessels,thal
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,53.691358,0.641975,3.08642,130.259259,245.296296,0.17284,0.975309,150.518519,0.234568,0.803704,1.530864,0.604938,4.493827
std,9.067307,0.482407,0.911009,17.711421,49.644094,0.380464,0.999691,23.94844,0.426369,0.981,0.614134,0.88993,1.917834
min,35.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,46.0,0.0,3.0,120.0,208.0,0.0,0.0,138.0,0.0,0.0,1.0,0.0,3.0
50%,54.0,1.0,3.0,130.0,249.0,0.0,0.0,153.0,0.0,0.5,1.0,0.0,3.0
75%,59.0,1.0,4.0,140.0,273.0,0.0,2.0,167.0,0.0,1.4,2.0,1.0,7.0
max,77.0,1.0,4.0,192.0,407.0,1.0,2.0,195.0,1.0,4.0,3.0,3.0,7.0


# 🌟 Exercise 2 : Logistic Regression without Grid Search
Instructions

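Use the Iris dataset to build a logistic regression model without using grid search. Split the data into training and testing sets, then train a logistic regression model and evaluate its performance on the test set.

*Note: the code in this notebook applies this exercise, and the ones that follow, to the heart-disease dataset loaded in Exercise 1 (`dataset_heart.csv`) rather than to Iris.*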

In [3]:
# Fit a plain logistic regression (no hyperparameter search) on the training split.
# max_iter is raised to 1500 so the lbfgs solver has room to converge.
log_reg = LogisticRegression(max_iter=1500, solver='lbfgs')
log_reg = log_reg.fit(X_train, y_train)

# Score the fitted model on the held-out test split
y_pred = log_reg.predict(X_test)
print(
    'Accuracy of logistic regression model: ',
    accuracy_score(y_test, y_pred),
)

Accuracy of logistic regression model:  0.8271604938271605


In [4]:
# 5-fold cross-validation of the logistic regression on the full X / y
cv_scores = cross_val_score(estimator=log_reg, X=X, y=y, cv=5)

# Report the per-fold scores and their average
print('Cross validation scores: ', cv_scores)
print('Mean cross validation score: ', cv_scores.mean())

Cross validation scores:  [0.81481481 0.83333333 0.87037037 0.83333333 0.88888889]
Mean cross validation score:  0.8481481481481481


# 🌟 Exercise 3 : Logistic Regression with Grid Search
Instructions

Build a logistic regression model using the Iris dataset, but this time, use GridSearchCV to optimize the hyperparameters such as C and penalty.

In [5]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter candidates for the logistic regression grid search
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    # 'liblinear' is suitable for small datasets and supports both L1 and L2 penalties
    solver=['liblinear'],
)

In [6]:
# Grid search over `param_grid` for a logistic regression, using 5-fold cross-validation
grid_search = GridSearchCV(LogisticRegression(max_iter=1500), param_grid, cv=5)

# Run the search on the standardized training split
grid_search.fit(X=X_train_scaled, y=y_train)

In [7]:
# Take the best estimator found by the search and score it on the standardized test split
best_log_reg = grid_search.best_estimator_
y_pred = best_log_reg.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the test accuracy followed by the winning hyperparameters
for label, value in (
    ('## Accuracy of logistic regression model after grid search: ', accuracy),
    ('## Best parameters: ', grid_search.best_params_),
):
    print(label, value)

## Accuracy of logistic regression model after grid search:  0.8271604938271605
## Best parameters:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


# 🌟 Exercise 4 : SVM without Grid Search
Instructions

Train a Support Vector Machine (SVM) classifier on the Iris dataset without using grid search. Choose an appropriate kernel and set the hyperparameters manually.

In [8]:
# Linear-kernel SVM with a manually chosen C, trained on the standardized features
svm_clf = SVC(C=1.0, kernel='linear')
svm_clf = svm_clf.fit(X_train_scaled, y_train)

# Test-set predictions and their accuracy
y_pred = svm_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage with two decimals
print(f'## Accuracy : {accuracy * 100:.2f}', '%')

## Accuracy : 83.95 %


# 🌟 Exercise 5 : SVM with Grid Search
Instructions

Implement an SVM classifier on the Iris dataset with GridSearchCV to find the best combination of C, kernel, and gamma hyperparameters.

In [9]:
# Hyperparameter candidates for the SVM grid search
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Grid search over `param_grid` for an SVC, using 5-fold cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

# Run the search on the standardized training split
grid_search.fit(X=X_train_scaled, y=y_train)

In [10]:
# Score the best SVM found by the search on the standardized test split
best_svm_clf = grid_search.best_estimator_
y_pred = best_svm_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy as a percentage with two decimals, then the winning hyperparameters
print(f'## Accuracy of SVM model after grid search: {accuracy * 100:.2f}', '%')
print('## Best parameters: ', grid_search.best_params_)

## Accuracy of SVM model after grid search: 82.72 %
## Best parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


# 🌟 Exercise 6 : XGBoost without Grid Search
Instructions

Use the Iris dataset to train an XGBoost classifier without hyperparameter tuning. Set the hyperparameters manually and justify your choices.

In [11]:
# Manually chosen XGBoost hyperparameters, with the rationale for each
xgb_params = {
    'learning_rate': 0.1,     # conservative learning rate
    'n_estimators': 100,      # a reasonable number of boosting rounds
    'max_depth': 3,           # shallow trees for simplicity and efficiency
    'subsample': 0.8,         # use 80% of the data to prevent overfitting
    'colsample_bytree': 0.8,  # use 80% of the features to prevent overfitting
    'random_state': 42,       # for reproducibility
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

# Train on the standardized training split
xgb_clf.fit(X_train_scaled, y_train)

In [12]:
# Predict on the standardized test split and measure the accuracy
y_pred = xgb_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy as a percentage with two decimals
print(f'## Accuracy of XGBoost model: {accuracy * 100:.2f}', '%')

## Accuracy of XGBoost model: 80.25 %


# 🌟 Exercise 7 : XGBoost with Grid Search
Instructions

Train an XGBoost classifier on the Iris dataset using GridSearchCV to optimize hyperparameters such as learning_rate, n_estimators, max_depth, etc.

In [13]:
# Hyperparameter candidates for the XGBoost grid search
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Base XGBoost classifier with a fixed seed
xgb_clf = xgb.XGBClassifier(random_state=42)

# Grid search with 5-fold cross-validation; n_jobs=-1 and verbose=1 are passed
# through to GridSearchCV to parallelize the fits and log progress.
grid_search = GridSearchCV(xgb_clf, param_grid, cv=5, n_jobs=-1, verbose=1)

In [14]:
# Run the XGBoost grid search on the standardized training split
grid_search.fit(X=X_train_scaled, y=y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [15]:
# Take the best estimator found by the XGBoost grid search
best_xgb_clf = grid_search.best_estimator_

# Evaluate it on the standardized test split
y_pred = best_xgb_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage with two decimals
print('## Accuracy of XGBoost model after grid search: {:.2f}'.format(accuracy * 100), '%')

# Also report the winning hyperparameters, matching the logistic-regression
# and SVM grid-search evaluation cells, so the reader can see what was selected.
print('## Best parameters: ', grid_search.best_params_)

## Accuracy of XGBoost model after grid search: 80.25 %
