# Assignment | 14th April 2023

Build a random forest classifier to predict the risk of heart disease based on a dataset of patient
information. The dataset contains 303 instances with 14 features, including age, sex, chest pain type,
resting blood pressure, serum cholesterol, and maximum heart rate achieved.

Dataset link: https://drive.google.com/file/d/1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ/view?usp=share_link

Q1. Preprocess the dataset by handling missing values, encoding categorical variables, and scaling the
numerical features if necessary.

Ans.


In [None]:
# Load the heart-disease dataset into a DataFrame:

import pandas as pd

# Direct-download form of the shared Google Drive link (same file id),
# so that pandas can fetch the CSV straight from the URL.
data_url = "https://drive.google.com/uc?export=download&id=1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ"
df = pd.read_csv(filepath_or_buffer=data_url)

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Handling/Checking missing values:

# Report how many values are missing in each column.
print(df.isnull().sum())

# Impute missing values with the mean of their column.
# `numeric_only=True` restricts the means to numeric columns: without it,
# pandas >= 2.0 raises a TypeError as soon as the frame holds a non-numeric
# column. Non-numeric columns get no mean and are therefore left untouched.
df.fillna(df.mean(numeric_only=True), inplace=True)


In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value; all other columns pass through.
df_encoded = pd.get_dummies(
    data=df,
    columns=['chest_pain_type', 'sex'],
)


In [None]:
# Scaling numerical features:

from sklearn.preprocessing import StandardScaler

# Continuous columns that get standardized; every other column is left as is.
numerical_features = ['age', 'resting_blood_pressure', 'serum_cholesterol', 'maximum_heart_rate']

# Learn each column's mean and standard deviation, then overwrite the columns
# with their standardized (zero-mean, unit-variance) values.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in this notebook, so test-set statistics influence the
# transform — confirm whether that is acceptable for this assignment.
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_features])
df_encoded[numerical_features] = scaler.transform(df_encoded[numerical_features])


Q2. Split the dataset into a training set (70%) and a test set (30%).

Ans.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors.
# 'target' is assumed to be the name of the label column.
y = df_encoded['target']
X = df_encoded.drop(columns=['target'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


Q3. Train a random forest classifier on the training set using 100 trees and a maximum depth of 10 for each
tree. Use the default values for other hyperparameters.

Ans.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest configuration requested in Q3: 100 trees, each limited to depth 10.
# Every other hyperparameter keeps its library default; the seed is fixed so
# that repeated runs build the same forest.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

# Fit the forest on the training portion only.
rf_classifier.fit(X=X_train, y=y_train)


Q4. Evaluate the performance of the model on the test set using accuracy, precision, recall, and F1 score.

Ans.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict labels for the held-out test rows.
y_pred = rf_classifier.predict(X_test)

# Score the predictions with each metric, in the order they are reported.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print one "<label> <value>" line per metric.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)


Q5. Use the feature importance scores to identify the top 5 most important features in predicting heart
disease risk. Visualise the feature importances using a bar chart.

Ans.

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # needed for np.argsort below; no earlier visible cell imports it

# Importance score of every feature, as learned by the fitted forest.
importances = rf_classifier.feature_importances_

# Column labels, in the same order as the importance scores.
feature_names = X_train.columns

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Keep the five highest-ranked features (fewer if the data has fewer columns,
# so the bar positions and the tick labels always have matching lengths).
n_top = min(5, len(indices))
top_features = feature_names[indices][:n_top]
top_importances = importances[indices][:n_top]

# Bar chart of the selected features, most important first.
plt.figure(figsize=(10, 6))
plt.bar(range(n_top), top_importances, align='center')
plt.xticks(range(n_top), top_features, rotation=45)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title(f'Top {n_top} Most Important Features')
plt.tight_layout()  # keep the rotated tick labels inside the figure
plt.show()


Q6. Tune the hyperparameters of the random forest classifier using grid search or random search. Try
different values of the number of trees, maximum depth, minimum samples split, and minimum samples
leaf. Use 5-fold cross-validation to evaluate the performance of each set of hyperparameters.

Ans.



In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the four hyperparameters named in Q6
# (3 x 3 x 3 x 3 = 81 combinations, each scored with 5-fold CV).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}

# Template estimator for the search. It gets its own name on purpose:
# GridSearchCV fits clones of the estimator it is given, so the template itself
# is never fitted. Rebinding `rf_classifier` to it would leave that name
# pointing at an unfitted forest, and any later `rf_classifier.predict(...)`
# would raise NotFittedError.
rf_search_base = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(rf_search_base, param_grid, cv=5, scoring='accuracy')

# Run the search on the training data only.
grid_search.fit(X_train, y_train)

# Hyperparameter combination with the best mean cross-validated accuracy.
best_params = grid_search.best_params_

# Forest refitted on the whole training set with those hyperparameters.
best_rf_classifier = grid_search.best_estimator_

# Evaluate the best model on the held-out test set.
y_pred = best_rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy:", accuracy)


Q7. Report the best set of hyperparameters found by the search and the corresponding performance
metrics. Compare the performance of the tuned model with the default model.

Ans.



In [None]:
# Report the best hyperparameters
print("Best Hyperparameters:", best_params)

# Score the tuned model on the held-out test set.
y_pred_tuned = best_rf_classifier.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)
f1_tuned = f1_score(y_test, y_pred_tuned)

# Baseline for the comparison: the Q3 configuration (100 trees, max depth 10),
# fitted here on the training set. Fitting it explicitly keeps this cell
# independent of whatever `rf_classifier` currently refers to: predict() on an
# estimator that has not been fitted raises NotFittedError, and an estimator
# that was only handed to GridSearchCV stays unfitted (the search fits clones).
default_rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
default_rf_classifier.fit(X_train, y_train)

# Score the baseline model on the same test set.
y_pred_default = default_rf_classifier.predict(X_test)
accuracy_default = accuracy_score(y_test, y_pred_default)
precision_default = precision_score(y_test, y_pred_default)
recall_default = recall_score(y_test, y_pred_default)
f1_default = f1_score(y_test, y_pred_default)

# Print the performance metrics of both models for side-by-side comparison.
print("Performance Metrics - Tuned Model:")
print("Accuracy:", accuracy_tuned)
print("Precision:", precision_tuned)
print("Recall:", recall_tuned)
print("F1 Score:", f1_tuned)

print("\nPerformance Metrics - Default Model:")
print("Accuracy:", accuracy_default)
print("Precision:", precision_default)
print("Recall:", recall_default)
print("F1 Score:", f1_default)


Q8. Interpret the model by analysing the decision boundaries of the random forest classifier. Plot the
decision boundaries on a scatter plot of two of the most important features. Discuss the insights and
limitations of the model for predicting heart disease risk.

Ans.

In [None]:
from sklearn.decomposition import PCA

import numpy as np  # local import: this cell needs NumPy for the index lookup and the grid


def _grid_axis(values, n_points=200, margin=0.05):
    """Return evenly spaced points covering *values*, padded by a fraction of their range."""
    low, high = values.min(), values.max()
    # A constant column has zero range; fall back to a fixed padding so the
    # axis still has a non-empty extent.
    pad = (high - low) * margin or 0.5
    return np.linspace(low - pad, high + pad, n_points)


# Positions of the two most important features within the training columns.
feature1_idx = np.where(feature_names == top_features[0])[0][0]
feature2_idx = np.where(feature_names == top_features[1])[0][0]

# Take those two columns straight from the training data. (A 2-component PCA
# projection only has columns 0 and 1, which are mixtures of all features, so
# it cannot be indexed with positions from the original feature list.)
X_train_2d = X_train.iloc[:, [feature1_idx, feature2_idx]].to_numpy(dtype=float)
feature1_values = X_train_2d[:, 0]
feature2_values = X_train_2d[:, 1]

# A forest trained on all features cannot be evaluated on a 2-D grid, so fit a
# forest with the Q3 settings on just these two features. Limitation: the
# regions drawn below belong to this 2-feature model, not to the full model.
boundary_rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
boundary_rf.fit(X_train_2d, y_train)

# Predict the class at every point of a dense grid spanning both features.
xx, yy = np.meshgrid(_grid_axis(feature1_values), _grid_axis(feature2_values))
grid_pred = boundary_rf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Plot the decision boundaries: shaded regions show the predicted class,
# points show the training samples coloured by their true label.
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, grid_pred, cmap='coolwarm', alpha=0.3)
points = plt.scatter(feature1_values, feature2_values, c=y_train, cmap='coolwarm', alpha=0.8)
plt.xlabel(top_features[0])
plt.ylabel(top_features[1])
plt.title('Decision Boundaries of Random Forest Classifier')
plt.colorbar(points, label='Heart Disease')
plt.show()
