### Assignment-13 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [None]:
df1 = pd.read_excel('C:\\Users\\kavya\\OneDrive\\Attachments\\Desktop\\Assignments\\heart_disease.xlsx',sheet_name=0)


In [None]:
df1

In [None]:
df = pd.read_excel('C:\\Users\\kavya\\OneDrive\\Attachments\\Desktop\\Assignments\\heart_disease.xlsx',sheet_name=1)

In [None]:
# Preview the first rows of the loaded sheet.
df.head()

In [None]:
# Summary of the frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

## Exploratory Data Analysis

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows that contain at least one missing value in any column.
has_missing = df.isna().any(axis="columns")
null_values = df.loc[has_missing]
null_values

In [None]:
# Share of rows whose 'oldpeak' value is missing, expressed in percent.
missing_oldpeak_count = df['oldpeak'].isna().sum()
null_percentage = missing_oldpeak_count / len(df) * 100
print("Percentage of null values in 'oldpeak':", null_percentage)

In [None]:
# Descriptive statistics computed only over the observed 'oldpeak' values.
oldpeak_present = df['oldpeak'].notna()
non_null_values = df.loc[oldpeak_present, 'oldpeak']
summary_stats = non_null_values.describe()
print("Summary statistics of non-null values in 'oldpeak':", summary_stats, sep="\n")
     

In [None]:
# Box plot of 'oldpeak' to inspect its spread and any extreme values
# before choosing how to fill the missing entries.
sns.boxplot(x = df['oldpeak'])
plt.show()

In [None]:
# Fill missing 'oldpeak' values with the column median.
median_old_peak = df['oldpeak'].median()
# Assign the result back to the column instead of calling fillna(inplace=True)
# on the `df['oldpeak']` selection: the in-place form operates on an
# intermediate object and is not guaranteed to update `df` (it is a no-op
# under pandas copy-on-write).
df['oldpeak'] = df['oldpeak'].fillna(median_old_peak)
     

In [None]:
# Total number of missing values across the whole frame.
df.isnull().sum().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove fully duplicated rows; `df` is rebound to the de-duplicated frame.
df = df.drop_duplicates()

In [None]:
# Re-count duplicated rows to confirm the removal.
df.duplicated().sum()

In [None]:
# List the distinct values of every object-dtype column.
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    unique_values = df[column].unique()
    print("\nUnique Values in", column, ":", unique_values, sep=" ")

In [None]:
# Descriptive statistics, transposed so each described column is one row.
df.describe().T

In [None]:
# Box plots of the frame's columns drawn on a single set of axes.
df.plot.box()
plt.show()

In [None]:
def replace_outliers(series):
    """Return a copy of ``series`` with IQR outliers capped at the nearest inlier.

    A value is an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Low outliers are replaced by the
    smallest observed value inside that range, high outliers by the largest.

    Parameters
    ----------
    series : pandas.Series
        Column to process. It is never modified; the previous implementation
        assigned into it, which could also alter the DataFrame it came from.

    Returns
    -------
    pandas.Series
        A new, capped Series for numeric input. Non-numeric input is returned
        as-is (same object).
    """
    if not np.issubdtype(series.dtype, np.number):
        return series

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    inliers = series[(series >= lower_bound) & (series <= upper_bound)]
    if inliers.empty:
        # Empty or all-NaN column: there is no inlier to cap to.
        return series.copy()

    # clip() builds a new Series, so the caller's data stays untouched.
    return series.clip(lower=inliers.min(), upper=inliers.max())

# Cap outliers column by column. 'num' is the prediction target (it is split
# off as `y` later in this notebook), so it is passed through unchanged:
# capping class labels could merge distinct classes.
df_cleaned = df.apply(
    lambda column: column if column.name == 'num' else replace_outliers(column),
    axis=0,
)
     

import plotly.graph_objects as go

# One box trace per column: first every column of the original frame,
# then every column of the cleaned frame.
fig = go.Figure()
for frame, suffix in ((df, ' (Before)'), (df_cleaned, ' (After)')):
    for column in frame.columns:
        fig.add_trace(go.Box(y=frame[column], name=column + suffix))

fig.update_layout(
    title="Box Plot of Dataset Before and After Replacing Outliers",
    yaxis_title="Values",
)
fig.show()
     

In [None]:
# One histogram trace per column of the cleaned frame, overlaid in one figure.
histogram_traces = [
    go.Histogram(x=df_cleaned[column], name=column, marker_color='yellowgreen')
    for column in df_cleaned.columns
]
fig = go.Figure(data=histogram_traces)

fig.update_layout(
    title='Histograms of Features',
    xaxis_title='Values',
    yaxis_title='Frequency',
    barmode='overlay',
    bargap=0.1,
)

fig.show()

In [None]:
# `df2` is a second name for `df_cleaned` (plain assignment, no copy is made).
df2 = df_cleaned
df2.info()

In [None]:
# Perform one-hot encoding for categorical columns
df_encoded = pd.get_dummies(df2, columns=['sex', 'cp', 'restecg', 'exang', 'slope', 'thal'])
df_encoded = df_encoded.astype(int)

# Calculate the correlation matrix
correlation_matrix = df_encoded.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Features', fontsize=16)
plt.show()

In [None]:
# Remove pandas' limit on displayed columns so every encoded column is shown.
pd.set_option('display.max_columns', None)
df_encoded.head()

In [None]:
# `df3` is a second name for `df_encoded` (plain assignment, no copy is made).
df3 = df_encoded
df3.info()

In [None]:
from sklearn.model_selection import train_test_split

# 'num' is the label to predict; every other column is a feature.
target_column = 'num'
y = df3[target_column]
X = df3.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
     

In [None]:
# Report the feature-matrix and label shapes of each partition.
for heading, features, labels in (
    ("Training set shape:", X_train, y_train),
    ("Testing set shape:", X_test, y_test),
):
    print(heading, features.shape, labels.shape)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline: a decision tree with default hyperparameters (seeded for
# reproducibility), fitted on the training partition. fit() returns the
# estimator itself, so construction and fitting are chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score the predictions.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Precision, recall and F1 are averaged over classes, weighted by class support.
weighted = {'average': 'weighted'}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

# ROC AUC needs per-class probabilities; one-vs-rest scores are macro-averaged.
class_probabilities = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, class_probabilities, average='macro', multi_class='ovr')

for heading, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC AUC Score:", roc_auc),
):
    print(heading, score)

In [None]:
import plotly.graph_objects as go

# Metric names and their scores, in matching order.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score']
values = [accuracy, precision, recall, f1, roc_auc]
bar_colors = ['blue', 'green', 'orange', 'purple', 'red']

# One bar per metric; the y axis is pinned to [0, 1].
score_bars = go.Bar(x=metrics, y=values, marker_color=bar_colors)
fig = go.Figure(data=[score_bars])
fig.update_layout(
    title='Evaluation Metrics',
    xaxis_title='Metrics',
    yaxis_title='Score',
    yaxis={'range': [0, 1]},
    showlegend=False,
)
fig.show()

In [None]:
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt

# One-vs-rest ROC curve for a single class: with pos_label set, roc_curve
# treats that label as positive and every other label as negative.
positive_class = 1

# predict_proba columns are ordered like clf.classes_, so look the column up
# instead of assuming that class 1 sits at index 1.
positive_column = list(clf.classes_).index(positive_class)
positive_scores = clf.predict_proba(X_test)[:, positive_column]

fpr, tpr, thresholds = roc_curve(y_test, positive_scores, pos_label=positive_class)

# Area under *this* curve. The legend previously showed `roc_auc`, the
# macro-averaged one-vs-rest score, which describes a different quantity.
class_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % class_auc)
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
     

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values explored for each hyperparameter.
param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# NOTE: this rebinds `clf` to a fresh estimator; the baseline tree fitted
# earlier is no longer reachable through that name after this cell runs.
clf = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Score the winning estimator on the held-out test partition.
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Testing Set:", accuracy)

In [None]:
pip install graphviz pydotplus


In [None]:
from IPython.display import Image, display
import pydotplus
from sklearn.tree import export_graphviz

# Export the tuned tree to DOT format.
# class_names was previously an undefined variable (NameError); derive the
# labels from the fitted estimator. classes_ is in the ascending order that
# export_graphviz expects, and the names must be strings.
dot_data = export_graphviz(best_clf, out_file=None,
                           feature_names=list(X_train.columns),
                           class_names=[str(label) for label in best_clf.classes_],
                           filled=True, rounded=True,
                           special_characters=True)

# Parse the DOT source into a graph object.
graph = pydotplus.graph_from_dot_data(dot_data)

# Render the graph to PNG bytes and show it inline.
image = Image(graph.create_png())

display(image)


In [None]:
# Scores of the untuned model.
# NOTE(review): these are hard-coded literals, presumably copied from an
# earlier run; they will not follow changes to the data or the code above.
before_accuracy = 0.5604395604395604
before_precision = 0.5461145400228057
before_recall = 0.5604395604395604
before_f1_score = 0.552656965312052
before_roc_auc_score = 0.6267962426386677

# Result of the hyperparameter search (also hard-coded).
after_best_parameters = {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}
after_accuracy = 0.5769230769230769

baseline_report = (
    ("  Accuracy:", before_accuracy),
    ("  Precision:", before_precision),
    ("  Recall:", before_recall),
    ("  F1 Score:", before_f1_score),
    ("  ROC AUC Score:", before_roc_auc_score),
)
tuned_report = (
    ("  Best Parameters:", after_best_parameters),
    ("  Accuracy on Testing Set:", after_accuracy),
)

print("Performance Insights:")
print("------------------------------")
print("Before Hyperparameter Tuning:")
for heading, value in baseline_report:
    print(heading, value)
print("\nAfter Hyperparameter Tuning:")
for heading, value in tuned_report:
    print(heading, value)
