<a href="https://colab.research.google.com/github/SarathSabu/Python-Notebooks/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries

# For data manipulation
import pandas as pd
import numpy as np

# For model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection as ms
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
import pydotplus
%matplotlib inline

# For Google Colab integration
import os
from google.colab import drive



# Mount Google Drive at /content/drive; the dataset path read later lives under this mount point.
# This step only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')



#Read the Data and Check the Stats & Columns

In [None]:
# import data as dataframe
# The path points inside the mounted Drive; adjust it if the CSV is stored elsewhere.
file_path = '/content/drive/MyDrive/Infor648/Data/churn.csv'
df = pd.read_csv(file_path)

# calling head() method
# The last expression of the cell is displayed, giving a preview of the first rows.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dataset's columns
df.describe()

In [None]:
# List every column name available in the dataset
df.columns

# Remove Rows with a Missing Value

In [None]:
# Count the missing (NaN) values in each column
display(df.isna().sum()) ##check missing value

##Multiple Ways to Handle Missing Values in a Dataset

###1 Drop them directly

In [None]:
##drop them directly

# dropna() with default arguments removes every row that has at least one NaN in any column.
# NOTE(review): once this cell has run, df contains no missing values, so the
# fillna() calls and the second dropna() further down have nothing left to act on.
df = df.dropna() ##drop missing value

###2 Do a quick check

In [None]:
# Hand-written list of column names to inspect.
# Presumably these are the columns the isna() check reported as having missing
# values - verify against that output.
missing_cols = [
    "Offer", "Avg Monthly Long Distance Charges", "Multiple Lines",
    "Internet Type", "Avg Monthly GB Download", "Online Security",
    "Online Backup", "Device Protection Plan", "Premium Tech Support",
    "Streaming TV", "Streaming Movies", "Streaming Music", "Unlimited Data"
]

# Filter DataFrame to show only columns with missing values
df[missing_cols]

In [None]:
# Mean imputation: replace the missing values of each numeric column below
# with that column's own mean.
for col in ("Avg Monthly Long Distance Charges", "Avg Monthly GB Download"):
    df[col] = df[col].fillna(df[col].mean())


In [None]:
# Keep only the candidate predictors plus the target ("Customer Status")
selected_columns = [
    "Gender",
    "Age",
    "Married",
    "Number of Dependents",
    "Tenure in Months",
    "Monthly Charge",
    "Total Charges",
    "Total Refunds",
    "Total Revenue",
    "Internet Type",
    "Customer Status",
]
df = df[selected_columns]

In [None]:
# Missing-value count per column for the reduced set of columns
display(df.isna().sum()) ##recheck missing value again

In [None]:
# Remove any row that still has a NaN in one of the selected columns
df = df.dropna() ##drop missing value

In [None]:
# After the dropna() above every count shown here should be 0
display(df.isna().sum()) ##recheck missing value again

# Now let's check how many categorical/numeric variables we have

In [None]:
# Numeric Variables
# Keep every non-object column except the target variable ("Customer Status").
# The target is excluded with an exact comparison (!=): the previous
# `col not in "Customer Status"` was a substring test and would also have
# dropped any column whose name is a substring of it (e.g. "Status").
numeric_variables = [
    col
    for col in df.columns
    if df[col].dtype != "object" and col != "Customer Status"
]
numeric_variables

In [None]:
# Categorical Variables: columns stored with the object dtype ("O"), i.e. text labels
categorical_variables = [col for col in df.columns if df[col].dtype == "O" and col != "Customer Status"]  ###exclude our target: "Customer Status"
categorical_variables

In [None]:
# Inspect the raw values of the 'Married' column before encoding
df['Married']

In [None]:
# Inspect the raw values of the 'Gender' column before encoding
df['Gender']

In [None]:
# Inspect the raw values of the 'Internet Type' column before encoding
df['Internet Type']

## What is your recommendation?

###The company seeks to understand the factors that influence a consumer's decision to stay or leave their service and aims to predict customer churn.

###The company hopes to take proactive steps to retain customers before they decide to leave.

###Initially, they believe factors such as family status, monthly charges, and length of time with the service may play a role, and they are looking to explore these and other potential predictors.

#Select the variables we are interested in

In [None]:
# Modelling table: the candidate predictors plus the target ("Customer Status").
# .copy() makes df_sub an independent DataFrame, so the encoding cells can
# assign new column values without pandas raising SettingWithCopyWarning and
# without any risk of writing back into `df`.
df_sub = df[["Gender", "Age", "Married", "Number of Dependents", "Tenure in Months", "Monthly Charge", "Total Charges", "Total Refunds" ,"Total Revenue","Internet Type","Customer Status"]].copy()

#Encode our categorical data

#### Do not run the encoding cells more than once. If the printed mapping shows
#### {0: 0, 1: 1}
#### instead of {'Female': 0, 'Male': 1},
#### the column was already encoded: start again from the "Select the variables" step, i.e. re-run df_sub = df[["Gender", "Age", "Married", "Number of Dependents", "Tenure in Months", "Monthly Charge", "Total Charges", "Total Refunds" ,"Total Revenue","Internet Type","Customer Status"]]

In [None]:
##encode categorical data
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Learn the Gender categories and overwrite the column with their integer codes
gender_codes = label_encoder.fit_transform(df_sub['Gender'])
df_sub['Gender'] = gender_codes

# Original label -> integer code, taken from the fitted encoder
gender_labels = label_encoder.classes_
mapping = dict(zip(gender_labels, label_encoder.transform(gender_labels)))

#Print out what we encoded for gender
print("Gender Encoding:", mapping, sep="\n")

In [None]:
# Re-fit the shared encoder on Married and overwrite the column with integer codes
married_codes = label_encoder.fit_transform(df_sub['Married'])
df_sub['Married'] = married_codes

# Original label -> integer code, taken from the re-fitted encoder
married_labels = label_encoder.classes_
mapping = dict(zip(married_labels, label_encoder.transform(married_labels)))

#prints out what we encoded for married
print("Married Encoding:", mapping, sep="\n")

In [None]:
# Check the two label-encoded columns: both should now hold integer codes
df_sub[['Married','Gender']]

#One-hot encoding

## One-hot encoding transforms categorical variables into a set of binary columns (0s and 1s), one for each category.
## This ensures no ordinal relationship is imposed between the categories (which could be problematic in label encoding).

In [None]:
# Replace 'Internet Type' with one indicator column per category
# (new columns are named 'Internet Type_<category>')
df_sub = pd.get_dummies(df_sub, columns=['Internet Type'])

In [None]:
# Inspect the table after label encoding and one-hot encoding
df_sub

In [None]:
#df_sub.columns = df_sub.columns.str.replace(' ', '_')

# Now let's check our target variable

In [None]:
# Number of rows per target class, before any class is filtered out
display(df_sub['Customer Status'].value_counts())
##Our target variable is a categorical variable

In [None]:
# Raw target column, still holding the original text labels at this point
df_sub['Customer Status']

In [None]:
####We are only interested in why people stayed and churned
# we drop all the new customers
# .copy() turns the filtered result into an independent DataFrame, so the
# column assignment below does not raise SettingWithCopyWarning.
df_sub = df_sub[df_sub['Customer Status'] != 'Joined'].copy()

####Encode our target variable
# A dedicated encoder is kept for the target so that the original class labels
# can be recovered later (for plot labels) via inverse_transform / classes_.
# LabelEncoder numbers the classes in sorted order; the printed mapping below
# shows the exact assignment.
# NOTE(review): precision/recall/F1 computed later use the default positive
# label 1 - check in the mapping that code 1 is the class you want to report on.
target_label_encoder = LabelEncoder()
df_sub['Customer Status'] = target_label_encoder.fit_transform(df_sub['Customer Status'])


##display the stats after encoding
display(df_sub['Customer Status'].value_counts())
mapping = dict(zip(target_label_encoder.classes_, target_label_encoder.transform(target_label_encoder.classes_)))
print(mapping)

# Let's see how each feature is related to our target variable

In [None]:
# Pairwise correlations between all (now numeric) columns, target included
corr_matrix = df_sub.corr()

# Annotated heatmap; change figsize here to resize the figure
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(corr_matrix, cmap='Blues', annot=True, ax=ax)
plt.show()

# Define features (X) and target (y), then split the data into training/testing sets


In [None]:
# Target (y) is the encoded 'Customer Status'; features (X) are all remaining columns
y = df_sub['Customer Status']
X = df_sub.drop(columns='Customer Status')

# 70% training / 30% testing split; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Feature rows assigned to the training split
X_train

In [None]:
# Feature rows assigned to the testing split
X_test

In [None]:
# Encoded target values for the training split
y_train

In [None]:
# Encoded target values for the testing split
y_test

#Train our first decision tree model

In [None]:
# Hyperparameters of the first tree:
#   criterion='entropy'   - split quality is measured with entropy
#   max_depth=12          - the tree may grow at most 12 levels deep
#   min_samples_split=5   - a node needs at least 5 samples to be considered for splitting
#   random_state=1        - fixed seed so the fitted tree is reproducible

###train the model (fit returns the fitted estimator itself)
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=12,
    min_samples_split=5,
    random_state=1,
).fit(X_train, y_train)

###making predictions on test data
y_pred = dt_clf.predict(X_test)

In [None]:
#####run this step to mark the features to the plot we are going to generate
feature_names = list(X_train.columns)

# Recover the original (pre-encoding) target labels: map every integer code
# 0..n_classes-1 back through the target encoder
encoded_labels = np.arange(len(target_label_encoder.classes_))
class_names = target_label_encoder.inverse_transform(encoded_labels)

##print out the features we selected for predictions and our classification target
print("features:", feature_names)
print("Classes:", class_names)

In [None]:
###Text representation of our trained Decision Tree
from sklearn.tree import export_text
# Column names of the training features, used to label the split rules in the text output
feature_names = X_train.columns.tolist()
# export_text returns the tree's rules as an indented plain-text string
text_representation = export_text(dt_clf, feature_names=feature_names)
print(text_representation)

In [None]:
###Figure visualization of our trained Decision Tree
from sklearn.tree import export_graphviz
import graphviz
from IPython.display import display



# Generate the DOT data for the tree
# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
dot_data = export_graphviz(dt_clf,
                           out_file=None,
                           feature_names=feature_names,
                           class_names=class_names,
                           filled=True,
                           rounded=True,
                           special_characters=True)

# Create the Graphviz source object
decision_tree_graph = graphviz.Source(dot_data, format="png")
# render() writes the image to disk; the PNG file name read below is the
# render name plus the ".png" extension.
decision_tree_graph.render("decision_tree_graph")
# Display the decision tree within the notebook
display(Image(filename="decision_tree_graph.png"))




In [None]:
##Uncomment this only if you want to save your decision tree image
##You need to create a folder called Image under My Drive first
#image_folder_path = '/content/drive/My Drive/Image/'
#decision_tree_graph.render(image_folder_path + "decision_tree_graph")

# Now let's evaluate our first decision tree

In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# One (metric name, value) pair per row. Train accuracy is scored on the
# training split; every other metric compares the test labels with y_pred.
metric_rows = [
    ("Train Accuracy", dt_clf.score(X_train, y_train)),
    ("Test Accuracy", accuracy_score(y_test, y_pred)),
    ("Recall", recall_score(y_test, y_pred)),
    ("Precision", precision_score(y_test, y_pred)),
    ("F1 Score", f1_score(y_test, y_pred)),
]

evaluation_metrics = pd.DataFrame(metric_rows, columns=["Evaluation Metric", "Value"])

evaluation_metrics

In [None]:
# Confusion matrix of the first tree on the test split (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap; change figsize to adjust the figure size
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('True Class')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
##Generate the evaluation figure for all the classes
# NOTE(review): ValidationCurve is imported here but not used in this cell.
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.classifier import ClassificationReport

# Wrap the trained tree in a yellowbrick report visualizer, labelled with the original class names
visualizer = ClassificationReport(dt_clf, classes=class_names, support=False, title = "Decision Tree Classifier Evaluation")
visualizer.fit(X_train, y_train)
# The report values are computed on the test split
visualizer.score(X_test, y_test)

# Relabel the x-axis ticks before the figure is shown
visualizer.ax.set_xticklabels(['Precision', 'Recall', 'F1'])
visualizer.show()



#Feature importance for predicting target

In [None]:
# Feature importance measures how much each feature contributes to the tree's predictions:
#  - it reflects how often and how effectively a feature is used to split the data;
#  - features used near the root, or giving larger entropy reductions, score higher;
#  - scores are normalized, so they sum to 1 (100% when expressed as percentages);
#  - a higher score means the feature matters more for the target variable.

# Importances of the first trained tree, paired with their feature names
# and ordered from most to least important
feature_importances = dt_clf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; change figsize to adjust the figure size
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')

# Write each value next to its bar (.3f = rounded to 3 decimal places)
for index, value in enumerate(feature_importance_df['Importance']):
    ax.text(value, index, f'{value:.3f}', va='center')

ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
ax.invert_yaxis()  # most important feature at the top
plt.show()


#

#Cross-Validation Evaluation on the training set

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict


# Cross-Validation Results on Training Set
# cross_val_predict returns, for every training row, the prediction made by a
# model fitted on the other folds (out-of-fold predictions). It is computed
# once; the earlier duplicate call with identical arguments only repeated the
# same 5 model fits.
y_pred_cross = cross_val_predict(dt_clf, X_train, y_train, cv=5)

accuracy_cv = accuracy_score(y_train, y_pred_cross)
recall_cv = recall_score(y_train, y_pred_cross)
precision_cv = precision_score(y_train, y_pred_cross)
f1_cv = f1_score(y_train, y_pred_cross)
matrix_cv = confusion_matrix(y_train, y_pred_cross)

# Create DataFrame for evaluation metrics with cross-validation
# The row layout mirrors `evaluation_metrics` so the two tables can be compared line by line.
evaluation_metrics_with_cv = pd.DataFrame({
    "Evaluation Metric_CV": ["Train Accuracy", "Test Accuracy", "Recall", "Precision", "F1 Score"],
    "Value": [
        accuracy_cv,
        accuracy_cv,  # Cross-validation doesn't separate train/test, so the same accuracy is used
        recall_cv,
        precision_cv,
        f1_cv
    ]
})

print("Performance Metrics With Cross-Validation:")
display(evaluation_metrics_with_cv)
print('\nConfusion Matrix CV:','\n', matrix_cv)


print("\nPerformance Metrics Without Cross-Validation:")
display(evaluation_metrics)
print('\nConfusion Matrix:','\n', conf_matrix)


## Alternatively, use cross_val_score to get the average performance metrics across folds/different data splits
# scores_acc = cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='accuracy')
#scores_rc = cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='recall')
#scores_prc = cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='precision')
#scores_f1 = cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='f1_macro')

#print('\nCross-Validation Scores on Training Set:')
#print('Average Accuracy: ', scores_acc.mean())
#print('Average Recall: ', scores_rc.mean())
#print('Average Precision: ', scores_prc.mean())
#print('Average F1-score: ', scores_f1.mean())


#Hyperparameter Fine-tuning (Pruning)

In [None]:
# Candidate max_depth values to evaluate (adjust the range as needed)
max_depth_range = range(3, 10)

# Accuracy per depth: on the training set itself, and averaged over 5 cross-validation folds
train_accuracies, cv_test_accuracies = [], []

for depth in max_depth_range:
    dt_depth = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=1)

    # Fit on the full training set and measure accuracy on that same data
    dt_depth.fit(X_train, y_train)
    train_accuracies.append(accuracy_score(y_train, dt_depth.predict(X_train)))

    # Mean accuracy over the held-out folds of the training set
    fold_scores = cross_val_score(dt_depth, X_train, y_train, cv=5, scoring='accuracy')
    cv_test_accuracies.append(fold_scores.mean())

# Plot both curves against depth; change figsize to adjust the figure size
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(max_depth_range, train_accuracies, label='Train Accuracy', marker='o', color='blue')
ax.plot(max_depth_range, cv_test_accuracies, label='Mean Cross-Validated Test Accuracy', marker='o', color='green')

ax.set_xlabel('Max Depth of Decision Tree')
ax.set_ylabel('Accuracy')
ax.set_title('Performance of Decision Tree with Varying Max Depth')
ax.legend(loc='upper left')
ax.grid(True)
plt.show()



In [None]:
from sklearn.model_selection import GridSearchCV

# Search with the same split criterion (entropy) that every other tree in this
# notebook uses, including the final tuned model. Without it the estimator
# falls back to its default criterion, and the scores would describe a
# different kind of tree than the one trained afterwards.
hyperparameter_tuning_DT = DecisionTreeClassifier(criterion='entropy', random_state=1)
parameters = {'max_depth': [4,5,8],
              'min_samples_split': [5, 10, 20],
              'splitter': ['best'],}     # 'best': Selects the best split based on the highest information gain

# Every parameter combination is scored with 10-fold cross-validation on the training set
grid_dt = GridSearchCV(hyperparameter_tuning_DT, param_grid = parameters, cv = 10 )

grid_dt.fit(X_train, y_train)

# One row per parameter combination with its mean cross-validated score, best first
result = pd.DataFrame(grid_dt.cv_results_['params'])
result['mean_CV_test_score'] = grid_dt.cv_results_['mean_test_score']
#result['std_test_score'] = grid_dt.cv_results_['std_test_score']
result.sort_values(by='mean_CV_test_score', ascending=False)

# Now let's fine-tune our decision tree model

In [None]:
# Tuned tree: same settings as the first model but with a shallower max_depth.
# NOTE(review): max_depth / min_samples_split are typed in by hand - keep them
# in sync with the best row of the grid-search table.
dt_clf_tune = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split = 5, splitter= 'best', random_state=1)
dt_clf_tune = dt_clf_tune.fit(X_train, y_train)
y_pred_tune = dt_clf_tune.predict(X_test)


# Generate the DOT data for the tuned tree
# (out_file=None returns the DOT source as a string instead of writing a file)
dot_data_tune = export_graphviz(dt_clf_tune,
                           out_file=None,
                           feature_names=feature_names,
                           class_names=class_names,
                           filled=True,
                           rounded=True,
                           special_characters=True)

# Create the Graphviz source object and render it to a PNG image.
# The object is built once, directly with format="png"; the earlier extra
# Source(...) without a format was overwritten before it was ever used.
decision_tree_graph_tune = graphviz.Source(dot_data_tune, format="png")
decision_tree_graph_tune.render("decision_tree_graph_tune")

# Display the image within the notebook
Image(filename="decision_tree_graph_tune.png")


In [None]:
##Uncomment this only if you want to save your decision tree image
##You need to create a folder called Image under My Drive first
#image_folder_path = '/content/drive/My Drive/Image/'
#decision_tree_graph_tune.render(image_folder_path + "decision_tree_graph_tune")

# Compare Performance: Original vs Tuned Decision Tree

In [None]:
def _test_split_metrics(predictions):
    """Return (accuracy, precision, recall, f1) of `predictions` against y_test."""
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
        recall_score(y_test, predictions),
        f1_score(y_test, predictions),
    )


# Original Decision Tree Performance Metrics
train_accuracy_original = dt_clf.score(X_train, y_train)
(test_accuracy_original,
 precision_original,
 recall_original,
 f1_original) = _test_split_metrics(y_pred)
confusion_matrix_original = confusion_matrix(y_test, y_pred)

# Tuned Decision Tree Performance Metrics
train_accuracy_tuned = dt_clf_tune.score(X_train, y_train)
(test_accuracy_tuned,
 precision_tuned,
 recall_tuned,
 f1_tuned) = _test_split_metrics(y_pred_tune)
confusion_matrix_tuned = confusion_matrix(y_test, y_pred_tune)


# Comparison table: one row per metric, one column per model
comparison_df = pd.DataFrame({
    'Evaluation Metric': ['Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Original Decision Tree': [
        train_accuracy_original,
        test_accuracy_original,
        precision_original,
        recall_original,
        f1_original,
    ],
    'Tuned Decision Tree': [
        train_accuracy_tuned,
        test_accuracy_tuned,
        precision_tuned,
        recall_tuned,
        f1_tuned,
    ],
})

# Show floating point numbers with 4 decimal places
pd.set_option("display.precision", 4)

display(comparison_df)




In [None]:
# Compare Confusion Matrices

# Two panels side by side (change figsize here to resize the figure)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Both panels share the same palette
cmap = sns.color_palette("Blues")

# (matrix, panel title, y-axis label); only the left panel carries a y-axis label
panels = [
    (confusion_matrix_original, 'Original Decision Tree\nConfusion Matrix', 'True Class'),
    (confusion_matrix_tuned, 'Tuned Decision Tree\nConfusion Matrix', ''),
]

for panel_ax, (matrix, panel_title, y_label) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Predicted Class', fontsize=12)
    if y_label:
        panel_ax.set_ylabel(y_label, fontsize=12)
    else:
        panel_ax.set_ylabel('')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Let's check if the feature importance changes for the tuned model
# It should not change much because we only tuned the hyperparameters

# Importances of the TUNED tree
feature_importances_tune = dt_clf_tune.feature_importances_

# Build the table from the tuned model's importances. The previous version
# passed `feature_importances` (the original model's values), so this chart
# simply repeated the first model's plot.
feature_importance_tune_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances_tune
})

# Sort the DataFrame by importance
feature_importance_tune_df = feature_importance_tune_df.sort_values(by='Importance', ascending=False)

# Plotting the feature importances
plt.figure(figsize=(10, 6))###########change this number to adjust figure size
plt.barh(feature_importance_tune_df['Feature'], feature_importance_tune_df['Importance'], color='skyblue')

# Write each value next to its bar
for index, value in enumerate(feature_importance_tune_df['Importance']):
    plt.text(value, index, f'{value:.3f}', va='center')    #.3f means the value is rounded to 3 decimal places

plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # To have the most important feature at the top
plt.show()