# Decision Tree Analysis of Prakriti Dataset

## 1. Importing Libraries and Loading Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn import tree

# Signal that the import cell above finished without errors.
print("Libraries imported")

# Read the Prakriti dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='Prakriti_With_Features.csv')
print("dataframe of dataset created")

# Show the first five rows as a quick sanity check of the loaded columns.
print(df.head(n=5))

## 2. Cleaning the Dataset

In [None]:
# Keep only the target ('Dosha') and the trait columns used as predictors.
selected_columns = [
    'Dosha',
    'Body Size',
    'Body Weight',
    'Height',
    'Bone Structure',
    'Complexion',
    'General feel of skin',
    'Texture of Skin',
    'Hair Color',
    'Appearance of Hair',
    'Shape of face',
    'Eyes',
    'Eyelashes',
    'Blinking of Eyes',
    'Cheeks',
    'Nose',
    'Teeth and gums',
    'Lips',
    'Nails',
    'Appetite',
    'Liking tastes',
    'Metabolism Type',
    'Climate Preference',
    'Stress Levels',
    'Sleep Patterns',
    'Dietary Habits',
    'Physical Activity Level',
    'Water Intake',
    'Digestion Quality',
    'Skin Sensitivity',
]
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the raw data (avoids pandas' SettingWithCopyWarning).
df = df[selected_columns].copy()

# Count missing values per column before encoding.
print(df.isnull().sum())

# Replace every non-numeric column with integer codes.
# A fresh encoder is fitted per column and stored in `encoders`, so the codes
# can be mapped back to category names later (e.g. encoders['Dosha'].classes_).
# The test is "not numeric" rather than "== 'object'" so text columns are still
# encoded when pandas infers a dedicated string dtype instead of object.
# NOTE(review): integer codes impose an arbitrary order on nominal traits;
# confirm this is acceptable for the tree model.
encoders = {}
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# X: feature matrix (every column except the target); y: encoded 'Dosha' label.
X = df.drop('Dosha', axis=1)
y = df['Dosha']

print(df.head())

## 3. Splitting the Data into Training and Testing Sets

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.3,
)

## 4. Training the Decision Tree Model

In [None]:
# Fit a decision tree that splits on Gini impurity. No depth or pruning limits
# are passed, so the library defaults apply; the fixed seed keeps the fitted
# tree reproducible.
clf = DecisionTreeClassifier(criterion='gini', random_state=0)
clf.fit(X=X_train, y=y_train)

## 5. Evaluating the Model's Performance

In [None]:
# Predict on both partitions first, then report them in the same order as before.
predictions_test = clf.predict(X_test)
predictions_train = clf.predict(X_train)

# accuracy_score returns a fraction; it is scaled to a percentage for display.
print("Accuracy of testing dataset: ", accuracy_score(y_true=y_test, y_pred=predictions_test) * 100)
print("Accuracy of training dataset: ", accuracy_score(y_true=y_train, y_pred=predictions_train) * 100)

## 6. Visualizing the Decision Tree

In [None]:
# Render the tree currently bound to `clf` on a 15x10 inch canvas;
# filled=True colours each node.
plt.figure(figsize=(15, 10))
tree.plot_tree(decision_tree=clf, filled=True)
plt.show()

## 7. Pruning the Decision Tree to Avoid Overfitting

In [None]:
# Compute the cost-complexity pruning path on the training data: the candidate
# alphas and the corresponding impurities reported by the estimator.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Fit one tree per candidate alpha. The comprehension keeps its loop variable
# local, so the notebook-level `clf` (the model trained earlier) is NOT
# overwritten by the last, most heavily pruned tree. Re-running the evaluation
# or plotting cells after this one therefore still uses the intended model.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))

# Mean accuracy of every candidate tree on each partition.
train_scores = [model.score(X_train, y_train) for model in clfs]
test_scores = [model.score(X_test, y_test) for model in clfs]

# Plot accuracy against alpha as step functions: the gap between the two
# curves shows where pruning trades training fit for test performance.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.grid()
plt.show()

## 8. Evaluating the Pruned Tree

In [None]:
# Refit the tree with a fixed cost-complexity pruning strength.
# NOTE(review): 0.016 is hard-coded, presumably read off the accuracy-vs-alpha
# plot; confirm it still suits the data whenever the dataset or split changes.
clf = DecisionTreeClassifier(random_state=0, ccp_alpha=0.016)
clf.fit(X_train, y_train)

# Accuracy of test dataset after pruning.
# Reported as a percentage so it is directly comparable with the unpruned
# figures printed in section 5 (which are also scaled by 100).
pred = clf.predict(X_test)
print("Accuracy of test dataset after pruning: ", accuracy_score(y_test, pred) * 100)

# Accuracy of training dataset after pruning (same percentage scale).
pred_1 = clf.predict(X_train)
print("Accuracy of training dataset after pruning: ", accuracy_score(y_train, pred_1) * 100)

## 9. Visualizing the Pruned Decision Tree

In [None]:
# Draw whichever tree `clf` currently refers to, with coloured nodes, on a
# 15x10 inch figure.
plt.figure(figsize=(15, 10))
tree.plot_tree(decision_tree=clf, filled=True)
plt.show()

## 10. ROC Curve and AUC for the Pruned Tree

In [None]:
# Encoded label treated as the "positive" class for the ROC curve.
# NOTE(review): if 'Dosha' has more than two classes this is a one-vs-rest
# curve for label 1 only; confirm that is the intended comparison.
positive_class = 1

# predict_proba columns follow the order of clf.classes_, so look the column up
# instead of assuming label 1 sits at index 1. list.index raises ValueError if
# the class was never seen during training, rather than silently scoring the
# wrong column.
positive_col = list(clf.classes_).index(positive_class)
dt_probs = clf.predict_proba(X_test)[:, positive_col]
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, dt_probs, pos_label=positive_class)

# Area under the ROC curve, computed from the curve's own points.
auc_score_dt = auc(fpr_dt, tpr_dt)

def plot_roc_curve(fpr, tpr, auc_score=None):
    """Plot a ROC curve together with the chance diagonal.

    The curve is drawn from the ``fpr``/``tpr`` arguments themselves, so the
    function can be reused for any model rather than only for the
    notebook-level ``fpr_dt``/``tpr_dt`` arrays.

    Args:
        fpr: False-positive rates, one per threshold.
        tpr: True-positive rates, aligned with ``fpr``.
        auc_score: Area under the curve to show in the legend. When omitted,
            it is computed from ``fpr`` and ``tpr``.
    """
    if auc_score is None:
        # Derive the legend value from the plotted points so the label always
        # matches the curve.
        auc_score = auc(fpr, tpr)

    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='orange', label='AUC = %0.2f' % auc_score)
    # Dashed diagonal: the reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

# Draw the ROC curve for the decision tree's false/true positive rates.
plot_roc_curve(fpr=fpr_dt, tpr=tpr_dt)