# DECISION TREE

## 1. Consider the following dataset and calculate the entropy and information gain w.r.t the target attribute named “Status”.


In [None]:
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

# Define the dataset: ten applicants, with 'Status' as the target attribute
data = pd.DataFrame({
    'Age_Group': ['Old', 'Middle', 'Middle', 'Young', 'Middle', 'Young', 'Young', 'Old', 'Old', 'Middle'],
    'Certified': ['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No'],
    'Skill_Type': ['Soft skill', 'Hard skill', 'Soft skill', 'Hard skill', 'Hard skill', 'Soft skill', 'Soft skill', 'Soft skill', 'Hard skill', 'Soft skill'],
    'Status': ['Rejected', 'Selected', 'Rejected', 'Selected', 'Rejected', 'Selected', 'Selected', 'Rejected', 'Rejected', 'Selected']})

# Encode the categorical feature columns as integer codes.
# fit_transform refits the encoder on every call, so one instance is reused.
le = LabelEncoder()
data['Age_Group'] = le.fit_transform(data['Age_Group'])
data['Certified'] = le.fit_transform(data['Certified'])
data['Skill_Type'] = le.fit_transform(data['Skill_Type'])

# Entropy of the target attribute "Status", in bits:
# H(S) = -(p_selected * log2(p_selected) + p_rejected * log2(p_rejected))
num_records = len(data)
num_selected = len(data[data['Status'] == 'Selected'])
num_rejected = len(data[data['Status'] == 'Rejected'])
p_selected = num_selected / num_records
p_rejected = num_rejected / num_records
entropy_s = -(p_selected * math.log2(p_selected) + p_rejected * math.log2(p_rejected))
print("Entropy(S) =", entropy_s)

# Information gain of every feature w.r.t. the target "Status"
X = data.drop('Status', axis=1)  # Features
y = data['Status']  # Target
# discrete_features=True asks scikit-learn for the exact contingency-table
# estimate; the default treats dense input as continuous and falls back to a
# randomised nearest-neighbour estimate. The result is reported in nats, so it
# is divided by ln(2) to express it in bits, the same unit as Entropy(S).
information_gain_status = pd.Series(
    mutual_info_classif(X, y, discrete_features=True) / math.log(2),
    index=X.columns,
)
for attribute, gain in information_gain_status.items():
    print(f"Information Gain({attribute}) =", gain)

## 2. From the above calculated values of gain, design a decision tree for the above given data set.

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Fit an entropy-based tree on the encoded features and write it to tree.dot
# in Graphviz DOT format.
dtree = DecisionTreeClassifier(criterion='entropy')
dtree.fit(X, y)
export_graphviz(
    dtree,
    out_file='tree.dot',
    # A plain list of names, the same form the export_text call in this file uses
    feature_names=list(X.columns),
    # classes_ is sorted, which is the order class_names has to follow
    class_names=[str(label) for label in dtree.classes_],
)

After this, to see the image we first have to convert the .dot file into a .png file from the command line:

dot -Tpng tree.dot -o tree.png

<img src="tree.png">

## 3. Transform the designed decision tree into decision rules

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_text

# Train an entropy tree capped at two levels; fit() hands back the estimator
clf = DecisionTreeClassifier(max_depth=2, criterion="entropy").fit(X, y)

# Render the fitted tree as indented text rules and show them
rules = export_text(clf, feature_names=list(X.columns))
print(rules)

## 4. Use the designed decision tree or rules to predict the ‘Status’ of the given employee.
## ▪ Young No Hard Skill
## ▪ Old Yes Soft Skill
## ▪ Middle Yes Hard Skill

Using the decision tree:

In [None]:
def predict_status(age_group, certified, skill_type):
    """Predict an applicant's 'Status' from the decision tree of the dataset.

    The rules mirror the ten training records defined in this file: every
    'Young' applicant is 'Selected', every 'Old' applicant is 'Rejected',
    and 'Middle' applicants are decided by certification (certified ->
    'Rejected', not certified -> 'Selected').

    Args:
        age_group: 'Young', 'Middle' or 'Old'.
        certified: 'Yes' or 'No'; only consulted for the 'Middle' age group.
        skill_type: 'Soft skill' or 'Hard skill'. Kept for interface
            compatibility; no rule needs it because age group and
            certification already separate the training records.

    Returns:
        'Selected' or 'Rejected', or 'Unknown' when a value lies outside the
        categories listed above.
    """
    if age_group == 'Young':
        return 'Selected'
    if age_group == 'Old':
        return 'Rejected'
    if age_group == 'Middle':
        # Only this branch is mixed, so it needs the second attribute.
        if certified == 'Yes':
            return 'Rejected'
        if certified == 'No':
            return 'Selected'
    return 'Unknown'

In [None]:
# Each query below also appears as a labelled row in the training data
# (Selected, Rejected and Rejected respectively), which gives a reference to
# check the printed predictions against.
print(predict_status('Young', 'No', 'Hard skill'))  # query 1: Young, not certified, hard skill
print(predict_status('Old', 'Yes', 'Soft skill'))  # query 2: Old, certified, soft skill
print(predict_status('Middle', 'Yes', 'Hard skill'))  # query 3: Middle, certified, hard skill

## 5. Design a function named find_entropy in python for finding the entropy of the attributes given in the above dataset.

In [None]:
import math

def find_entropy(df, attribute):
    """Return the base-2 Shannon entropy of the values in one column.

    Args:
        df: pandas DataFrame holding the records.
        attribute: name of the column whose value distribution is measured.

    Returns:
        -sum(p * log2(p)) over the distinct values of the column, where p is
        the fraction of rows carrying that value; 0 for a frame with no rows.
    """
    total = len(df)
    column = df[attribute]
    # Accumulate sum(p * log2(p)), which is never positive, and flip the sign once.
    weighted_logs = 0
    for level in column.unique():
        share = len(df[column == level]) / total
        weighted_logs += share * math.log2(share)
    return 0 - weighted_logs

To use this function to find the entropy of the 'Age_Group' attribute in the given dataset, you can call:

In [None]:
# Entropy of the Age_Group column over every row of `data`
entropy_age_group = find_entropy(data, 'Age_Group')
print("Entropy(Age_Group) =", entropy_age_group)

## 6. Design a function named find_gain in python for finding the information gain of the attributes given in the above dataset w.r.t. the ‘Status’ attribute

In [None]:
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif

# Rebuild the ten-record applicant table, one list per column
data = pd.DataFrame({
    'Age_Group': [
        'Old', 'Middle', 'Middle', 'Young', 'Middle',
        'Young', 'Young', 'Old', 'Old', 'Middle',
    ],
    'Certified': [
        'Yes', 'No', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'No', 'No', 'No',
    ],
    'Skill_Type': [
        'Soft skill', 'Hard skill', 'Soft skill', 'Hard skill', 'Hard skill',
        'Soft skill', 'Soft skill', 'Soft skill', 'Hard skill', 'Soft skill',
    ],
    'Status': [
        'Rejected', 'Selected', 'Rejected', 'Selected', 'Rejected',
        'Selected', 'Selected', 'Rejected', 'Rejected', 'Selected',
    ],
})

# Swap each categorical feature for integer codes; 'Status' keeps its labels.
# The encoder is refitted per column, in the order listed.
le = LabelEncoder()
for column in ('Age_Group', 'Certified', 'Skill_Type'):
    data[column] = le.fit_transform(data[column])

def find_entropy(df, attribute):
    """
    Calculate the Shannon entropy (in bits) of a given attribute in the dataframe.

    Works for any column, whatever its distinct values are, rather than only
    for a column holding the labels 'Selected' and 'Rejected'.

    Args:
        df: pandas DataFrame holding the records.
        attribute: name of the column whose value distribution is measured.

    Returns:
        -sum(p * log2(p)) over the observed values, as a float. The result is
        0.0 when the column holds a single distinct value or no values at all.
        Missing values are left out of the distribution.
    """
    # value_counts() skips missing values; zero counts (unused categories of a
    # categorical column) are dropped because log2(0) is undefined.
    counts = [count for count in df[attribute].value_counts().tolist() if count > 0]
    total = sum(counts)
    entropy = 0.0
    for count in counts:
        probability = count / total
        entropy -= probability * math.log2(probability)
    return entropy

def find_gain(df, attribute):
    """
    Information gain from splitting df on `attribute`, measured against 'Status'.

    The gain is the 'Status' entropy of the whole frame minus the size-weighted
    average of the 'Status' entropy inside each group of rows that share one
    value of `attribute`.
    """
    baseline = find_entropy(df, 'Status')
    total_rows = len(df)
    # One sub-frame per distinct value, in order of first appearance
    groups = (df[df[attribute] == level] for level in df[attribute].unique())
    expected_entropy = sum(
        (len(group) / total_rows) * find_entropy(group, 'Status')
        for group in groups
    )
    return baseline - expected_entropy

# Example usage: information gain of each feature w.r.t. 'Status'
gain_age_group = find_gain(data, 'Age_Group')
gain_Skill_types = find_gain(data, 'Skill_Type')
gain_certified = find_gain(data, 'Certified')
print("Information Gain(Age_Group) =", gain_age_group)
# The Skill_Type gain is printed under its own label so it cannot be mistaken
# for the Certified gain on the next line.
print("Information Gain(Skill_Type) =", gain_Skill_types)
print("Information Gain(Certified) =", gain_certified)

## 7. Load the above dataset as data frame in python

In [None]:
import pandas as pd

# The applicant table written row by row: (Age_Group, Certified, Skill_Type, Status)
data = pd.DataFrame(
    [
        ('Old', 'Yes', 'Soft skill', 'Rejected'),
        ('Middle', 'No', 'Hard skill', 'Selected'),
        ('Middle', 'Yes', 'Soft skill', 'Rejected'),
        ('Young', 'No', 'Hard skill', 'Selected'),
        ('Middle', 'Yes', 'Hard skill', 'Rejected'),
        ('Young', 'No', 'Soft skill', 'Selected'),
        ('Young', 'Yes', 'Soft skill', 'Selected'),
        ('Old', 'No', 'Soft skill', 'Rejected'),
        ('Old', 'No', 'Hard skill', 'Rejected'),
        ('Middle', 'No', 'Soft skill', 'Selected'),
    ],
    columns=['Age_Group', 'Certified', 'Skill_Type', 'Status'],
)

# Persist the table without the row index so it reloads with the same columns
data.to_csv('data.csv', index=False)

In [None]:
# Read data.csv back into a DataFrame. The bare `data` expression on the last
# line is there so that a notebook cell renders the table.
data = pd.read_csv('data.csv')
data

In [None]:
# Define the dataset as a pandas dataframe, passing one keyword per column
data = pd.DataFrame(dict(
    Age_Group=['Old', 'Middle', 'Middle', 'Young', 'Middle', 'Young', 'Young', 'Old', 'Old', 'Middle'],
    Certified=['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No'],
    Skill_Type=['Soft skill', 'Hard skill', 'Soft skill', 'Hard skill', 'Hard skill', 'Soft skill', 'Soft skill', 'Soft skill', 'Hard skill', 'Soft skill'],
    Status=['Rejected', 'Selected', 'Rejected', 'Selected', 'Rejected', 'Selected', 'Selected', 'Rejected', 'Rejected', 'Selected'],
))
data

## 8. Design and visualize the decision tree using scikit learn package for the given dataset.

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Define the dataset. Age_Group is included so the tree is built from the
# complete table; without it, rows with identical features carry different
# 'Status' labels and no tree can separate them.
data = pd.DataFrame({
    'Age_Group': ['Old', 'Middle', 'Middle', 'Young', 'Middle', 'Young', 'Young', 'Old', 'Old', 'Middle'],
    'Certified': ['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No'],
    'Skill_Type': ['Soft skill', 'Hard skill', 'Soft skill', 'Hard skill', 'Hard skill', 'Soft skill', 'Soft skill', 'Soft skill', 'Hard skill', 'Soft skill'],
    'Status': ['Rejected', 'Selected', 'Rejected', 'Selected', 'Rejected', 'Selected', 'Selected', 'Rejected', 'Rejected', 'Selected']})

# Convert categorical data to numerical: one indicator column per category,
# so no artificial ordering is imposed on the categories
data = pd.get_dummies(data, columns=['Age_Group', 'Certified', 'Skill_Type'])

# Split the dataset into features and target
X = data.drop('Status', axis=1)
y = data['Status']

# Create a decision tree classifier. A fixed seed keeps the choice between
# equally good splits the same from run to run.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X, y)

# Visualize the decision tree
plt.figure(figsize=(20,10))
plot_tree(
    clf,
    # A plain list of names, the same form the export_text call in this file uses
    feature_names=list(X.columns),
    # Taken from the fitted model so the names always follow its class order
    class_names=[str(label) for label in clf.classes_],
    filled=True,
)
plt.show()

## 9. Design and visualize the decision tree using scikit-learn package for the Iris (training) dataset from Kaggle.


In [None]:
import pandas as pd

# Load the dataset from iris.csv, resolved relative to the working directory.
# NOTE(review): the split further down needs a 'Species' column; confirm the
# file's remaining columns before treating them all as features.
iris = pd.read_csv('iris.csv')

In [None]:
# Column labels of the loaded frame, evaluated as a bare expression for inspection
iris.columns

Next, split the dataset into training and testing sets using Scikit-learn:

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets (80% / 20%, fixed seed).
# NOTE(review): the Kaggle Iris CSV is understood to carry a running 'Id'
# column. It is dropped when present so a row counter is never used as a
# feature; confirm against iris.columns. errors='ignore' makes the drop a
# no-op for files without that column.
X_train, X_test, y_train, y_test = train_test_split(
    iris.drop(columns=['Id', 'Species'], errors='ignore'),
    iris['Species'],
    test_size=0.2,
    random_state=42,
)


Then, create the decision tree classifier using Scikit-learn:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create the decision tree classifier; random_state=42 fixes the estimator's seed
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training set (features X_train, labels y_train)
clf.fit(X_train, y_train)


Finally, visualize the decision tree using Scikit-learn's tree.plot_tree function:

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Visualize the decision tree. Split nodes are labelled with the training
# column names and nodes with the class labels, rather than generic
# placeholders.
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(
    clf,
    feature_names=list(X_train.columns),
    # classes_ is sorted, which is the order class_names has to follow
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    ax=ax,
)
plt.show()

## 10. Evaluate the designed model on the Iris dataset itself with various metrics.

Computing the evaluation metrics and the confusion matrix:

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the fitted tree on the same rows it was trained on
y_pred = clf.predict(X_train)

# Accuracy takes no averaging option; the other three use average='weighted'
accuracy = accuracy_score(y_train, y_pred)
precision, recall, f1 = [
    metric(y_train, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]
confusion = confusion_matrix(y_train, y_pred)

# Report each figure on its own line, with the confusion matrix last
for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", confusion),
):
    print(caption, value)