# Imports

In [99]:
import pandas as pd
import numpy as np
import random

# Clean Dataset

Airplane

- Read csv file

In [100]:
# Load the raw Airplane dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("csv/Airplane.csv") 

- Dropping missing data

In [101]:
# Drop every row that contains at least one missing value, so the integer casts
# and threshold comparisons in the following cells never have to deal with NaN.
data = data.dropna()

- Converting non-numeric values to numeric

In [102]:
# Integer code for every label of each categorical column, keyed by column name.
# A label that is absent from its mapping becomes NaN under .map(), which makes
# the int cast raise instead of silently producing a wrong code.
category_codes = {
    "satisfaction": {"neutral or dissatisfied": 0, "satisfied": 1},
    "Customer Type": {"disloyal Customer": 0, "Loyal Customer": 1},
    "Type of Travel": {"Personal Travel": 0, "Business travel": 1},
    "Gender": {"Female": 0, "Male": 1},
    "Class": {"Eco": 0, "Eco Plus": 1, "Business": 2},
}
for column_name, codes in category_codes.items():
    data[column_name] = data[column_name].map(codes).astype(int)

- Categorizing continuous data

* -- Arrival Delay in Minutes

In [103]:
# Binarise the arrival delay: 0 = delay of at most 5, 1 = delay above 5.
# Both masks are taken from the original values before anything is overwritten.
arrival_delay_raw = data["Arrival Delay in Minutes"].copy()
is_late_arrival = arrival_delay_raw > 5
is_on_time_arrival = arrival_delay_raw <= 5
data.loc[is_late_arrival, "Arrival Delay in Minutes"] = 1
data.loc[is_on_time_arrival, "Arrival Delay in Minutes"] = 0

* -- Age

In [104]:
# Replace the age by an ordinal bucket:
#   0: <= 20, 1: (20, 39], 2: (39, 60], 3: > 60
# The masks are built from the untouched ages, then applied one bucket at a time.
age_raw = data["Age"].copy()
age_buckets = [
    (age_raw <= 20, 0),
    ((age_raw > 20) & (age_raw <= 39), 1),
    ((age_raw > 39) & (age_raw <= 60), 2),
    (age_raw > 60, 3),
]
for in_bucket, bucket_code in age_buckets:
    data.loc[in_bucket, "Age"] = bucket_code

* -- Cleanliness

In [105]:
# Collapse the cleanliness rating into three levels:
#   0: below 3, 1: exactly 3, 2: above 3
# Masks come from the original ratings so the new codes cannot be re-matched.
cleanliness_raw = data["Cleanliness"].copy()
cleanliness_levels = {
    0: cleanliness_raw < 3,
    1: cleanliness_raw == 3,
    2: cleanliness_raw > 3,
}
for level_code, has_level in cleanliness_levels.items():
    data.loc[has_level, "Cleanliness"] = level_code

* -- Flight Distance

In [106]:
# Bucket the flight distance into four ordinal bands:
#   0: <= 1000, 1: (1000, 2000], 2: (2000, 3000], 3: > 3000
# Every mask is computed from the untouched distances before any band is written.
flight_distance_raw = data["Flight Distance"].copy()
distance_bands = {
    0: flight_distance_raw <= 1000,
    1: (flight_distance_raw > 1000) & (flight_distance_raw <= 2000),
    2: (flight_distance_raw > 2000) & (flight_distance_raw <= 3000),
    3: flight_distance_raw > 3000,
}
for band_code, in_band in distance_bands.items():
    data.loc[in_band, "Flight Distance"] = band_code

* -- Departure Delay in Minutes

In [107]:
# Bucket the departure delay into three ordinal classes:
#   0: <= 5, 1: (5, 25], 2: > 25
# The masks are derived from the original values, then written back in turn.
departure_delay_raw = data["Departure Delay in Minutes"].copy()
departure_delay_classes = [
    (departure_delay_raw <= 5, 0),
    ((departure_delay_raw > 5) & (departure_delay_raw <= 25), 1),
    (departure_delay_raw > 25, 2),
]
for in_class, class_code in departure_delay_classes:
    data.loc[in_class, "Departure Delay in Minutes"] = class_code

* Selecting the last 10,000 rows as test data

In [108]:
# Hold out the last 10,000 rows of the cleaned frame (in row order) as the test set.
test = data.tail(10000)

* Keeping the first 90,000 rows as the training pool (this excludes the last 10,000 test rows only when the data frame has at least 100,000 rows)

In [109]:
# Keep the first 90,000 rows as the pool the training sample is later drawn from.
# NOTE(review): this is disjoint from the tail(10000) test slice only when the
# frame has at least 100,000 rows — confirm the row count after dropna().
data = data.head(90000)

data

Unnamed: 0,index,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,1,1,0,0,1,0,3,4,...,5,4,3,4,4,5,2,1,1.0,0
1,1,5047,1,0,1,1,2,0,3,2,...,1,1,5,3,1,4,0,0,1.0,0
2,2,110028,0,1,1,1,2,1,2,2,...,5,4,3,4,4,4,2,0,0.0,1
3,3,24026,0,1,1,1,2,0,2,5,...,2,2,5,3,1,4,0,1,1.0,0
4,4,119299,1,1,3,1,2,0,3,3,...,3,3,4,4,3,3,1,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90266,90266,86270,0,1,0,0,0,0,2,5,...,2,4,4,5,3,4,0,1,0.0,0
90267,90267,81715,0,1,1,0,1,0,2,5,...,4,3,5,4,5,5,2,2,1.0,0
90268,90268,93597,1,1,1,1,2,0,4,4,...,4,3,5,4,4,5,2,0,0.0,1
90269,90269,97030,1,1,1,0,1,0,1,4,...,1,2,2,3,3,3,0,0,0.0,0


* Separating data for satisfied and neutral or dissatisfied

In [110]:
# Split the training pool by target class so each class can be sampled separately.
satisfaction_0 = data[data['satisfaction'] == 0]
satisfaction_1 = data[data['satisfaction'] == 1]
# Seed Python's `random` module so the random.sample calls in the next cell
# pick the same rows on every fresh run.
random.seed(43)

* Selecting 10,000 random samples from each group

In [111]:
# Draw 10,000 distinct row labels per class. random.sample samples without
# replacement and raises ValueError when a class has fewer than 10,000 rows.
# The picks depend on the seed set in the previous cell and on the order of
# these two calls, so do not reorder them.
satisfaction_0_random = random.sample(satisfaction_0.index.tolist(), 10000)
satisfaction_1_random = random.sample(satisfaction_1.index.tolist(), 10000)

* Combining these two data sets

In [112]:
# Build the balanced training set: the sampled class-0 rows followed by the sampled class-1 rows.
data = pd.concat([data.loc[satisfaction_0_random], data.loc[satisfaction_1_random]])

* Dropping unnecessary columns

In [113]:
# Remove the same columns from the training and the test frame so both
# expose an identical set of feature columns to the tree.
columns_to_drop = ["index", "id", "Gender"]
data = data.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

* Show the cleaned data

In [114]:
# Display the balanced, cleaned training frame (pandas truncates the rendering of large frames).
data

Unnamed: 0,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
4477,0,1,1,0,0,2,0,2,4,1,...,1,3,2,3,4,3,0,1,1.0,0
33362,1,2,0,1,0,1,3,1,5,2,...,2,2,4,1,2,3,0,2,1.0,0
80877,1,2,1,2,2,2,3,3,3,3,...,2,2,1,2,3,2,1,0,0.0,0
88513,1,0,0,1,0,2,5,2,4,3,...,3,3,4,5,2,1,1,0,0.0,0
16844,1,2,0,1,1,1,4,1,3,1,...,1,4,4,4,4,3,0,2,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1664,1,2,1,1,0,4,4,4,4,4,...,4,1,4,5,2,5,2,0,0.0,1
33320,1,2,1,2,0,3,3,3,3,5,...,4,4,4,4,4,4,2,0,1.0,1
63384,1,2,1,2,1,5,5,5,5,3,...,5,5,5,5,3,5,1,0,0.0,1
66370,1,1,1,2,3,1,1,1,1,3,...,4,4,4,4,4,4,1,0,0.0,1


In [115]:
# Mean of the 0/1 target per age bucket, i.e. the share of satisfied passengers in each bucket.
age_and_target = data[["Age", "satisfaction"]]
satisfaction_rate_by_age = age_and_target.groupby(["Age"], as_index=False).mean()
print(satisfaction_rate_by_age)

   Age  satisfaction
0    0      0.245791
1    1      0.456366
2    2      0.634953
3    3      0.253857


Restaurant

In [116]:
# data = pd.read_csv("csv/Restaurant.csv");
# test = data.tail(1);
# data = data.head(10);

# Tree Node

In [117]:
class TreeNode:
    """One node of a multiway decision tree.

    Attributes:
        feature: Name of the column this node splits on (None for a leaf).
        value: Predicted target value stored in a leaf (None for a decision node).
        subtrees: Dict mapping each value of `feature` to the child node for that value.
    """

    def __init__(self, feature=None, value=None, subtrees=None):
        # Create the dict per instance so nodes never share one child mapping.
        if subtrees is None:
            subtrees = {}

        self.feature = feature
        self.value = value  # for leaf node
        self.subtrees = subtrees

# Gini index / Entropy

**Gini index**

In [118]:
# Function to calculate Gini index
def calculate_gini(labels):
    """Return the Gini impurity of a collection of class labels.

    The impurity is 1 - sum(p_k ** 2), where p_k is the relative frequency
    of class k among `labels`.

    Args:
        labels: Sized collection of class labels (list, NumPy array, pandas Series, ...).

    Returns:
        0 for an empty collection, otherwise the impurity as a float
        (0 when every label is identical).
    """
    total_samples = len(labels)
    # len() never returns None, so the emptiness check must compare against 0;
    # without it an empty input would fall through and report an impurity of 1.
    if total_samples == 0:
        return 0

    _, counts = np.unique(labels, return_counts=True)

    # Start from 1 and subtract the squared relative frequency of every class
    gini = 1
    for count in counts:
        probability = count / total_samples
        gini -= probability ** 2
    return gini

# Function to select the feature with the best Gini index
def best_feature_gini(df, target_name):
    """Pick the column whose split yields the lowest weighted Gini impurity.

    Every column except `target_name` is tried as a multiway split (one
    partition per distinct value). The impurity of each partition's target
    labels is weighted by the partition's share of the rows and summed.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_name: Name of the target column.

    Returns:
        The name of the winning column (the first one on ties), or None when
        `df` has no column other than the target.
    """
    row_count = len(df)
    winner = None
    lowest_impurity = float('inf')

    for candidate in df.columns:
        if candidate != target_name:
            # Weighted impurity over the partitions induced by this column
            impurity = 0
            for level in df[candidate].unique():
                partition = df[df[candidate] == level]
                partition_gini = calculate_gini(partition[target_name])
                impurity += (len(partition) / row_count) * partition_gini

            # Strict comparison keeps the earliest column when impurities tie
            if impurity < lowest_impurity:
                lowest_impurity = impurity
                winner = candidate

    return winner


**Entropy**

In [119]:
def calculate_entropy(labels):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    The entropy is -sum(p_k * log2(p_k)), where p_k is the relative frequency
    of class k among `labels`.

    Args:
        labels: Sized collection of class labels.

    Returns:
        0 for an empty collection, otherwise the entropy as a float.
    """
    sample_count = len(labels)
    if sample_count == 0:
        return 0

    class_counts = np.unique(labels, return_counts=True)[1]

    # Accumulate -p * log2(p) class by class
    result = 0
    for share in class_counts / sample_count:
        result = result - share * np.log2(share)
    return result

# Function to select the best feature using entropy
def best_feature_entropy(df, target_name):
    """Pick the column whose split yields the lowest weighted entropy.

    Every column except `target_name` is tried as a multiway split (one
    partition per distinct value). The entropy of each partition's target
    labels is weighted by the partition's share of the rows and summed.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_name: Name of the target column.

    Returns:
        The name of the winning column (the first one on ties), or None when
        `df` has no column other than the target.
    """
    row_count = len(df)
    winner = None
    lowest_entropy = float('inf')

    for candidate in df.columns:
        if candidate != target_name:
            # Weighted entropy over the partitions induced by this column
            split_entropy = 0
            for level in df[candidate].unique():
                partition = df[df[candidate] == level]
                partition_entropy = calculate_entropy(partition[target_name])
                split_entropy += (len(partition) / row_count) * partition_entropy

            # Strict comparison keeps the earliest column when entropies tie
            if split_entropy < lowest_entropy:
                lowest_entropy = split_entropy
                winner = candidate

    return winner

# Build Decision Tree

**Gini index**

In [120]:
# Function to build a decision tree using Gini index
def build_tree_gini(df, target_name=None):
    """Recursively build a multiway decision tree using the Gini index.

    Each decision node splits on the column chosen by `best_feature_gini`
    and gets one child per distinct value of that column. The chosen column
    is removed from the rows handed to the children, so a feature is used at
    most once on any root-to-leaf path and the recursion depth is bounded by
    the number of feature columns.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_name: Name of the target column. Must be provided; the None
            default only exists for signature compatibility.

    Returns:
        The root `TreeNode` of the (sub)tree built from `df`.
    """
    # If all samples have the same target value, create a leaf node with that value
    if len(df[target_name].unique()) == 1:
        return TreeNode(value=df[target_name].iloc[0])

    # If only the target column is left, no feature remains to split on:
    # create a leaf node with the most frequent target value
    if len(df.columns) == 1:
        return TreeNode(value=df[target_name].mode()[0])

    # Select the best feature using the Gini index
    best_feature = best_feature_gini(df, target_name)

    # Create a decision node with the selected feature
    current_node = TreeNode(feature=best_feature)

    # Recursively build subtrees for each value of the selected feature
    for value in df[best_feature].unique():
        matching_rows = df[df[best_feature] == value]
        # A NaN level equals nothing, so it selects no rows; there is nothing to learn from it
        if matching_rows.empty:
            continue
        # Drop the feature just used. Without this the stop rule above can never
        # fire, and rows with identical features but different labels would
        # recurse forever on the same subset.
        subset = matching_rows.drop(columns=[best_feature])
        current_node.subtrees[value] = build_tree_gini(subset, target_name)

    return current_node


**Entropy**

In [121]:
def build_tree_entropy(df, target_name=None):
    """Recursively build a multiway decision tree using entropy.

    Each decision node splits on the column chosen by `best_feature_entropy`
    and gets one child per distinct value of that column. The chosen column
    is removed from the rows handed to the children, so a feature is used at
    most once on any root-to-leaf path and the recursion depth is bounded by
    the number of feature columns.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_name: Name of the target column. Must be provided; the None
            default only exists for signature compatibility.

    Returns:
        The root `TreeNode` of the (sub)tree built from `df`.
    """
    # If all samples have the same target value, create a leaf node with that value.
    if len(df[target_name].unique()) == 1:
        return TreeNode(value=df[target_name].iloc[0])

    # If only the target column is left, no feature remains to split on:
    # create a leaf node with the most frequent target value.
    if len(df.columns) == 1:
        return TreeNode(value=df[target_name].mode()[0])

    # Select the best feature using entropy.
    best_feature = best_feature_entropy(df, target_name)

    # Create a decision node with the selected feature.
    current_node = TreeNode(feature=best_feature)

    # Recursively build subtrees for each value.
    for value in df[best_feature].unique():
        matching_rows = df[df[best_feature] == value]
        # A NaN level equals nothing, so it selects no rows; there is nothing to learn from it.
        if matching_rows.empty:
            continue
        # Drop the feature just used. Without this the stop rule above can never
        # fire, and rows with identical features but different labels would
        # recurse forever on the same subset.
        subset = matching_rows.drop(columns=[best_feature])
        current_node.subtrees[value] = build_tree_entropy(subset, target_name)

    return current_node


# Predict

In [122]:
def predict(tree, data_point):
    """Classify one sample by walking the tree from `tree` down to a leaf.

    Args:
        tree: Root node of the (sub)tree; nodes expose `feature`, `value`
            and `subtrees`.
        data_point: Mapping-like sample (e.g. a DataFrame row) indexed by
            feature name.

    Returns:
        The `value` of the leaf that is reached, or 0 when the sample holds
        a feature value for which the current node has no branch.
    """
    node = tree
    # A node whose value is None is a decision node: keep descending
    while node.value is None:
        observed = data_point[node.feature]
        if observed not in node.subtrees:
            # Feature value not covered by this node: fall back to class 0
            return 0
        node = node.subtrees[observed]
    return node.value

# Run & accuracy

In [123]:
# Name of the label column that both trees learn to predict
target_name = 'satisfaction'

# Grow the tree that splits on the Gini index
decision_tree_gini = build_tree_gini(data, target_name)
print("Gini Done!")

# Grow the tree that splits on entropy
decision_tree_entropy = build_tree_entropy(data, target_name)
print("entropy Done!")

Gini Done!
entropy Done!


In [124]:
# True labels of the held-out rows
actual_labels = test[target_name]

# Classify every held-out row with each tree (one prediction per row)
predictions_gini = test.apply(
    lambda passenger: predict(decision_tree_gini, passenger), axis=1
)
predictions_entropy = test.apply(
    lambda passenger: predict(decision_tree_entropy, passenger), axis=1
)

# Accuracy = share of rows whose prediction equals the true label
accuracy_gini = (predictions_gini == actual_labels).mean()
accuracy_entropy = (predictions_entropy == actual_labels).mean()

print("Gini Accuracy = ",(accuracy_gini * 100),"%")
print("Entropy Accuracy = ",(accuracy_entropy * 100),"%")

Gini Accuracy =  93.46 %
Entropy Accuracy =  93.32000000000001 %


# Print Tree

In [125]:
def print_tree(tree, depth=0, parent_feature=None, value = None):
    """Print an indented text rendering of a decision tree.

    Args:
        tree: Node to print; nodes expose `feature`, `value` and `subtrees`.
            None prints nothing.
        depth: Nesting level of `tree` (0 for the root); controls the indent.
        parent_feature: Feature of the parent node, None for the root.
        value: Value of the parent's feature that leads to `tree`
            (None for the root).
    """
    if tree is None:
        return

    indent = "|    " * (depth - 1)

    if tree.feature is not None:
        if parent_feature is not None:
            print(indent + f"{value} -->|-{tree.feature}")
        else:
            print(f"{tree.feature}")
        # A separate loop name keeps the `value` parameter from being overwritten
        for branch_value, subtree in tree.subtrees.items():
            print_tree(subtree, depth + 1, f"{tree.feature}", value=branch_value)
        return

    if value is None:
        # The root itself is a leaf: there is no incoming branch to label
        print(tree.value)
        return

    # Show numeric branch values as integers (1.0 -> 1); values that cannot be
    # converted, such as text categories, are printed unchanged instead of raising.
    try:
        branch_label = int(value)
    except (TypeError, ValueError, OverflowError):
        branch_label = value
    print(indent, branch_label, "-->", tree.value)

# The following line is to display the tree
# print_tree(decision_tree_gini); 

    * The tree is not shown because it is too big 

**Finish**