Problem Statement 1 and 2 :
Randomly select 60 percent of the labeled data (from each class) for constructing the tree (training), and test on the remaining 40 percent. Evaluate the classification tree's accuracy with the help of the confusion matrix and the F-score. Use the entropy measure for attribute selection.
Repeat the above exercise 20 times.  Calculate the average accuracy of classification.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

# Load the car dataset from CSV
url = 'car.csv'  # Replace with the actual path or URL of your dataset
car_data = pd.read_csv(url)
print(car_data)

# Attribute columns used as predictors; the target is the `cls` column
features = ['buying', 'maint', 'doors', 'person', 'lug_boot', 'safety']

# Separate the target (y) from the features (X); the categorical attributes
# are converted to numerical columns using one-hot encoding
y = car_data.cls
X = pd.get_dummies(car_data[features])

# Fixed, sorted class order: guarantees every per-run confusion matrix has the
# same shape and row/column meaning, so the matrices can be averaged safely
class_labels = np.unique(y)

# Number of repetitions
num_repeats = 20

# Lists to store per-run results
f1_scores = []
accuracies = []
conf_matrices = []

for _ in range(num_repeats):
    # Split the data into training (60%) and testing (40%), keeping the class
    # proportions in both parts; no seed, so each repetition draws a new split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y)

    # Create a Decision Tree classifier with entropy as the criterion
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Evaluate the performance using confusion matrix, F1-score and accuracy
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Accuracy = correctly classified samples (diagonal) / all test samples
    accuracy = np.trace(conf_matrix) / conf_matrix.sum()

    # Store results
    conf_matrices.append(conf_matrix)
    f1_scores.append(f1)
    accuracies.append(accuracy)

# Calculate the averages over all repetitions
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracies)
average_conf_matrix = np.mean(conf_matrices, axis=0)

print("Average Confusion Matrix:")
# Round to the nearest integer; plain astype(int) would truncate every cell
print(np.rint(average_conf_matrix).astype(int))
print("\nAverage F1 Score:", average_f1)
print("Average Accuracy:", average_accuracy)


     buying  maint  doors person lug_boot safety    cls
0     vhigh  vhigh      2      2    small    low  unacc
1     vhigh  vhigh      2      2    small    med  unacc
2     vhigh  vhigh      2      2    small   high  unacc
3     vhigh  vhigh      2      2      med    low  unacc
4     vhigh  vhigh      2      2      med    med  unacc
...     ...    ...    ...    ...      ...    ...    ...
1723    low    low  5more   more      med    med   good
1724    low    low  5more   more      med   high  vgood
1725    low    low  5more   more      big    low  unacc
1726    low    low  5more   more      big    med   good
1727    low    low  5more   more      big   high  vgood

[1728 rows x 7 columns]
Average Confusion Matrix:
[[142   2   7   1]
 [  2  23   0   1]
 [  6   0 477   0]
 [  1   1   0  23]]

Average F1 Score: 0.9639742078829098


Problem Statement 3 :
Repeat steps 1 and 2 with Gini index as a measure for selection of attributes.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

# Load the car dataset from CSV
url = 'car.csv'  # Replace with the actual path or URL of your dataset
car_data = pd.read_csv(url)
print(car_data)

# Attribute columns used as predictors; the target is the `cls` column
features = ['buying', 'maint', 'doors', 'person', 'lug_boot', 'safety']

# Separate the target (y) from the features (X); the categorical attributes
# are converted to numerical columns using one-hot encoding
y = car_data.cls
X = pd.get_dummies(car_data[features])

# Fixed, sorted class order: guarantees every per-run confusion matrix has the
# same shape and row/column meaning, so the matrices can be averaged safely
class_labels = np.unique(y)

# Number of repetitions
num_repeats = 20

# Lists to store per-run results
f1_scores = []
accuracies = []
conf_matrices = []

for _ in range(num_repeats):
    # Split the data into training (60%) and testing (40%), keeping the class
    # proportions in both parts; no seed, so each repetition draws a new split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y)

    # Create a Decision Tree classifier with Gini index as the criterion
    clf = DecisionTreeClassifier(criterion="gini", random_state=42)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Evaluate the performance using confusion matrix, F1-score and accuracy
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Accuracy = correctly classified samples (diagonal) / all test samples
    accuracy = np.trace(conf_matrix) / conf_matrix.sum()

    # Store results
    conf_matrices.append(conf_matrix)
    f1_scores.append(f1)
    accuracies.append(accuracy)

# Calculate the averages over all repetitions
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracies)
average_conf_matrix = np.mean(conf_matrices, axis=0)

print("Average Confusion Matrix:")
# Round to the nearest integer; plain astype(int) would truncate every cell
print(np.rint(average_conf_matrix).astype(int))
print("\nAverage F1 Score:", average_f1)
print("Average Accuracy:", average_accuracy)


     buying  maint  doors person lug_boot safety    cls
0     vhigh  vhigh      2      2    small    low  unacc
1     vhigh  vhigh      2      2    small    med  unacc
2     vhigh  vhigh      2      2    small   high  unacc
3     vhigh  vhigh      2      2      med    low  unacc
4     vhigh  vhigh      2      2      med    med  unacc
...     ...    ...    ...    ...      ...    ...    ...
1723    low    low  5more   more      med    med   good
1724    low    low  5more   more      med   high  vgood
1725    low    low  5more   more      big    low  unacc
1726    low    low  5more   more      big    med   good
1727    low    low  5more   more      big   high  vgood

[1728 rows x 7 columns]
Average Confusion Matrix:
[[141   1   9   1]
 [  3  23   0   1]
 [  7   0 475   0]
 [  1   1   0  22]]

Average F1 Score: 0.9595434604863483


Entropy measure with a 70% training / 30% testing split:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

# Load the car dataset from CSV
url = 'car.csv'  # Replace with the actual path or URL of your dataset
car_data = pd.read_csv(url)
print(car_data)

# Attribute columns used as predictors; the target is the `cls` column
features = ['buying', 'maint', 'doors', 'person', 'lug_boot', 'safety']

# Separate the target (y) from the features (X); the categorical attributes
# are converted to numerical columns using one-hot encoding
y = car_data.cls
X = pd.get_dummies(car_data[features])

# Fixed, sorted class order: guarantees every per-run confusion matrix has the
# same shape and row/column meaning, so the matrices can be averaged safely
class_labels = np.unique(y)

# Number of repetitions
num_repeats = 20

# Lists to store per-run results
f1_scores = []
accuracies = []
conf_matrices = []

for _ in range(num_repeats):
    # Split the data into training (70%) and testing (30%), keeping the class
    # proportions in both parts; no seed, so each repetition draws a new split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create a Decision Tree classifier with entropy as the criterion
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Evaluate the performance using confusion matrix, F1-score and accuracy
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Accuracy = correctly classified samples (diagonal) / all test samples
    accuracy = np.trace(conf_matrix) / conf_matrix.sum()

    # Store results
    conf_matrices.append(conf_matrix)
    f1_scores.append(f1)
    accuracies.append(accuracy)

# Calculate the averages over all repetitions
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracies)
average_conf_matrix = np.mean(conf_matrices, axis=0)

print("Average Confusion Matrix:")
# Round to the nearest integer; plain astype(int) would truncate every cell
print(np.rint(average_conf_matrix).astype(int))
print("\nAverage F1 Score:", average_f1)
print("Average Accuracy:", average_accuracy)


     buying  maint  doors person lug_boot safety    cls
0     vhigh  vhigh      2      2    small    low  unacc
1     vhigh  vhigh      2      2    small    med  unacc
2     vhigh  vhigh      2      2    small   high  unacc
3     vhigh  vhigh      2      2      med    low  unacc
4     vhigh  vhigh      2      2      med    med  unacc
...     ...    ...    ...    ...      ...    ...    ...
1723    low    low  5more   more      med    med   good
1724    low    low  5more   more      med   high  vgood
1725    low    low  5more   more      big    low  unacc
1726    low    low  5more   more      big    med   good
1727    low    low  5more   more      big   high  vgood

[1728 rows x 7 columns]
Average Confusion Matrix:
[[108   1   3   1]
 [  2  17   0   0]
 [  4   0 357   0]
 [  1   0   0  18]]

Average F1 Score: 0.968405506332892


Gini index measure with a 70% training / 30% testing split:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

# Load the car dataset from CSV
url = 'car.csv'  # Replace with the actual path or URL of your dataset
car_data = pd.read_csv(url)
print(car_data)

# Attribute columns used as predictors; the target is the `cls` column
features = ['buying', 'maint', 'doors', 'person', 'lug_boot', 'safety']

# Separate the target (y) from the features (X); the categorical attributes
# are converted to numerical columns using one-hot encoding
y = car_data.cls
X = pd.get_dummies(car_data[features])

# Fixed, sorted class order: guarantees every per-run confusion matrix has the
# same shape and row/column meaning, so the matrices can be averaged safely
class_labels = np.unique(y)

# Number of repetitions
num_repeats = 20

# Lists to store per-run results
f1_scores = []
accuracies = []
conf_matrices = []

for _ in range(num_repeats):
    # Split the data into training (70%) and testing (30%), keeping the class
    # proportions in both parts; no seed, so each repetition draws a new split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

    # Create a Decision Tree classifier with Gini index as the criterion
    clf = DecisionTreeClassifier(criterion="gini", random_state=42)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Evaluate the performance using confusion matrix, F1-score and accuracy
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Accuracy = correctly classified samples (diagonal) / all test samples
    accuracy = np.trace(conf_matrix) / conf_matrix.sum()

    # Store results
    conf_matrices.append(conf_matrix)
    f1_scores.append(f1)
    accuracies.append(accuracy)

# Calculate the averages over all repetitions
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracies)
average_conf_matrix = np.mean(conf_matrices, axis=0)

print("Average Confusion Matrix:")
# Round to the nearest integer; plain astype(int) would truncate every cell
print(np.rint(average_conf_matrix).astype(int))
print("\nAverage F1 Score:", average_f1)
print("Average Accuracy:", average_accuracy)


     buying  maint  doors person lug_boot safety    cls
0     vhigh  vhigh      2      2    small    low  unacc
1     vhigh  vhigh      2      2    small    med  unacc
2     vhigh  vhigh      2      2    small   high  unacc
3     vhigh  vhigh      2      2      med    low  unacc
4     vhigh  vhigh      2      2      med    med  unacc
...     ...    ...    ...    ...      ...    ...    ...
1723    low    low  5more   more      med    med   good
1724    low    low  5more   more      med   high  vgood
1725    low    low  5more   more      big    low  unacc
1726    low    low  5more   more      big    med   good
1727    low    low  5more   more      big   high  vgood

[1728 rows x 7 columns]
Average Confusion Matrix:
[[106   1   6   0]
 [  1  18   0   0]
 [  5   0 357   0]
 [  1   1   0  17]]

Average F1 Score: 0.9627956592574257
