Q1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

# Q1: train one entropy-based decision tree on a single stratified 60/40 split
# of the car dataset and report its confusion matrix and weighted F1-score.

# Load the car dataset from CSV (the file is read without a header row)
url = "car.csv"
df = pd.read_csv(url, header=None)

# Assign column names
column_names = ["buying", "maint", "doors", "person", "lug_boot", "safety", "class"]
df.columns = column_names

# Separate features (X) and target variable (y);
# get_dummies one-hot encodes the categorical feature columns.
X = pd.get_dummies(df.drop("class", axis=1))
y = df["class"]

# Split the data into training (60%) and testing (40%), stratified on the class.
# random_state is fixed so the split -- and therefore the metrics printed
# below -- is identical on every "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Create a Decision Tree classifier with entropy as the criterion and no depth limit
clf = DecisionTreeClassifier(criterion="entropy", random_state=42, max_depth=None)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the performance using confusion matrix and F1-score.
# The weighted average weights each class's F1 by its support in y_test.
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the confusion matrix and F1 score
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)


Confusion Matrix:
[[137   3  13   1]
 [  2  26   0   0]
 [  2   0 482   0]
 [  2   0   0  24]]
F1 Score: 0.966290161302619


Q2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score

# Q2: sweep max_depth for an entropy-based decision tree. For every depth the
# tree is trained on num_repeats stratified 60/40 splits; the averaged
# confusion matrix and weighted F1-score are printed, and the depth with the
# highest average F1 is reported at the end.

# Load the car dataset from CSV (the file is read without a header row)
url = "car.csv"
df = pd.read_csv(url, header=None)

# Assign column names
column_names = ["buying", "maint", "doors", "person", "lug_boot", "safety", "class"]
df.columns = column_names

# Separate features (X) and target variable (y);
# get_dummies one-hot encodes the categorical feature columns.
X = pd.get_dummies(df.drop("class", axis=1))
y = df["class"]

# Number of repetitions per max_depth value
num_repeats = 20
# Per-depth averages, in the same order as max_depth_values
f1_scores = []
conf_matrices = []
# Define a range of max_depth values to test (None = grow until leaves are pure)
max_depth_values = [3, 5, 7, 10, 15, 20, 25, None]
for max_depth in max_depth_values:
    f1_scores_iter = []
    conf_matrices_iter = []
    for repeat in range(num_repeats):
        # Split the data into training (60%) and testing (40%).
        # Seeding with the repetition index makes the run reproducible and
        # gives every max_depth the same 20 splits, so the depths are
        # compared on identical data rather than on different random draws.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, stratify=y, random_state=repeat
        )
        # Create a Decision Tree classifier with entropy as the criterion
        clf = DecisionTreeClassifier(criterion="entropy", random_state=42,
                                     max_depth=max_depth)
        # Train the classifier on the training data
        clf.fit(X_train, y_train)
        # Make predictions on the test data
        y_pred = clf.predict(X_test)
        # Evaluate the performance using confusion matrix and F1-score
        conf_matrix = confusion_matrix(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        # Store results
        conf_matrices_iter.append(conf_matrix)
        f1_scores_iter.append(f1)

    # Calculate and print the average F1-score and confusion matrix for the current max_depth
    average_f1_iter = np.mean(f1_scores_iter)
    average_conf_matrix_iter = np.mean(conf_matrices_iter, axis=0)
    print(f"\nMax Depth: {max_depth}")
    print("Average Confusion Matrix:")
    # Round to the nearest count; a bare astype(int) would truncate
    # (e.g. an average of 3.95 would be shown as 3).
    print(np.rint(average_conf_matrix_iter).astype(int))
    print("Average F1 Score:", average_f1_iter)
    # Store results for all max_depth values
    f1_scores.append(average_f1_iter)
    conf_matrices.append(average_conf_matrix_iter)

# Find the max_depth that gives the highest average F1-score
best_max_depth_index = np.argmax(f1_scores)
best_max_depth = max_depth_values[best_max_depth_index]
print("\nBest Max Depth:", best_max_depth)



Max Depth: 3
Average Confusion Matrix:
[[128   0  25   0]
 [ 28   0   0   0]
 [ 62   0 421   0]
 [ 26   0   0   0]]
Average F1 Score: 0.7755432081985887

Max Depth: 5
Average Confusion Matrix:
[[148   2   0   2]
 [ 19   4   0   4]
 [ 53   0 429   0]
 [ 15   2   0   7]]
Average F1 Score: 0.8431097568125987

Max Depth: 7
Average Confusion Matrix:
[[139   3   4   6]
 [  6  16   0   4]
 [ 25   0 457   0]
 [  2   0   0  22]]
Average F1 Score: 0.9183385475624786

Max Depth: 10
Average Confusion Matrix:
[[144   3   4   1]
 [  3  23   0   1]
 [  8   0 474   0]
 [  1   1   0  23]]
Average F1 Score: 0.9622170799463339

Max Depth: 15
Average Confusion Matrix:
[[145   2   5   0]
 [  2  23   0   0]
 [  6   0 477   0]
 [  1   1   0  23]]
Average F1 Score: 0.9675345383034737

Max Depth: 20
Average Confusion Matrix:
[[142   1   9   0]
 [  3  23   0   0]
 [  6   1 476   0]
 [  1   1   0  22]]
Average F1 Score: 0.961283169010767

Max Depth: 25
Average Confusion Matrix:
[[143   2   7   1]
 [  3  23   0 

Q3. Code: single run


In [None]:
# Q3 (single run): train one Gini-based decision tree on a single stratified
# 60/40 split and report its confusion matrix and weighted F1-score.
# Relies on X and y (one-hot features / class labels) from the earlier cells.

# Split the data into training (60%) and testing (40%).
# The previous version of this cell also ran a copy-pasted max_depth/repeat
# loop that only re-split the data 160 times without training anything; it
# has been removed. random_state is fixed so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Create a Decision Tree classifier with the Gini index as the criterion and max_depth=None
clf = DecisionTreeClassifier(criterion="gini", random_state=42, max_depth=None)
# Train the classifier on the training data
clf.fit(X_train, y_train)
# Make predictions on the test data
y_pred = clf.predict(X_test)
# Evaluate the performance using confusion matrix and F1-score
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
# Print the confusion matrix and F1 score
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[146   4   4   0]
 [  1  25   2   0]
 [ 10   0 474   0]
 [  3   3   0  20]]
F1 Score: 0.9611546604733567


Q3. Code: 20 repetitions


In [None]:
# Q3 (20 repetitions): train a Gini-based decision tree on num_repeats
# stratified splits and report the averaged confusion matrix and weighted
# F1-score. Relies on X and y from the earlier cells.
#
# The previous version of this cell started with an extra split and a
# copy-pasted max_depth/repeat loop that only re-split the data without
# training anything; both were dead code and have been removed.

# Number of repetitions
num_repeats = 20
# Lists to store the per-repetition results
f1_scores = []
conf_matrices = []
for repeat in range(num_repeats):
    # Split the data into training (80%) and testing (20%).
    # NOTE(review): the other Q1-Q3 cells use test_size=0.4; confirm that
    # 0.2 is intended here.
    # Seeding with the repetition index keeps the 20 splits distinct but
    # reproducible across kernel restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=repeat
    )
    # Create a Decision Tree classifier with Gini index as the criterion
    clf = DecisionTreeClassifier(criterion="gini", random_state=42)
    # Train the classifier on the training data
    clf.fit(X_train, y_train)
    # Make predictions on the test data
    y_pred = clf.predict(X_test)
    # Evaluate the performance using confusion matrix and F1-score
    conf_matrix = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # Store results
    conf_matrices.append(conf_matrix)
    f1_scores.append(f1)

# Calculate and print the average F1-score and confusion matrix
average_f1 = np.mean(f1_scores)
average_conf_matrix = np.mean(conf_matrices, axis=0)
print("Average Confusion Matrix:")
# Round to the nearest count; a bare astype(int) would truncate the averages.
print(np.rint(average_conf_matrix).astype(int))
print("\nAverage F1 Score:", average_f1)


Average Confusion Matrix:
[[ 72   0   3   0]
 [  0  13   0   0]
 [  3   0 238   0]
 [  0   0   0  11]]

Average F1 Score: 0.9717123331349187


Overfitting

In [None]:
# Overfitting study: for each max_depth, train a Gini-based decision tree on
# num_repeats stratified 80/20 splits and print the averaged confusion matrix
# and weighted F1-score. Relies on X and y from the earlier cells.
#
# Indentation fix: training/scoring now happens inside the repetition loop
# and the reporting inside the max_depth loop. Previously only the last split
# of each depth was trained on, and only max_depth=50 was ever printed (as an
# "average" over a single run).

# Number of repetitions
num_repeats = 20

# Per-depth averages, in the same order as max_depth_values
f1_scores = []
conf_matrices = []
# Varying maximum depth values to induce overfitting
max_depth_values = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for max_depth in max_depth_values:
    f1_scores_depth = []
    conf_matrices_depth = []
    for repeat in range(num_repeats):
        # Split the data into training (80%) and testing (20%).
        # Seeding with the repetition index makes the run reproducible and
        # evaluates every depth on the same 20 splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=repeat
        )
        # Create a Decision Tree classifier with Gini index as the criterion
        clf = DecisionTreeClassifier(criterion="gini", max_depth=max_depth,
                                     random_state=42)
        # Train the classifier on the training data
        clf.fit(X_train, y_train)
        # Make predictions on the test data
        y_pred = clf.predict(X_test)
        # Evaluate the performance using confusion matrix and F1-score
        conf_matrix = confusion_matrix(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        # Store results
        conf_matrices_depth.append(conf_matrix)
        f1_scores_depth.append(f1)

    # Calculate and print the average F1-score and confusion matrix for each depth
    average_f1_depth = np.mean(f1_scores_depth)
    average_conf_matrix_depth = np.mean(conf_matrices_depth, axis=0)
    print(f"Results for max_depth={max_depth}:")
    print("Average Confusion Matrix:")
    # Round to the nearest count; a bare astype(int) would truncate the averages.
    print(np.rint(average_conf_matrix_depth).astype(int))
    print("Average F1 Score:", average_f1_depth)
    print("\n")
    # Keep the per-depth averages so they can be inspected after the loop
    f1_scores.append(average_f1_depth)
    conf_matrices.append(average_conf_matrix_depth)

Results for max_depth=50:
Average Confusion Matrix:
[[ 75   1   1   0]
 [  3  10   0   1]
 [  5   0 237   0]
 [  2   0   0  11]]
Average F1 Score: 0.9621713765788911


