In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

# Fetch the Breast Cancer Wisconsin (Diagnostic) dataset from the UCI repository.
# NOTE: this call needs network access.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Data
features = breast_cancer_wisconsin_diagnostic.data.features
raw_targets = breast_cancer_wisconsin_diagnostic.data.targets

# Encode the labels.
# LabelEncoder expects a 1-D sequence; the targets come back as a table
# (presumably a single column - confirm against the ucimlrepo docs), so flatten
# them first rather than relying on scikit-learn's column-vector coercion and
# the DataConversionWarning that comes with it.
le = LabelEncoder()
labels = le.fit_transform(raw_targets.to_numpy().ravel())

# Function to split data into different proportions
def prepare_datasets(features, labels, proportions):
    """Create one stratified train/test split per requested proportion.

    Parameters
    ----------
    features : feature table handed to ``train_test_split``.
    labels : class labels aligned with ``features``; also used as the
        stratification target.
    proportions : iterable of ``(train_fraction, test_fraction)`` pairs.
        Only the test fraction is passed to ``train_test_split``; the train
        fraction is used solely to build the result key.

    Returns
    -------
    dict
        Maps ``'<train_fraction>_<test_fraction>'`` to a dict holding
        ``'X_train'``, ``'X_test'``, ``'y_train'`` and ``'y_test'``.
    """
    part_names = ('X_train', 'X_test', 'y_train', 'y_test')
    datasets = {}
    for prop in proportions:
        train_frac, test_frac = prop[0], prop[1]
        # Fixed seed so every proportion is split reproducibly.
        parts = train_test_split(
            features,
            labels,
            test_size=test_frac,
            stratify=labels,
            random_state=42,
        )
        datasets[f'{train_frac}_{test_frac}'] = dict(zip(part_names, parts))
    return datasets

# Define the proportions
proportions = [(0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1)]

# Prepare datasets
datasets = prepare_datasets(features, labels, proportions)


ConnectionError: Error connecting to server

In [None]:
def plot_class_distribution(y, title):
    """Show a bar chart of the number of samples in each class.

    Parameters
    ----------
    y : 1-D sequence of class labels to count.
    title : str
        Title drawn above the chart.
    """
    # Pass the labels by keyword: seaborn's first positional parameter is
    # `data`, not the variable to count, so a bare positional argument is not
    # interpreted as the x variable.
    sns.countplot(x=y)
    plt.title(title)
    plt.show()

# Class balance of the full label set, as a baseline for the splits below.
plot_class_distribution(labels, "Original Dataset")

# Then the training and test labels of every prepared split.
for key, split in datasets.items():
    plot_class_distribution(split['y_train'], f"Training Set {key}")
    plot_class_distribution(split['y_test'], f"Test Set {key}")


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

# Function to build and visualize decision trees
def build_decision_tree(X_train, y_train, max_depth=None, feature_names=None, class_names=None):
    """Fit an entropy-based decision tree and build its Graphviz diagram.

    Parameters
    ----------
    X_train : training features passed to ``DecisionTreeClassifier.fit``.
    y_train : training labels.
    max_depth : int or None
        Depth limit forwarded to ``DecisionTreeClassifier``.
    feature_names : sequence of str, optional
        Names shown on the split nodes. Defaults to ``X_train.columns`` when
        ``X_train`` has that attribute, otherwise to the module-level
        ``features.columns``.
    class_names : sequence of str, optional
        Names shown for the classes. Defaults to the classes of the
        module-level label encoder ``le``.

    Returns
    -------
    tuple
        ``(clf, graph)``: the fitted classifier and a ``graphviz.Source``
        built from the exported DOT text.
    """
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=42)
    clf.fit(X_train, y_train)

    # Prefer the names carried by the data actually used for fitting, so the
    # diagram cannot drift from a differently-shaped global table.
    if feature_names is None:
        own_columns = getattr(X_train, 'columns', None)
        feature_names = features.columns if own_columns is None else own_columns
    if class_names is None:
        class_names = le.classes_

    # Visualize the decision tree.
    # Plain lists of str are passed because some scikit-learn releases reject
    # a pandas Index / NumPy array for these two parameters.
    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        feature_names=[str(name) for name in feature_names],
        class_names=[str(name) for name in class_names],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = graphviz.Source(dot_data)
    return clf, graph

# Fit one tree (default depth limit) per split and render its diagram as PNG
for key, split in datasets.items():
    clf, graph = build_decision_tree(split['X_train'], split['y_train'])
    graph.render(f'decision_tree_{key}', format='png')


In [None]:
# Depth limits to compare; None is forwarded as-is to the tree's max_depth.
max_depth_values = [None, 2, 3, 4, 5, 6, 7]
accuracies = []

# Use the 80/20 split
X_train = datasets['0.8_0.2']['X_train']
X_test = datasets['0.8_0.2']['X_test']
y_train = datasets['0.8_0.2']['y_train']
y_test = datasets['0.8_0.2']['y_test']

for depth in max_depth_values:
    clf, graph = build_decision_tree(X_train, y_train, max_depth=depth)
    graph.render(f'decision_tree_depth_{depth}', format='png')
    # A classifier's score() is its mean accuracy on the given data, so no
    # separate metric import is needed (accuracy_score was never imported
    # in this notebook and raised NameError here).
    accuracy = clf.score(X_test, y_test)
    accuracies.append(accuracy)

# Text labels for the depths: None has no position on a numeric axis and would
# appear as NaN in the table, so the unlimited-depth run is labelled explicitly.
depth_labels = ['None' if d is None else str(d) for d in max_depth_values]

# Report the accuracy
accuracy_df = pd.DataFrame({
    'max_depth': depth_labels,
    'accuracy': accuracies
})

print(accuracy_df)

# Plotting the results (categorical x-axis keeps the 'None' run on the chart)
plt.plot(depth_labels, accuracies, marker='o')
plt.title('Decision Tree Accuracy vs. Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.show()
