#   LAB 04 - Data Mining

## Introduction to Python

Full Intro: https://dbdmg.polito.it/dbdmg_web/wp-content/uploads/2024/03/1-Python-Programming.pdf

### Pandas

Documentation: https://pandas.pydata.org/docs/

In [None]:
import pandas as pd
filename = 'breast.xlsx'

Read data

In [None]:
# Load the spreadsheet into a DataFrame (pandas also provides read_csv() and read_table() for text files).
dataframe=pd.read_excel(filename)

Select Features

In [None]:


# see columns
columns=dataframe.columns

columns

In [None]:
dataframe.dtypes

In [None]:
dataframe.head() # show the first 5 rows

In [None]:
dataframe.tail() # show the last 5 rows

In [None]:
def clean(dataframe):
    """Drop incomplete rows and encode the binary columns as 0/1.

    Rows containing a missing value (NaN) or the literal ``'?'`` placeholder
    in any column are removed. The ``node-caps``, ``breast``, ``irradiat`` and
    ``Class`` columns are then recoded to integers; values that are not listed
    in a column's mapping are left unchanged.

    Args:
        dataframe: Input DataFrame; must contain the four columns named above.

    Returns:
        A new, independent DataFrame. The caller's frame is not modified and
        the original row index labels of the surviving rows are kept.
    """
    binary_encodings = {
        "node-caps": {"'no'": 0, "'yes'": 1},
        "breast": {"'right'": 0, "'left'": 1},
        "irradiat": {"'no'": 0, "'yes'": 1},
        "Class": {"'no-recurrence-events'": 0, "'recurrence-events'": 1},
    }

    dataframe = dataframe.dropna()
    # Vectorized placeholder check (also well-defined for an empty frame).
    has_placeholder = (dataframe == "?").any(axis=1)
    # .copy() detaches the filtered rows so the column assignments below
    # operate on an independent frame instead of a slice of the input.
    dataframe = dataframe.loc[~has_placeholder].copy()

    for column, mapping in binary_encodings.items():
        dataframe[column] = dataframe[column].replace(mapping)
    return dataframe

In [None]:
dataframe = clean(dataframe)
dataframe.head()

### Matplotlib

Documentation: https://matplotlib.org/stable/index.html

In [None]:
import matplotlib.pyplot as plt

### Scikit-Learn

Documentation: https://scikit-learn.org/1.5/user_guide.html

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Encoding

In [None]:
# One-hot encode the categorical features with pandas' get_dummies.
# dtype=int makes the indicator columns 0/1 integers directly, so no
# boolean-to-integer replace() pass over the whole frame is needed afterwards.
categorical_columns = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'deg-malig', 'breast-quad']
dataframe = pd.get_dummies(dataframe, columns=categorical_columns, dtype=int)

In [None]:
dataframe.columns.size

In [None]:
dataframe.head()

In [None]:
dataframe.head(50)

In [None]:
print(dataframe.dtypes)


In [None]:
# X = features: every column except the target "Class" (drop returns a new frame).
X = dataframe.drop(columns=["Class"])
# y = target: the "Class" label column, selected by name.
y = dataframe["Class"]

In [None]:
X.dtypes

In [None]:
y.dtypes

Classifier

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split reproducible.
# NOTE(review): no stratify=y is passed, so class proportions may differ between
# the two parts — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Resample the training split with a seeded RandomUnderSampler.
# NOTE(review): X_train_res / y_train_res are not referenced by any later cell
# visible in this notebook — the models are fitted on X_train / y_train.
undersampler = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = undersampler.fit_resample(X_train, y_train)

In [None]:
# Train a Decision Tree Classifier (depth capped at 5, fixed seed) on the
# training split X_train / y_train.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# The most discriminative attribute is the column whose importance score in
# the fitted tree is the largest.
feature_importances = clf.feature_importances_
top_feature_position = feature_importances.argmax()
most_discriminative = X.columns[top_feature_position]
print(f"Most discriminative attribute: {most_discriminative}")

In [None]:
# Determine the height of the tree
tree_height = clf.get_depth()
print(f"Height of the Decision Tree: {tree_height}")

In [None]:
# Print the fitted tree as text rules (starting point for locating a pure partition).
# NOTE(review): nothing here identifies a pure leaf programmatically — the
# rules are only printed for manual inspection.
tree_rules = export_text(clf, feature_names=list(X.columns))
print(tree_rules)

In [None]:
from sklearn import tree

# Visualize the tree.
# Feature and class names are passed as plain lists, as in the export_text
# call; some scikit-learn releases reject a pandas Index / NumPy array here.
plt.figure(figsize=(200, 100))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=[str(c) for c in clf.classes_],  # class labels converted to strings
    filled=True
)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

predictions = clf.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

In [None]:
# Function used to show the impact of different hyperparameters on the tree
def train_and_plot(max_depth=None, min_samples_leaf=1, min_impurity_decrease=0.0, criterion='gini'):
    """Fit a decision tree, print its hold-out metrics and plot the tree.

    The tree is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``; feature names are taken from ``X``.

    Args:
        max_depth: Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
        min_samples_leaf: Forwarded to ``DecisionTreeClassifier``.
        min_impurity_decrease: Forwarded to ``DecisionTreeClassifier``.
        criterion: Forwarded to ``DecisionTreeClassifier``.

    Returns:
        None. The classification report and accuracy are printed and the
        tree figure is shown.
    """
    clf = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_impurity_decrease=min_impurity_decrease,
        random_state=42
    )
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(classification_report(y_test, predictions))
    print(accuracy_score(y_test, predictions))

    # Plot the tree. Names are passed as plain lists (as the export_text call
    # does): some scikit-learn releases reject a pandas Index / NumPy array.
    plt.figure(figsize=(200, 100))
    tree.plot_tree(
        clf,
        feature_names=list(X.columns),
        class_names=[str(label) for label in clf.classes_],
        filled=True,
    )
    plt.title(f"Depth: {max_depth}, Min Samples Leaf: {min_samples_leaf}, Impurity Decrease: {min_impurity_decrease}, Criterion: {criterion}")
    plt.show()


In [None]:
# Default configuration
train_and_plot()

In [None]:
# Total number of rows in the dataset
total_rows = dataframe.shape[0]
print(f"Total number of rows in the dataset: {total_rows}")

In [None]:
# Configuration 1: Limited depth
train_and_plot(max_depth=3)


In [None]:
# Additional configuration: depth limited to 5 plus a higher minimal impurity decrease
train_and_plot(max_depth=5, min_impurity_decrease=0.01)

In [None]:
# Configuration 2: Minimum samples per leaf
train_and_plot(min_samples_leaf=10)

In [None]:
# Configuration 3: Higher minimal impurity decrease, using the entropy criterion
train_and_plot(min_impurity_decrease=0.01, criterion='entropy')

In [None]:
#Config 4
train_and_plot(min_samples_leaf=4, min_impurity_decrease=0.001, criterion='entropy')

In [None]:
#Config 5
train_and_plot(max_depth=7, min_samples_leaf=10, criterion='gini')

## DOMANDA 3

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

In [None]:
def experiment_with_cv(X, y, max_depth=None, min_samples_leaf=1, min_samples_split=2, min_impurity_decrease=0.0, criterion='gini'):
    """Evaluate a decision tree with 10-fold stratified CV and plot the results.

    Out-of-fold predictions for every row of ``X`` are pooled to build the
    confusion matrix, the accuracy and the classification report. A tree with
    the same hyperparameters is also fitted on the full ``X`` / ``y`` that
    were passed in and plotted.

    Args:
        X: Feature DataFrame (its ``columns`` are used as feature names).
        y: Target labels aligned with ``X``.
        max_depth: Forwarded to ``DecisionTreeClassifier``.
        min_samples_leaf: Forwarded to ``DecisionTreeClassifier``.
        min_samples_split: Forwarded to ``DecisionTreeClassifier``.
        min_impurity_decrease: Forwarded to ``DecisionTreeClassifier``.
        criterion: Forwarded to ``DecisionTreeClassifier``.

    Returns:
        The text classification report computed from the pooled out-of-fold
        predictions.
    """
    clf = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_impurity_decrease=min_impurity_decrease,
        min_samples_split=min_samples_split,
        random_state=42
    )
    # Fit on the data passed in rather than on the notebook-level
    # X_train / y_train, so the plotted tree matches the data being evaluated
    # and the function does not depend on hidden global state.
    clf.fit(X, y)

    # Stratified k-fold with k = 10; cross_val_predict works on clones of clf.
    stk10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    predictions = cross_val_predict(clf, X, y, cv=stk10)

    # Create the confusion matrix from the pooled out-of-fold predictions
    confMatrix = confusion_matrix(y, predictions)
    print(confMatrix)

    # Display the confusion matrix with the pooled accuracy in the title
    accuracy = accuracy_score(y, predictions)
    ConfusionMatrixDisplay(confusion_matrix=confMatrix, display_labels=np.unique(y)).plot(cmap='Blues')
    plt.title(f"Confusion Matrix (Acc: {accuracy:.2f})\nMax Depth: {max_depth}, Min Samples Leaf: {min_samples_leaf}, Min Samples Split: {min_samples_split}, Impurity: {min_impurity_decrease}, Criterion: {criterion}")
    plt.show()

    # Plot the tree. Names are passed as plain lists: some scikit-learn
    # releases reject a pandas Index / NumPy array for these arguments.
    plt.figure(figsize=(200, 100))
    tree.plot_tree(
        clf,
        feature_names=list(X.columns),
        class_names=[str(label) for label in clf.classes_],
        filled=True,
    )
    plt.title(f"Depth: {max_depth}, Min Samples Leaf: {min_samples_leaf}, Min Samples Split: {min_samples_split}, Impurity Decrease: {min_impurity_decrease}, Criterion: {criterion}")
    plt.show()

    return classification_report(y, predictions)



In [None]:
# Config 1: depth limited to 3
rep = experiment_with_cv(X, y, max_depth=3)
print(rep)

In [None]:
rep = experiment_with_cv(X, y, max_depth=5, min_impurity_decrease=0.01)
print(rep)

In [None]:
rep = experiment_with_cv(X, y, min_impurity_decrease=0.01, criterion='entropy')
print(rep)

In [None]:
rep = experiment_with_cv(X, y, min_samples_leaf=4, min_impurity_decrease=0.001, criterion='entropy')
print(rep)

In [None]:
rep = experiment_with_cv(X, y,  max_depth=7, min_samples_leaf=10, criterion='gini')
print(rep)

In [None]:
rep = experiment_with_cv(X, y, max_depth=4, min_samples_leaf=5, min_impurity_decrease=0.005, criterion='entropy')
print(rep)

In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 7, 9, None],
    'min_samples_leaf': [1, 2, 4, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_impurity_decrease': [0.0, 0.001, 0.005, 0.01],
    'criterion': ['gini', 'entropy']
}

clf = DecisionTreeClassifier(random_state=42)
# Use the same shuffled, seeded 10-fold stratified splitter as
# experiment_with_cv, so the hyperparameters are selected under the protocol
# they are evaluated with afterwards (a bare cv=10 would not shuffle the folds).
grid_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=grid_cv)
# NOTE(review): the search runs on the full X / y, so a later CV score on the
# same data is an optimistic estimate for the selected parameters.
grid_search.fit(X, y)

print(grid_search.best_params_)

In [None]:
#Use the best parameters
rep = experiment_with_cv(X, y, **grid_search.best_params_)
print(rep)

## EX 4

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score

In [None]:
def evaluate_knn_with_cv(X, y, n_neighbors=5):
    """
    Evaluate a K-NN classifier with 10-fold stratified cross-validation.

    Out-of-fold predictions for every row are pooled, shown as a
    confusion-matrix plot and scored.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        n_neighbors: Number of neighbours (K) given to ``KNeighborsClassifier``.

    Returns:
        Accuracy of the pooled out-of-fold predictions.
    """
    # Initialize the K-NN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Perform 10-fold Stratified CV (shuffled, fixed seed)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    y_pred = cross_val_predict(knn, X, y, cv=skf)
    accuracy = accuracy_score(y, y_pred)

    # Generate and display the confusion matrix
    cm = confusion_matrix(y, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y)).plot(cmap='Blues')
    plt.title(f"Confusion Matrix (Acc: {accuracy:.2f})\nK={n_neighbors}")
    plt.show()

    return accuracy


In [None]:
# Evaluate K = 2..11 and remember the accuracy obtained for each K.
accuracies = {}
for k in range(2, 12):
    print(f"Evaluating K={k}")
    accuracies[k] = evaluate_knn_with_cv(X, y, n_neighbors=k)

# Print the accuracy obtained for each K
print("Average Accuracies for different K values:")
for k, acc in accuracies.items():
    summary_line = f"K={k}: Accuracy={acc:.2f}"
    print(summary_line)


In [None]:
# Candidate K values: the integers 1 through 20.
k_values = np.arange(1, 21)
print(k_values)

# Accuracy obtained for each K, keyed by K.
accuracies = {}
for k in k_values:
    print(f"Evaluating K={k}")
    accuracies[k] = evaluate_knn_with_cv(X, y, n_neighbors=k)

In [None]:
# Plot the accuracy curve: accuracy obtained for each K
# (the dict values are plotted in insertion order against k_values).
plt.figure(figsize=(10, 6))
plt.plot(k_values, list(accuracies.values()), marker='o', linestyle='-', color='b')
plt.title("Accuracy vs K")
plt.xlabel("K (Number of Neighbors)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

# Set up the Naïve Bayes model
naive_bayes_model = GaussianNB()

# 10-Fold Stratified Cross-Validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

accuracies_nb = []
confusion_matrices = []

# Fix the label set so every per-fold confusion matrix has the same shape
# and the matrices can be summed element-wise.
all_labels = np.unique(y)

# Fold-specific names are used on purpose: reusing X_train / X_test /
# y_train / y_test here would overwrite the notebook-level hold-out split
# that earlier cells (e.g. train_and_plot) rely on.
for train_index, test_index in skf.split(X, y):
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    naive_bayes_model.fit(X_fold_train, y_fold_train)
    y_pred = naive_bayes_model.predict(X_fold_test)

    accuracies_nb.append(accuracy_score(y_fold_test, y_pred))
    confusion_matrices.append(confusion_matrix(y_fold_test, y_pred, labels=all_labels))

# Mean accuracy across the folds
average_accuracy_nb = np.mean(accuracies_nb)

# Aggregated confusion matrix (element-wise sum over the folds)
confusion_matrix_total = np.sum(confusion_matrices, axis=0)

print(f"Naïve Bayes Average Accuracy: {average_accuracy_nb:.2f}")
print("Aggregated Confusion Matrix (Naïve Bayes):")
print(confusion_matrix_total)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score

# Naïve Bayes model and the 10-fold stratified splitter
naive_bayes_model = GaussianNB()
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold accuracy from 10-fold cross-validation
cv_scores = cross_val_score(naive_bayes_model, X, y, cv=skf, scoring='accuracy')

# Out-of-fold predictions, used for the confusion matrix
y_pred_cv = cross_val_predict(naive_bayes_model, X, y, cv=skf)

# Generate and display the confusion matrix
cm = confusion_matrix(y, y_pred_cv)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y))
cm_display.plot(cmap='Blues')
plt.title(f"Confusion Matrix for Naïve Bayes Classifier")
plt.show()

# Mean of the per-fold accuracies
average_accuracy_nb = cv_scores.mean()
print(f"Naïve Bayes Average Accuracy: {average_accuracy_nb:.2f}")


In [None]:
#Classification report for Naive Bayes
from sklearn.metrics import classification_report
print(classification_report(y, y_pred_cv))

In [None]:
# Show how many rows belong to each class (reveals the dominant class in the dataset)
dataframe['Class'].value_counts()

In [None]:
import seaborn as sns

#show the heatmap of the correlation matrix
plt.figure(figsize=(20, 20))
sns.heatmap(dataframe.corr(), annot=True, cmap='coolwarm', fmt=".2f")

In [None]:
# Try using Naive Bayes with Label Encoder instead of One Hot Encoding.
# The raw file is reloaded because `dataframe` was one-hot encoded earlier.
dataframe = pd.read_excel(filename)
dataframe = dataframe.dropna()
# Drop rows holding the '?' placeholder (vectorized check); .copy() detaches
# the filtered rows so the column assignments below act on an independent frame.
dataframe = dataframe[~(dataframe == '?').any(axis=1)].copy()

# Label Encoding
# NOTE(review): LabelEncoder numbers the categories in sorted label order, and
# GaussianNB will treat those codes as numeric values — confirm this is acceptable.
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in dataframe.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    dataframe[column] = le.fit_transform(dataframe[column])
    label_encoders[column] = le  # keep the fitted encoder for each column

# Split features and target by column name (as the one-hot pipeline does)
# instead of relying on "Class" being the last column.
X = dataframe.drop(columns=["Class"])
y = dataframe["Class"]

In [None]:
dataframe.head()

In [None]:
# Naive Bayes with Label Encoding
naive_bayes_model = GaussianNB()
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold accuracy from 10-fold cross-validation
cv_scores = cross_val_score(naive_bayes_model, X, y, cv=skf, scoring='accuracy')

# Out-of-fold predictions, used for the confusion matrix
y_pred_cv = cross_val_predict(naive_bayes_model, X, y, cv=skf)

# Mean of the per-fold accuracies
average_accuracy_nb = cv_scores.mean()

# Confusion matrix of the pooled out-of-fold predictions
cm = confusion_matrix(y, y_pred_cv)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y))
cm_display.plot(cmap='Blues')
plt.title(f"Confusion Matrix for Naïve Bayes Classifier with Label Encoder, Accuracy: {average_accuracy_nb:.2f}")
plt.show()

print(f"Naïve Bayes Average Accuracy: {average_accuracy_nb:.2f}")
print(classification_report(y, y_pred_cv))

In [None]:
dataframe["Class"].value_counts()