Task from the Machine Learning course in the master's program, working with two datasets and building a decision tree.

In [31]:
# Imports needed

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.utils import resample
from sklearn.metrics import classification_report, accuracy_score

In [32]:
# Reading both datasets
file_path_training_data = 'C:/Users/Victor/Documents/datasets/Template - Distinguindo Frutas - Dados.csv' 
file_path_testing_data = 'C:/Users/Victor/Documents/datasets/fruit_test.csv'

treino = pd.read_csv(file_path_training_data, delimiter=';')
teste = pd.read_csv(file_path_testing_data, delimiter=';')


In [33]:
treino.head() # to understand the dataset

Unnamed: 0,mass,width,height,color_score,fruit_name
0,192,8.4,7.3,0.55,apple
1,180,8.0,6.8,0.59,apple
2,86,6.2,4.7,0.8,mandarin
3,84,6.0,4.6,0.79,mandarin
4,80,5.8,4.3,0.77,mandarin


In [34]:
teste.head() # to understand the dataset

Unnamed: 0,mass,width,height,color_score,fruit_name
0,176,7.4,7.2,0.6,apple
1,76,5.8,4.0,0.81,mandarin
2,166,6.9,7.3,0.93,apple
3,168,7.5,7.6,0.73,apple
4,170,7.6,7.9,0.88,apple


In [35]:
# Separation of features and the target variable from the 'train' dataset
X_train = treino[['mass', 'width', 'height', 'color_score']]
y_train = treino['fruit_name']

# Creating and trainin the Decision Tree Model
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)

In [36]:
# Definition of X and y test

X_test = teste[['mass', 'width', 'height', 'color_score']]
y_test = teste['fruit_name']

# Predicting in training dataset
y_pred = clf.predict(X_test)

# Accuracy on training dataset
test_accuracy = accuracy_score(y_test, y_pred)

# Classifier Report
test_report = classification_report(y_test, y_pred)

# Results
print(f"Accuracy training dataset : {test_accuracy:.2%}\n")
print("Classifier report:\n")
print(test_report)

Accuracy training dataset : 90.91%

Classifier report:

              precision    recall  f1-score   support

       apple       1.00      0.75      0.86         4
       lemon       1.00      1.00      1.00         3
    mandarin       1.00      1.00      1.00         1
      orange       0.75      1.00      0.86         3

    accuracy                           0.91        11
   macro avg       0.94      0.94      0.93        11
weighted avg       0.93      0.91      0.91        11



In [37]:
# Concatenate the features and labels from the training and test datasets
X_combined = pd.concat([X_train, X_test], axis=0)
y_combined = pd.concat([y_train, y_test], axis=0)

from sklearn.metrics import classification_report

# Lists to store the predictions and the true values for each fold
y_true_all = []
y_pred_all = []

# K-Fold Cross-validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kf.split(X_combined):
    X_train_fold, X_test_fold = X_combined.iloc[train_index], X_combined.iloc[test_index]
    y_train_fold, y_test_fold = y_combined.iloc[train_index], y_combined.iloc[test_index]

    # Training model in each fold
    clf.fit(X_train_fold, y_train_fold)

    # Predictions on the test set of the fold
    y_pred_fold = clf.predict(X_test_fold)

    # Results
    y_true_all.extend(y_test_fold)
    y_pred_all.extend(y_pred_fold)

# General Results after K-fold
final_report = classification_report(y_true_all, y_pred_all)

# Showing classifier report
print("Relatório de Classificação (K-Fold Cross-Validation):\n")
print(final_report)

Relatório de Classificação (K-Fold Cross-Validation):

              precision    recall  f1-score   support

       apple       0.94      0.79      0.86        19
       lemon       0.84      1.00      0.91        16
    mandarin       1.00      1.00      1.00         5
      orange       0.89      0.89      0.89        19

    accuracy                           0.90        59
   macro avg       0.92      0.92      0.92        59
weighted avg       0.90      0.90      0.90        59



In [38]:
from sklearn.metrics import classification_report

# Defining number of iterations
n_iterations = 100

# Lists to store the prediction and true values in bootstrapping
y_true_bootstrap = []
y_pred_bootstrap = []

for i in range(n_iterations):
    # Create bootstrapped samples
    X_resampled, y_resampled = resample(X_combined, y_combined, random_state=i)

    # Separate a portion of the set for testing (non-resampled samples)
    X_oob = X_combined.drop(X_resampled.index)
    y_oob = y_combined.drop(y_resampled.index)

    # Train the model on the bootstrapped samples
    clf.fit(X_resampled, y_resampled)

    # Predictions on the OOB (out-of-bag) samples
    y_pred_oob = clf.predict(X_oob)

    # Store the results for reporting
    y_true_bootstrap.extend(y_oob)
    y_pred_bootstrap.extend(y_pred_oob)

# Classification report after bootstrapping
bootstrap_report = classification_report(y_true_bootstrap, y_pred_bootstrap)

# Showing classifier report
print("Classification Report (Bootstrapping):\n")
print(bootstrap_report)

Classification Report (Bootstrapping):

              precision    recall  f1-score   support

       apple       0.83      0.84      0.84       425
       lemon       0.88      0.83      0.86       513
    mandarin       0.78      0.90      0.84        62
      orange       0.83      0.85      0.84       628

    accuracy                           0.84      1628
   macro avg       0.83      0.86      0.84      1628
weighted avg       0.85      0.84      0.84      1628

