In [1]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras import models
from tensorflow.keras import layers

import pandas as pd

Na początku wczytano dane o samochodach z pliku:

In [2]:
cars = pd.read_csv('car.data', encoding='utf-8')
cars

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,accept
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


Następnie przekonwertowano nazwy klas decyzyjnych na wartości liczbowe:

In [3]:
cars['buying'].replace(['low', 'med', 'high', 'vhigh'], [0,1,2,3], inplace=True)
cars['maint'].replace(['low', 'med', 'high', 'vhigh'], [0,1,2,3], inplace=True)
cars['doors'].replace(['2', '3', '4', '5more'], [0,1,2,3], inplace=True)
cars['persons'].replace(['2', '4', 'more'], [0,1,2], inplace=True)
cars['lug_boot'].replace(['small', 'med', 'big'], [0,1,2], inplace=True)
cars['safety'].replace(['low', 'med', 'high'], [0,1,2], inplace=True)
cars['accept'].replace(['unacc', 'acc', 'good', 'vgood'], [0,1,2,3], inplace=True)

Oddzielono też parametry wejściowe od klas decyzyjnych:

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, classification_report

x = cars.drop("accept", axis=1)
y = cars["accept"]

Następnie wyznaczono błędy pomiaru i współczynniki TP (true positive), TN (true negative), FP (false positive) i FN (false negative):

In [5]:
X_train: list = []
X_test: list = []
y_train: list = []
y_test: list = []

def display_errors(no_of_splits: int, dt_max_depth: int) -> None:
    predictions = []
    mean_absolute_errors = []
    mean_square_errors = []
    TPRs = []
    TNRs = []
    FPRs = []
    FNRs = []
    k_folds = KFold(n_splits = no_of_splits)
    DT_classifier = DecisionTreeClassifier(max_depth=dt_max_depth, criterion='entropy', random_state=42)

    for i, (train_index, test_index) in enumerate(k_folds.split(x)):
        X_train, X_test = x.iloc[train_index,:], x.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        DT_classifier.fit(X_train, y_train)
        prediction = DT_classifier.predict(X_test)
        cm = confusion_matrix(y_test, prediction)
        predictions.append(prediction)
        print("Confusion matrix for fold #" + str(i + 1))
        print(cm)
        print("Classification report for fold #" + str(i + 1))
        print(classification_report(y_test, prediction))
        # Wyświetlić tutaj metryki MAE, MSE, TP rate, TN rate, FP rate, FN rate, sumować do tablicy i pod koniec wyświetlić
        # średnią i odchylenie standardowe dla tych metryk
        mae = mean_absolute_error(y_test, prediction)
        mean_absolute_errors.append(mae)
        mse = mean_squared_error(y_test, prediction)
        mean_square_errors.append(mse)
        print("Mean absolute error for fold #" + str(i + 1))
        print(mae)
        print("Mean squared error for fold #" + str(i + 1))
        print(mse)
        FP = cm.sum(axis=0) - np.diag(cm)  
        FN = cm.sum(axis=1) - np.diag(cm)
        TP = np.diag(cm)
        TN = cm.sum() - (FP + FN + TP)

        print("TP rate for fold #" + str(i + 1) + " (unacc, acc, good, vgood)")
        TPR = TP/(TP+FN)
        TPRs.append(TPR)
        print(TPR)
        print("TN rate for fold #" + str(i + 1) + " (unacc, acc, good, vgood)")
        TNR = TN/(TN+FP) 
        TNRs.append(TNR)
        print(TNR)
        print("FP rate for fold #" + str(i + 1) + " (unacc, acc, good, vgood)")
        FPR = FP/(FP+TN)
        FPRs.append(FPR)
        print(FPR)
        print("FN rate for fold #" + str(i + 1) + " (unacc, acc, good, vgood)")
        FNR = FN/(TP+FN)
        FNRs.append(FNR)
        print(FNR)

        print("=======================================")
        
    print("Mean for MAE:")
    print(np.mean(np.asarray(mean_absolute_errors)))
    print("Stdev for MAE:")
    print(np.std(np.asarray(mean_absolute_errors)))
    print("Mean for MSE:")
    print(np.mean(np.asarray(mean_square_errors)))
    print("Stdev for MSE:")
    print(np.std(np.asarray(mean_square_errors)))

def train(test_size: float, no_of_splits: int, dt_max_depth: int) -> None:
    global X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
    k_folds = KFold(n_splits = no_of_splits)
    DT_classifier = DecisionTreeClassifier(max_depth=dt_max_depth, criterion='entropy', random_state=42)
    DT_classifier.fit(X_train, y_train)
    y_pred = DT_classifier.predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    print(cf_matrix)
    print(classification_report(y_test, y_pred))

Następnie wytrenowano dane na optymalnych parametrach:

In [6]:
train(0.25, 5, 5)

[[279  25   0   0]
 [  7  76   0   3]
 [  0  10   0  10]
 [  0   5   0  17]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95       304
           1       0.66      0.88      0.75        86
           2       0.00      0.00      0.00        20
           3       0.57      0.77      0.65        22

    accuracy                           0.86       432
   macro avg       0.55      0.64      0.59       432
weighted avg       0.85      0.86      0.85       432



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
display_errors(5, 5)

Confusion matrix for fold #1
[[242  64]
 [  0  40]]
Classification report for fold #1
              precision    recall  f1-score   support

           0       1.00      0.79      0.88       306
           1       0.38      1.00      0.56        40

    accuracy                           0.82       346
   macro avg       0.69      0.90      0.72       346
weighted avg       0.93      0.82      0.85       346

Mean absolute error for fold #1
0.18497109826589594
Mean squared error for fold #1
0.18497109826589594
TP rate for fold #1 (unacc, acc, good, vgood)
[0.79084967 1.        ]
TN rate for fold #1 (unacc, acc, good, vgood)
[1.         0.79084967]
FP rate for fold #1 (unacc, acc, good, vgood)
[0.         0.20915033]
FN rate for fold #1 (unacc, acc, good, vgood)
[0.20915033 0.        ]
Confusion matrix for fold #2
[[263   5]
 [ 44  34]]
Classification report for fold #2
              precision    recall  f1-score   support

           0       0.86      0.98      0.91       268
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  TPR = TP/(TP+FN)
  FNR = FN/(TP+FN)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Regresja logistyczna (Logistic Regression)

Kolejnym krokiem było dokonanie regresji logistycznej danych:

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

# Initialize a Logistic Regression classifier.
logreg = LogisticRegression(solver='saga', multi_class='auto', random_state=42, n_jobs=-1)

# Train the classifier.
logreg.fit(X_train,y_train)



LogisticRegression(n_jobs=-1, random_state=42, solver='saga')

Następnie dokonano klasyfikacji danych testowych i walidacji krzyżowej:

In [9]:
# Make predictions.
log_pred=logreg.predict(X_test)

# CV score
logreg_cv = cross_val_score(logreg,X_train,y_train,cv=10)

Następnie obliczono średni błąd kwadratowy, średni błąd bezwzględny, dokładność trenowania oraz dokładność walidacji krzyżowej:

In [10]:
# The mean squared error (relative error).
print("Mean squared error (MSE): %.3f" % mean_squared_error(y_test, log_pred))
# Explained average absolute error (average error).
print("Mean absolute error (MAE): %.3f" % mean_absolute_error(y_test, log_pred))
# Explained variance score: 1 is perfect prediction.
print('Accuracy: %.3f' % logreg.score(X_test, y_test))
# CV Accuracy
print('CV Accuracy: %.3f' % logreg_cv.mean())

Mean squared error (MSE): 0.264
Mean absolute error (MAE): 0.204
Accuracy: 0.826
CV Accuracy: 0.825


Następnie dokonano klasyfikacji danych testowych za pomocą perceptronu wielowarstwowego (ang. Multi-Layer Perceptron - MLP):

In [11]:
# Initialize a Multi-layer Perceptron classifier.
mlp = MLPClassifier(hidden_layer_sizes=(5),max_iter=1000, random_state=42, shuffle=True, verbose=False)

# Train the classifier.
mlp.fit(X_train, y_train)



MLPClassifier(hidden_layer_sizes=5, max_iter=1000, random_state=42)

In [12]:
# Make predictions.
mlp_pred = mlp.predict(X_test)

# CV score
mlp_cv = cross_val_score(mlp,X_train,y_train,cv=10)



Następnie obliczono wspomniane wyżej metryki dla nowego klasyfikatora:

In [13]:
# The mean squared error (relative error).
print("Mean squared error (MSE): %.3f" % mean_squared_error(y_test, mlp_pred))

# Explained average absolute error (average error).
print("Mean absolute error (MAE): %.3f" % mean_absolute_error(y_test, mlp_pred))

# Explained variance score: 1 is perfect prediction.
print('Accuracy: %.3f' % mlp.score(X_test, y_test))

# CV Accuracy
print('CV Accuracy: %.3f' % mlp_cv.mean())

Mean squared error (MSE): 0.090
Mean absolute error (MAE): 0.081
Accuracy: 0.924
CV Accuracy: 0.920


# Wnioski

Klasyfikacja perceptronem wielowarstwowym dawała większą dokładność niż regresja logistyczna, a błąd, zarówno względny jak i bezwzględny, był dużo mniejszy. Zatem w przypadku analizowanych danych bardziej efektywna jest klasyfikacja za pomocą perceptronu wielowarstwowego.