In [22]:
# Necessary imports
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [5]:
def load_data():
    # Define the column names
    column_names = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health', 'class']

    # Load the data and add column names
    data = pd.read_csv('data/nursery.data', delimiter=',', names=column_names)

    return data

In [6]:
data = load_data()

# Print the shape of the data
print("Data Shape: ", data.shape)

# Print the first 10 rows
print(data.head(10).to_string(), "\n")

# Get information about the data
print(data.describe().to_string(), "\n")

# Check if there are missing values
print(data.isnull().sum())

Data Shape:  (12960, 9)
  parents has_nurs      form children     housing     finance         social       health       class
0   usual   proper  complete        1  convenient  convenient        nonprob  recommended   recommend
1   usual   proper  complete        1  convenient  convenient        nonprob     priority    priority
2   usual   proper  complete        1  convenient  convenient        nonprob    not_recom   not_recom
3   usual   proper  complete        1  convenient  convenient  slightly_prob  recommended   recommend
4   usual   proper  complete        1  convenient  convenient  slightly_prob     priority    priority
5   usual   proper  complete        1  convenient  convenient  slightly_prob    not_recom   not_recom
6   usual   proper  complete        1  convenient  convenient    problematic  recommended    priority
7   usual   proper  complete        1  convenient  convenient    problematic     priority    priority
8   usual   proper  complete        1  convenient  conveni

In [26]:
def split_data(data):
    # Separate the features and target variable before encoding
    X = data.drop('class', axis=1)
    y = data['class']

    X_encoded = pd.get_dummies(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = tts(X_encoded, y, test_size=0.2, random_state=13)

    return X_train, X_test, y_train, y_test, X_encoded, y

In [17]:
def training(criteria, X_train, y_train):
    # Train the model with criteria
    model = DecisionTreeClassifier(criterion=criteria, random_state=13, max_depth=3, min_samples_leaf=3)

    # Fit the model
    model.fit(X_train, y_train)

    return model

In [20]:
def predict(model, X_test):
    # Predict the target variable
    y_pred = model.predict(X_test)

    return y_pred

In [21]:
def evaluate(y_test, y_pred):
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    return accuracy, confusion, classification_rep

In [27]:
def main():
    # Split the data
    X_train, X_test, y_train, y_test, X_encoded, y = split_data(data)

    # Train the model with gini
    gini_model = training("gini", X_train, y_train)

    # Train the model with entropy
    entropy_model = training("entropy", X_train, y_train)

    # Prediction and evaluation
    y_pred_gini = predict(gini_model, X_test)
    accuracy_gini, confusion_gini, classification_gini = evaluate(y_test, y_pred_gini)

    # Prediction and evaluation
    y_pred_entropy = predict(entropy_model, X_test)
    accuracy_entropy, confusion_entropy, classification_entropy = evaluate(y_test, y_pred_entropy)

    # Prints
    print("Gini Index")
    print("Prediction: ", y_pred_gini)
    print("Confusion Matrix: \n", confusion_gini)
    print("Accuracy: ", accuracy_gini)
    print("Classification Report: \n", classification_gini)

    # Prints
    print("Entropy")
    print("Prediction: ", y_pred_entropy)
    print("Confusion Matrix: \n", confusion_entropy)
    print("Accuracy: ", accuracy_entropy)
    print("Classification Report: \n", classification_entropy)

    # Grid search
    param_grid = {
        'max_depth': [3, 5, 7, 10],
        'min_samples_leaf': [3, 5, 7, 10],
    }

    grid_search_gini = GridSearchCV(gini_model, param_grid, cv=5)
    grid_search_gini.fit(X_encoded, y)

    grid_search_entropy = GridSearchCV(entropy_model, param_grid, cv=5)
    grid_search_entropy.fit(X_encoded, y)

    # Print best parameters and score
    print("Gini Index")
    print("Best parameters: ", grid_search_gini.best_params_)
    print("Best score: ", grid_search_gini.best_score_)

    # Perform cross-validation with best model
    best_model = grid_search_gini.best_estimator_
    scores = cross_val_score(best_model, X_encoded, y, cv=5)

    print("Cross-validation scores: ", scores)
    print("Average cross-validation score: ", scores.mean())

    # Print best parameters and score
    print("Entropy")
    print("Best parameters: ", grid_search_entropy.best_params_)
    print("Best score: ", grid_search_entropy.best_score_)

    # Perform cross-validation with best model
    best_model2 = grid_search_entropy.best_estimator_
    scores2 = cross_val_score(best_model2, X_encoded, y, cv=5)

    print("Cross-validation scores: ", scores2)
    print("Average cross-validation score: ", scores2.mean())

if __name__ == "__main__":
    main()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gini Index
Prediction:  ['spec_prior' 'priority' 'priority' ... 'priority' 'spec_prior'
 'not_recom']
Confusion Matrix: 
 [[825   0   0   0   0]
 [  0 739   0 133   0]
 [  0   1   0   0   0]
 [  0 267   0 557   0]
 [  0  70   0   0   0]]
Accuracy:  0.8182870370370371
Classification Report: 
               precision    recall  f1-score   support

   not_recom       1.00      1.00      1.00       825
    priority       0.69      0.85      0.76       872
   recommend       0.00      0.00      0.00         1
  spec_prior       0.81      0.68      0.74       824
  very_recom       0.00      0.00      0.00        70

    accuracy                           0.82      2592
   macro avg       0.50      0.50      0.50      2592
weighted avg       0.81      0.82      0.81      2592

Entropy
Prediction:  ['priority' 'priority' 'priority' ... 'spec_prior' 'spec_prior'
 'not_recom']
Confusion Matrix: 
 [[825   0   0   0   0]
 [  0 663   0 209   0]
 [  0   1   0   0   0]
 [  0 209   0 615   0]
 [  0  



Gini Index
Best parameters:  {'max_depth': 3, 'min_samples_leaf': 3}
Best score:  0.6900462962962963
Cross-validation scores:  [0.68171296 0.70138889 0.7349537  0.6867284  0.64544753]
Average cross-validation score:  0.6900462962962963
Entropy
Best parameters:  {'max_depth': 5, 'min_samples_leaf': 7}
Best score:  0.687577160493827




Cross-validation scores:  [0.68171296 0.78433642 0.53819444 0.77314815 0.66049383]
Average cross-validation score:  0.687577160493827
