# Import packages

In [1]:
pip install c45-decision-tree

Collecting c45-decision-tree
  Downloading c45_decision_tree-1.0.2-py3-none-any.whl (5.8 kB)
Installing collected packages: c45-decision-tree
Successfully installed c45-decision-tree-1.0.2


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from C45 import C45Classifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [3]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
# NOTE(review): the CSV files read in this notebook are loaded from /content/...,
# not from /content/drive/..., so this mount may be unnecessary — confirm before
# removing it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import the data

In [5]:

# Read glass1.csv from the Colab working directory
data = pd.read_csv("/content/glass1.csv")

# The last column is the label; every other column is a feature
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Check the data

In [6]:
# Display summary statistics of the dataset
print("\nSummary statistics of the dataset:")
print(data.describe())

# Display the structure (info) of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() would append a stray "None" line.
print("\nStructure of the dataset:")
data.info()

# Display the number of missing values in each column
print("\nNumber of missing values in each column:")
print(data.isnull().sum())

# Display the column names of the dataset
print("\nColumn names of the dataset:")
print(data.columns)

# Display the data types of each column
print("\nData types of each column:")
print(data.dtypes)

# Display the number of unique values in each column
print("\nNumber of unique values in each column:")
print(data.nunique())

# Display the correlation matrix
print("\nCorrelation matrix:")
print(data.corr())


Summary statistics of the dataset:
               RI          Na          Mg          Al          Si           K  \
count  214.000000  214.000000  214.000000  214.000000  214.000000  214.000000   
mean     1.518365   13.407850    2.684533    1.444907   72.650935    0.497056   
std      0.003037    0.816604    1.442408    0.499270    0.774546    0.652192   
min      1.511150   10.730000    0.000000    0.290000   69.810000    0.000000   
25%      1.516522   12.907500    2.115000    1.190000   72.280000    0.122500   
50%      1.517680   13.300000    3.480000    1.360000   72.790000    0.555000   
75%      1.519157   13.825000    3.600000    1.630000   73.087500    0.610000   
max      1.533930   17.380000    4.490000    3.500000   75.410000    6.210000   

               Ca          Ba          Fe        Type  
count  214.000000  214.000000  214.000000  214.000000  
mean     8.956963    0.175047    0.057009    2.542056  
std      1.423153    0.497219    0.097439    1.707648  
min      5

In [None]:
# Preview the first five rows (head() returns five rows by default).
# NOTE(review): the saved output of this cell lists columns x/o/positive rather
# than the glass columns in the summary statistics, so `data` apparently held a
# different file when this cell last ran — confirm which dataset is intended.
print(data.head())

   x  x.1  x.2  x.3  o  o.1  x.4  o.2  o.3  positive
0  1    1    1    1  0    0    0    1    0         1
1  1    1    1    1  0    0    0    0    1         1
2  1    1    1    1  0    0    0   -1   -1         1
3  1    1    1    1  0    0   -1    0   -1         1
4  1    1    1    1  0    0   -1   -1    0         1


# Run the selected baselines

In [16]:

# Seed used for the split below; run_random_forest and run_cart also read it
random_state = 42

# Features are every column but the last; the last column is the label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)


# RandomForest
def run_random_forest(X_trainrf, X_testrf, y_trainrf, y_testrf):
    """Fit a default RandomForestClassifier and score it on the test split.

    The forest is seeded with the notebook-level ``random_state`` variable.

    Args:
        X_trainrf: Training features.
        X_testrf: Test features.
        y_trainrf: Training labels.
        y_testrf: Test labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are macro-averaged over the classes.
    """
    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X_trainrf, y_trainrf)
    y_predrf = rf.predict(X_testrf)

    # zero_division=0 yields the same value as the default "warn" policy (a class
    # with an undefined metric scores 0) but without the UndefinedMetricWarning
    # that otherwise shows up in the notebook output.
    macro_kwargs = {'average': 'macro', 'zero_division': 0}

    accuracyrf = accuracy_score(y_testrf, y_predrf)
    precisionrf = precision_score(y_testrf, y_predrf, **macro_kwargs)
    recallrf = recall_score(y_testrf, y_predrf, **macro_kwargs)
    f1rf = f1_score(y_testrf, y_predrf, **macro_kwargs)
    return accuracyrf, precisionrf, recallrf, f1rf

# CART (Decision Tree)
def run_cart(X_train, X_test, y_train, y_test):
    """Fit a default DecisionTreeClassifier (CART) and score it on the test split.

    The tree is seeded with the notebook-level ``random_state`` variable.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are macro-averaged over the classes.
    """
    cart = DecisionTreeClassifier(random_state=random_state)
    cart.fit(X_train, y_train)
    y_pred = cart.predict(X_test)

    # zero_division=0 yields the same value as the default "warn" policy (a class
    # with an undefined metric scores 0) but without the UndefinedMetricWarning
    # that otherwise shows up in the notebook output.
    macro_kwargs = {'average': 'macro', 'zero_division': 0}

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, **macro_kwargs)
    recall = recall_score(y_test, y_pred, **macro_kwargs)
    f1 = f1_score(y_test, y_pred, **macro_kwargs)
    return accuracy, precision, recall, f1

# C4.5
def run_c45(X_trainca, X_testca, y_trainca, y_testca):
    """Fit a C45Classifier with default settings and score it on the test split.

    Args:
        X_trainca: Training features.
        X_testca: Test features.
        y_trainca: Training labels.
        y_testca: Test labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are macro-averaged over the classes.
    """
    modelca = C45Classifier()
    modelca.fit(X_trainca, y_trainca)
    y_predca = modelca.predict(X_testca)

    # zero_division=0 yields the same value as the default "warn" policy (a class
    # with an undefined metric scores 0) but without the UndefinedMetricWarning
    # that otherwise shows up in the notebook output.
    macro_kwargs = {'average': 'macro', 'zero_division': 0}

    accuracyca = accuracy_score(y_testca, y_predca)
    precisionca = precision_score(y_testca, y_predca, **macro_kwargs)
    recallca = recall_score(y_testca, y_predca, **macro_kwargs)
    f1ca = f1_score(y_testca, y_predca, **macro_kwargs)
    return accuracyca, precisionca, recallca, f1ca



# Printing results

In [10]:
# Random Forest baseline: evaluate, then report the four metrics one per line
rf_accuracy, rf_precision, rf_recall, rf_f1 = run_random_forest(X_train, X_test, y_train, y_test)
print(
    f"Random Forest accuracy: {rf_accuracy}",
    f"Random Forest precision: {rf_precision}",
    f"Random Forest recall: {rf_recall}",
    f"Random Forest F1 score: {rf_f1}",
    sep="\n",
)

# CART baseline
cart_accuracy, cart_precision, cart_recall, cart_f1 = run_cart(X_train, X_test, y_train, y_test)
print(
    f"CART accuracy: {cart_accuracy}",
    f"CART precision: {cart_precision}",
    f"CART recall: {cart_recall}",
    f"CART F1 score: {cart_f1}",
    sep="\n",
)

# C4.5 baseline
c45_accuracy, c45_precision, c45_recall, c45_f1 = run_c45(X_train, X_test, y_train, y_test)
print(
    f"C4.5 accuracy: {c45_accuracy}",
    f"C4.5 precision: {c45_precision}",
    f"C4.5 recall: {c45_recall}",
    f"C4.5 F1 score: {c45_f1}",
    sep="\n",
)

Random Forest accuracy: 0.8372093023255814
Random Forest precision: 0.9127314814814816
Random Forest recall: 0.8432539682539683
Random Forest F1 score: 0.8605223570909845
CART accuracy: 0.7209302325581395
CART precision: 0.7232563732563732
CART recall: 0.720959595959596
CART F1 score: 0.6974183006535948
C4.5 accuracy: 0.3488372093023256
C4.5 precision: 0.26071428571428573
C4.5 recall: 0.2573051948051948
C4.5 F1 score: 0.21003898635477583


  _warn_prf(average, modifier, msg_start, len(result))


# Hyperparameter tuning (not used)

In [None]:
from sklearn.model_selection import GridSearchCV

# RandomForest with GridSearchCV
def run_random_forest_cv(X_train, y_train, random_state=42):
    """Grid-search RandomForestClassifier hyperparameters with 5-fold CV.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed passed to the forest so that repeated searches pick
            the same winner. Without it every run grows different trees and
            the selected parameters can change from run to run.

    Returns:
        Tuple ``(best_estimator, best_params)`` taken from the fitted
        GridSearchCV, selected by accuracy.
    """
    rf = RandomForestClassifier(random_state=random_state)
    param_grid_rf = {
        'n_estimators': [10, 50, 100],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }
    # n_jobs=-1 runs the candidate fits on all available cores
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search_rf.fit(X_train, y_train)
    return grid_search_rf.best_estimator_, grid_search_rf.best_params_

# CART with GridSearchCV
def run_cart_cv(X_train, y_train, random_state=42):
    """Grid-search DecisionTreeClassifier (CART) hyperparameters with 5-fold CV.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed passed to the tree. The grid only uses
            ``max_features`` values that subsample features at each split, so
            an unseeded tree gives different scores (and possibly a different
            winner) on every run.

    Returns:
        Tuple ``(best_estimator, best_params)`` taken from the fitted
        GridSearchCV, selected by accuracy.
    """
    cart = DecisionTreeClassifier(random_state=random_state)
    param_grid_cart = {
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'criterion': ['gini', 'entropy']
    }
    # n_jobs=-1 runs the candidate fits on all available cores
    grid_search_cart = GridSearchCV(estimator=cart, param_grid=param_grid_cart, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search_cart.fit(X_train, y_train)
    return grid_search_cart.best_estimator_, grid_search_cart.best_params_

# Main execution code
from sklearn.model_selection import cross_validate


def evaluate_model_cv(model, X, y, cv=5):
    """Return mean cross-validated accuracy, precision, recall and F1 for ``model``.

    NOTE(review): this helper is called below but is not defined in any visible
    cell, so the cell fails with NameError on a fresh kernel. This is a
    reconstruction; the original implementation is not available. The default
    ``cv=5`` was chosen because the accuracies in the saved output are
    consistent with 5-fold CV — confirm against the original if it resurfaces.

    Args:
        model: Estimator to evaluate; cross_validate fits a clone on each fold.
        X: Feature table.
        y: Labels.
        cv: Number of folds (or any cross-validation splitter).

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` of per-fold means.
        Precision, recall and F1 are macro-averaged.
    """
    metrics = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')
    cv_scores = cross_validate(model, X, y, cv=cv, scoring=metrics)
    # With several scorers, cross_validate stores each under the key "test_<name>"
    return tuple(cv_scores[f'test_{metric}'].mean() for metric in metrics)


y = data.iloc[:, -1]
X = data.iloc[:, :-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest: tune on the training split, then cross-validate the winner on all rows
best_rf, best_rf_params = run_random_forest_cv(X_train, y_train)
rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model_cv(best_rf, X, y)
print(f"Random Forest best parameters: {best_rf_params}")
print(f"Random Forest accuracy: {rf_accuracy}")
print(f"Random Forest precision: {rf_precision}")
print(f"Random Forest recall: {rf_recall}")
print(f"Random Forest F1 score: {rf_f1}")

# CART: same procedure
best_cart, best_cart_params = run_cart_cv(X_train, y_train)
cart_accuracy, cart_precision, cart_recall, cart_f1 = evaluate_model_cv(best_cart, X, y)
print(f"CART best parameters: {best_cart_params}")
print(f"CART accuracy: {cart_accuracy}")
print(f"CART precision: {cart_precision}")
print(f"CART recall: {cart_recall}")
print(f"CART F1 score: {cart_f1}")



  _warn_prf(average, modifier, msg_start, len(result))


Random Forest best parameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest accuracy: 0.6918050941306755
Random Forest precision: 0.7370695520223384
Random Forest recall: 0.646984126984127
Random Forest F1 score: 0.6677360036636888
CART best parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5}
CART accuracy: 0.5984496124031008
CART precision: 0.5688765795989739
CART recall: 0.5793253968253967
CART F1 score: 0.4835576144356632


  _warn_prf(average, modifier, msg_start, len(result))


# AUC ROC Curve

In [14]:
data = pd.read_csv("/content/modified_cell_samples.csv")
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Decision Tree (CART) classifier; seeded so the reported scores
# are repeatable across runs
clf = DecisionTreeClassifier(random_state=42)

# Fit the model on the training data
clf.fit(X_train, y_train)

# Column j of predict_proba holds the probability of the label clf.classes_[j]
y_proba = clf.predict_proba(X_test)

# For multiclass classification, use the one-vs-rest approach.
# Compare y_test against the actual class label rather than the column index:
# the two only coincide when the labels happen to be 0..k-1, and with any other
# labelling `y_test == index` would build the wrong (or an all-False) target.
auc_roc_scores = []
for column, label in enumerate(clf.classes_):
    auc_score = roc_auc_score(y_test == label, y_proba[:, column])
    auc_roc_scores.append((label, auc_score))

# Print the AUC-ROC score for each class
for class_idx, score in auc_roc_scores:
    print(f'Class {class_idx} AUC-ROC Curve Score: {score:.2f}')

Class 0 AUC-ROC Curve Score: 0.94
Class 1 AUC-ROC Curve Score: 0.94
