In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [None]:
# Load dataset: every column except the last is a feature, the last is the target
dataset = pd.read_csv('Book1.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Encoding categorical features.
# Each categorical column gets its own LabelEncoder so that every fitted
# mapping (classes_) stays available for inspection / inverse_transform.
# Re-fitting one shared encoder would discard the mapping of the first column.
feature_encoders = {}
for categorical_col in (1, 8):
    feature_encoders[categorical_col] = LabelEncoder()
    X[:, categorical_col] = feature_encoders[categorical_col].fit_transform(X[:, categorical_col])

# Backward-compatible name: still refers to the encoder fitted on column 8
labelencoder_X = feature_encoders[8]

# Encoding the target variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Handling missing values: mean-impute feature columns 1..10 (column 0 and any
# column from index 11 onwards are left untouched by this slice).
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed later, so test rows contribute to the imputed
# means. Consider fitting it on the training portion only.
# NOTE(review): columns 1 and 8 are label-encoded above, before imputation;
# how missing entries in those columns are handled should be confirmed.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:11] = imputer.fit_transform(X[:, 1:11])

In [None]:
# List of classifiers
#
# Maps a display name to an unfitted estimator. Every estimator that has a
# source of randomness is given an explicit seed so that repeated runs of the
# notebook produce the same metrics; the remaining estimators (KNN, LDA,
# Logistic Regression, Naive Bayes, QDA) are left at their defaults.
SEED = 0

classifiers = {
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Gradient Boost": GradientBoostingClassifier(random_state=SEED),
    "KNN": KNeighborsClassifier(),
    # With probability=True the SVC runs an internal, shuffled cross-validation
    # for probability calibration, hence the seed.
    "Kernel SVM": SVC(kernel='rbf', probability=True, random_state=SEED),
    "LDA": LinearDiscriminantAnalysis(),
    "Logistic Regression": LogisticRegression(),
    "LGBM": LGBMClassifier(verbose=-1, random_state=SEED),
    "Linear SVM": SVC(kernel='linear', probability=True, random_state=SEED),
    "MLP Classifier": MLPClassifier(max_iter=1000, random_state=SEED),
    "Naive Bayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    # NOTE(review): use_label_encoder appears to be deprecated/ignored in
    # recent XGBoost releases - confirm against the installed version.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED)
}

In [None]:
# Run experiments
#
# For every train/test split ratio, scale the features, fit each classifier on
# the training part and record accuracy / precision / recall / F1 on the test
# part.
#
# The split ratios and the results accumulator are defined here, before the
# loop that uses them, so the notebook works under "Restart Kernel -> Run All"
# and so that re-running this cell starts from an empty list instead of
# appending duplicate rows.

# Splitting criteria (fraction of the data held out for testing)
splits = [0.2, 0.3, 0.4]
# One row per (classifier, split): [name, "train:test", acc, precision, recall, f1]
results = []

for test_size in splits:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    # Feature scaling: the scaler is fitted on the training part only and then
    # applied unchanged to the test part.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Human-readable "train:test" label in percent. round() is used instead of
    # int() because products such as 0.29 * 100 evaluate to 28.999999999999996,
    # which int() would truncate to 28.
    test_pct = round(test_size * 100)
    split_label = f"{100 - test_pct}:{test_pct}"

    for name, clf in classifiers.items():
        # fit() retrains the estimator from scratch for this split
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Weighted averages account for class imbalance.
        # NOTE(review): zero_division=1 scores an undefined precision/recall
        # (e.g. a class that is never predicted) as 1 - confirm this is intended.
        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average='weighted', zero_division=1)
        rec = recall_score(y_test, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

        results.append([name, split_label, acc, pre, rec, f1])


In [None]:
# Build the comparison table: one row per (algorithm, split) evaluation,
# ordered by algorithm name and then by split label.
df_results = (
    pd.DataFrame(
        results,
        columns=["Algorithm", "Split", "Accuracy", "Precision", "Recall", "F1-score"],
    )
    .sort_values(by=["Algorithm", "Split"])
)

# Persist the table next to the notebook (row index is not written)
df_results.to_csv("model_comparison_results.csv", index=False)

# Show the table as the cell output
df_results


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

