In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Generate dataset
n = 40000
a = np.random.uniform(-10, 10, n)
b = np.random.uniform(-5, 15, n)
c = np.random.uniform(-20, 5, n)
discriminant = b**2 - 4*a*c
# if discriminant is negative, the equation has no real roots
d = (discriminant < 0).astype(int)

# Ensure class balance (20%-40% ones)
ones_ratio = np.mean(d)
target_min, target_max = 0.2, 0.4
if ones_ratio < target_min or ones_ratio > target_max:
    zero_indices = np.where(d == 0)[0]
    one_indices = np.where(d == 1)[0]
    target_ones = int(n * np.random.uniform(target_min, target_max))

    if len(one_indices) > target_ones:
        drop_ones = np.random.choice(one_indices, len(one_indices) - target_ones, replace=False)
        d[drop_ones] = 0
    elif len(one_indices) < target_ones:
        add_ones = np.random.choice(zero_indices, target_ones - len(one_indices), replace=False)
        d[add_ones] = 1

# Convert to DataFrame
data = pd.DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
data.head()

Unnamed: 0,a,b,c,d
0,-5.635385,-0.245113,-5.759791,1
1,-5.135634,11.363122,2.552492,0
2,4.475013,-3.697097,1.828806,1
3,-3.722604,11.112576,-2.963583,0
4,-4.409597,-2.151576,-9.005747,1


In [10]:
data.shape

(40000, 4)

In [None]:
# Split into train and test sets
X = data[['a', 'b', 'c']]
y = data['d']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Use Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Standardize features for models that require it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=1000, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

# Training and evaluation
for name, model in models.items():
    accuracies = []
    print(f"Cross-Validating {name}...")
    for train_index, val_index in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        X_train_scaled_fold, X_val_scaled_fold = X_train_scaled[train_index], X_train_scaled[val_index]
        
        if name in ["Neural Network", "Logistic Regression"]:
            model.fit(X_train_scaled_fold, y_train_fold)
            y_pred = model.predict(X_val_scaled_fold)
        else:
            model.fit(X_train_fold, y_train_fold)
            y_pred = model.predict(X_val_fold)
        
        acc = accuracy_score(y_val_fold, y_pred)
        accuracies.append(acc)

    print(f"{name} CV Accuracy: {np.mean(accuracies):.4f}")    
    print(f"Testing {name}...")
    if name in ["Neural Network", "Logistic Regression"]:  
        y_pred = model.predict(X_test_scaled)
    else:
        y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Test Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

Cross-Validating Decision Tree...
Decision Tree CV Accuracy: 0.9827
Testing Decision Tree...
Decision Tree Test Accuracy: 0.9826
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5070
           1       0.97      0.98      0.98      2930

    accuracy                           0.98      8000
   macro avg       0.98      0.98      0.98      8000
weighted avg       0.98      0.98      0.98      8000

--------------------------------------------------
Cross-Validating Neural Network...
Neural Network CV Accuracy: 0.9962
Testing Neural Network...
Neural Network Test Accuracy: 0.9954
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5070
           1       1.00      0.99      0.99      2930

    accuracy                           1.00      8000
   macro avg       1.00      0.99      1.00      8000
weighted avg       1.00      1.00      1.00      8000

---------------------------------