# In [None]:
# Draw a reproducible random 10% subsample of `df` so the experiments below
# run on a smaller dataset.
# NOTE(review): the sample is not stratified by 'Class'; if the classes are
# imbalanced, confirm that the subsample still contains enough of each class.
sample_fraction = 0.1
df_sampled = df.sample(
    frac=sample_fraction,
    random_state=42,  # fixed seed: the same rows are drawn on every run
)

# Split the subsample into the 'Class' target and the remaining feature columns.
y_sampled = df_sampled['Class']
X_sampled = df_sampled.drop(columns='Class')

# Hyperparameter tuning on the sampled data (X_sampled, y_sampled)
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search spaces, keyed by model name. Each inner dict maps an
# estimator parameter name to the list of candidate values to try; the tuning
# loop looks an entry up by the model's name.
param_dists = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        # Both solvers support the 'l1' and 'l2' penalties listed above.
        'solver': ['liblinear', 'saga'],
    },
    "Decision Tree": {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    "Naive Bayes": {
        # Usually has few hyperparameters to tune
        'var_smoothing': [1e-9, 1e-8, 1e-7],
    },
    "XGBoost": {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
    },
    "Neural Network": {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
    },
    "Adaboost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0],
        # 'algorithm' is intentionally not tuned: 'SAMME.R' was deprecated in
        # scikit-learn 1.4 and removed in 1.6, and the parameter itself is
        # deprecated from 1.6 on, so listing it makes candidates warn or fail
        # on current releases. The estimator's own default is used instead.
    },
    "Random Forest": {  # Already defined earlier
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
    },
}


# Tune every model in `models` on the sampled data and report the best
# hyperparameters found, using F1 as the selection metric and `skf` as the
# cross-validation splitter. The models named in the first branch are searched
# exhaustively with GridSearchCV; all others use RandomizedSearchCV with at
# most 50 sampled candidates.
best_params_by_model = {}  # model name -> best hyperparameters found

for model_name, model in models.items():
    print(f"Optimizing hyperparameters for {model_name}")

    # Get the hyperparameter search space for the current model; skip models
    # without one instead of aborting the whole loop with a KeyError.
    param_dist = param_dists.get(model_name)
    if param_dist is None:
        print("  No hyperparameter search space defined; skipping")
        continue

    if model_name in ["Logistic Regression", "Decision Tree", "Naive Bayes", "Adaboost"]:
        # Exhaustive search: every combination in param_dist is evaluated.
        grid_search = GridSearchCV(model, param_grid=param_dist, cv=skf, scoring='f1', n_jobs=-1)
        grid_search.fit(X_sampled, y_sampled)
        best_params = grid_search.best_params_
    else:
        # Randomized search. When every parameter is a plain list of values the
        # space is finite, so cap n_iter at its size: asking for more candidates
        # than exist only makes scikit-learn warn and fall back to the full grid.
        n_iter = 50
        if all(not hasattr(values, "rvs") for values in param_dist.values()):
            n_candidates = 1
            for values in param_dist.values():
                n_candidates *= len(values)
            n_iter = min(n_iter, n_candidates)

        random_search = RandomizedSearchCV(
            model,
            param_distributions=param_dist,
            n_iter=n_iter,
            cv=skf,
            scoring='f1',
            n_jobs=-1,
            random_state=42,  # fixed seed so the sampled candidates are reproducible
        )
        random_search.fit(X_sampled, y_sampled)
        best_params = random_search.best_params_

    # Keep the result so it is still available after the loop, not only printed.
    best_params_by_model[model_name] = best_params
    print(f"  Best hyperparameters: {best_params}")
