In [2]:
pip install bayesian-optimization catboost

Collecting bayesian-optimization
  Obtaining dependency information for bayesian-optimization from https://files.pythonhosted.org/packages/c5/fd/5998d2f9d693b5ef2954e3d9ddb96ede395373faa5d9bcfbd7da4b945d47/bayesian_optimization-1.5.1-py3-none-any.whl.metadata
  Using cached bayesian_optimization-1.5.1-py3-none-any.whl.metadata (16 kB)
Using cached bayesian_optimization-1.5.1-py3-none-any.whl (28 kB)
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.5.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the labelled data, then separate the label column from the predictors
# ('Id' is dropped together with 'target' so it is not used as a feature).
df1 = pd.read_csv('Train Dataset .csv')
y_data = df1['target']
X_data = df1.drop(columns=['Id', 'target'])

# First split: keep 20% of the rows aside as the hold-out test set.
X, X_test, y, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=40
)

# Second split: carve 20% of the remaining rows out as a validation (cv) set.
X_train, X_cv, y_train, y_cv = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Objective function maximised by the Bayesian optimizer
def catboost_cv(depth, learning_rate, iterations, l2_leaf_reg, border_count, 
                bagging_temperature, random_strength):
    """Score one CatBoost hyperparameter candidate with 5-fold cross-validation.

    The optimizer passes every hyperparameter as a float. ``depth``,
    ``iterations``, ``l2_leaf_reg`` and ``border_count`` are truncated with
    ``int()`` before being given to the model; ``learning_rate``,
    ``bagging_temperature`` and ``random_strength`` are passed through as-is.

    The model is evaluated on the notebook-level ``X_train`` / ``y_train``
    using a shuffled, seeded ``StratifiedKFold`` with 5 splits.

    Returns:
        The mean accuracy over the 5 folds (the scoring metric is
        ``'accuracy'``, not log loss).
    """
    # Hyperparameters that are truncated to whole numbers
    integer_params = {
        'depth': int(depth),
        'iterations': int(iterations),
        'l2_leaf_reg': int(l2_leaf_reg),
        'border_count': int(border_count),
    }
    # Hyperparameters used exactly as proposed by the optimizer
    continuous_params = {
        'learning_rate': learning_rate,
        'bagging_temperature': bagging_temperature,
        'random_strength': random_strength,
    }

    # Fixed settings (silent training, seed, positive-class weight) plus the candidate values
    candidate = CatBoostClassifier(
        verbose=False,
        random_state=42,
        scale_pos_weight=5,
        **integer_params,
        **continuous_params
    )

    # Seeded folds so every candidate is scored on the same partitions
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(
        candidate, X_train, y_train, cv=folds, scoring='accuracy'
    )

    return fold_accuracy.mean()

# Search space: (lower, upper) bound for each argument of catboost_cv
param_bounds = dict(
    depth=(4, 10),
    learning_rate=(0.01, 0.3),
    iterations=(10, 1000),
    l2_leaf_reg=(1, 10),
    border_count=(1, 255),
    bagging_temperature=(0.0, 1.0),
    random_strength=(0.0, 1.0),
)

# Seeded optimizer that maximises the cross-validated score returned by catboost_cv
optimizer = BayesianOptimization(
    f=catboost_cv, pbounds=param_bounds, random_state=42, verbose=2
)

# 10 randomly sampled starting points, followed by 30 guided optimization steps
optimizer.maximize(init_points=10, n_iter=30)

# Best point found so far; the optimizer reports floats, so the parameters that
# catboost_cv truncates are truncated here in the same way.
best_params = optimizer.max['params']
for int_key in ('depth', 'iterations', 'l2_leaf_reg', 'border_count'):
    best_params[int_key] = int(best_params[int_key])

print("Best Parameters:", best_params)


|   iter    |  target   | baggin... | border... |   depth   | iterat... | l2_lea... | learni... | random... |
-------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.811    [39m | [39m0.3745   [39m | [39m242.5    [39m | [39m8.392    [39m | [39m602.7    [39m | [39m2.404    [39m | [39m0.05524  [39m | [39m0.05808  [39m |
| [35m2        [39m | [35m0.814    [39m | [35m0.8662   [39m | [35m153.7    [39m | [35m8.248    [39m | [35m30.38    [39m | [35m9.729    [39m | [35m0.2514   [39m | [35m0.2123   [39m |
| [35m3        [39m | [35m0.816    [39m | [35m0.1818   [39m | [35m47.58    [39m | [35m5.825    [39m | [35m529.5    [39m | [35m4.888    [39m | [35m0.09446  [39m | [35m0.6119   [39m |
| [39m4        [39m | [39m0.8138   [39m | [39m0.1395   [39m | [39m75.2     [39m | [39m6.198    [39m | [39m461.5    [39m | [39m8.067    [39m | [39m0.06791  [39m | [

In [6]:
# Rebuild the classifier with the tuned values; the fixed settings
# (silent training, seed, positive-class weight) match those used in catboost_cv.
tuned_names = (
    'depth',
    'learning_rate',
    'iterations',
    'l2_leaf_reg',
    'border_count',
    'bagging_temperature',
    'random_strength',
)
tuned_values = {name: best_params[name] for name in tuned_names}

best_model = CatBoostClassifier(
    verbose=False,
    random_state=42,
    scale_pos_weight=5,
    **tuned_values
)

In [14]:
# Train the tuned model on the training split
best_model.fit(X_train, y_train)

# Per-class probabilities on the hold-out split; the predicted label is the
# column index with the highest probability.
y_prob_ = best_model.predict_proba(X_test)
y_pred_ = y_prob_.argmax(axis=1)

# Report each metric as soon as it is computed
accuracy_ = accuracy_score(y_test, y_pred_)
print(f'Accuracy: {accuracy_ * 100:.2f}%')

conf_matrix_ = confusion_matrix(y_test, y_pred_)
print('Confusion Matrix:\n', conf_matrix_)

classification_rep_ = classification_report(y_test, y_pred_)
print('Classification Report:\n', classification_rep_)



Accuracy: 81.04%
Confusion Matrix:
 [[  21  260]
 [  17 1163]]
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.07      0.13       281
           1       0.82      0.99      0.89      1180

    accuracy                           0.81      1461
   macro avg       0.68      0.53      0.51      1461
weighted avg       0.77      0.81      0.75      1461



In [24]:
# Probability cut-off above which a row is labelled 1 (instead of the argmax rule)
positive_threshold = 0.70

# Second probability column of predict_proba for every validation row
y_prob = best_model.predict_proba(X_cv)[:, 1]
y_pred = np.where(y_prob > positive_threshold, 1, 0)

# Report each metric as soon as it is computed
accuracy = accuracy_score(y_cv, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

conf_matrix = confusion_matrix(y_cv, y_pred)
print('Confusion Matrix:\n', conf_matrix)

classification_rep = classification_report(y_cv, y_pred)
print('Classification Report:\n', classification_rep)

Accuracy: 81.35%
Confusion Matrix:
 [[ 57 155]
 [ 63 894]]
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.27      0.34       212
           1       0.85      0.93      0.89       957

    accuracy                           0.81      1169
   macro avg       0.66      0.60      0.62      1169
weighted avg       0.78      0.81      0.79      1169



In [20]:
# Load the unlabeled competition data and capitalise the 'age' / 'sex' column names
# (presumably to match the training columns — TODO confirm against 'Train Dataset .csv').
df3 = pd.read_csv('Test Dataset.csv').rename(columns={'age': 'Age', 'sex': 'Sex'})

# The 'id' column is an identifier, not a feature
X_test_real = df3.drop(columns=['id'])

# Per-class probabilities, then the index of the most probable class as the label.
# NOTE(review): labels here come from argmax, whereas the X_cv evaluation cell
# thresholds the class-1 probability at 0.70 — confirm which rule is intended.
y_pred_real = best_model.predict_proba(X_test_real)
y_pred_1 = y_pred_real.argmax(axis=1)

In [21]:
# Pair each test-set id with its predicted label and write the submission file
# without the DataFrame index.
submission_columns = {'ID': df3['id'], 'target': y_pred_1}
Submission = pd.DataFrame(submission_columns)
Submission.to_csv('DSNSubmission_12.csv', index=False)