In [3]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

def predict_parking_purchase_probability_catboost(filepath, client_id_col='client_id', target_col='target', test_size=0.2, random_state=42):
    try:
        df = pd.read_csv(filepath)

        if client_id_col not in df.columns or target_col not in df.columns:
            print(f"Error: Columns '{client_id_col}' and '{target_col}' not found in '{filepath}'.")
            return None, None, None

        # Separate features and target
        X = df.drop(columns=[client_id_col, target_col])
        y = df[target_col]

        # Identify categorical features and handle potential numerical values in categorical columns.
        categorical_features = X.select_dtypes(include=['object', 'category']).columns
        for col in categorical_features:
            #Convert to string if not already string, handle NaNs, and ensure no numeric values
            if X[col].dtype != 'object':
                X[col] = X[col].astype(str)
            X[col] = X[col].fillna('Unknown')
            #If any values are numbers after above changes, force them to strings.
            X[col] = X[col].apply(lambda x: str(x) if isinstance(x, (int, float)) else x )


        # Get indices of categorical features for CatBoost
        categorical_features_indices = np.where(X.dtypes == 'object')[0]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        # Initialize and train the CatBoost model
        model = CatBoostClassifier(iterations=2934,
                                   learning_rate=0.1,
                                   loss_function='Logloss',
                                   random_seed=random_state,
                                   verbose=100,
                                   early_stopping_rounds=2934)

        model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_test, y_test))

        # Make predictions (probabilities)
        probabilities = model.predict_proba(X_test)[:, 1]

        # Evaluate the model
        accuracy = accuracy_score(y_test, model.predict(X_test))
        roc_auc = roc_auc_score(y_test, probabilities)
        report = classification_report(y_test, model.predict(X_test))

        evaluation_metrics = {
            "accuracy": accuracy,
            "roc_auc": roc_auc,
            "classification_report": report
        }

        return model, probabilities, evaluation_metrics

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        return None, None, None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None, None, None

# Example Usage (Remember to replace 'your_file.csv' with your actual file path)
filepath = 'valid_data_clients.csv'
model, probabilities, evaluation_metrics = predict_parking_purchase_probability_catboost(filepath)

if model:
    print("Model trained successfully.")
    print("Evaluation Metrics:", evaluation_metrics)
    print("Probabilities:", probabilities)

  df = pd.read_csv(filepath)


0:	learn: 0.5005684	test: 0.5017623	best: 0.5017623 (0)	total: 2.85s	remaining: 2h 19m 14s
100:	learn: 0.0338261	test: 0.0744067	best: 0.0725854 (24)	total: 2m 55s	remaining: 1h 22m 15s
200:	learn: 0.0160508	test: 0.0758867	best: 0.0725854 (24)	total: 6m 39s	remaining: 1h 30m 25s
300:	learn: 0.0099111	test: 0.0803244	best: 0.0725854 (24)	total: 9m 53s	remaining: 1h 26m 33s
400:	learn: 0.0047676	test: 0.0824088	best: 0.0725854 (24)	total: 13m 4s	remaining: 1h 22m 33s
500:	learn: 0.0032023	test: 0.0852146	best: 0.0725854 (24)	total: 16m 37s	remaining: 1h 20m 41s
600:	learn: 0.0024371	test: 0.0874703	best: 0.0725854 (24)	total: 19m 49s	remaining: 1h 16m 59s
700:	learn: 0.0019952	test: 0.0895126	best: 0.0725854 (24)	total: 22m 57s	remaining: 1h 13m 9s
800:	learn: 0.0016578	test: 0.0912989	best: 0.0725854 (24)	total: 26m 22s	remaining: 1h 10m 12s
900:	learn: 0.0014202	test: 0.0929786	best: 0.0725854 (24)	total: 29m 32s	remaining: 1h 6m 38s
1000:	learn: 0.0012608	test: 0.0946140	best: 0.0725

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model trained successfully.
Evaluation Metrics: {'accuracy': 0.9863713798977853, 'roc_auc': 0.5363773747841106, 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99       579\n           1       0.00      0.00      0.00         8\n\n    accuracy                           0.99       587\n   macro avg       0.49      0.50      0.50       587\nweighted avg       0.97      0.99      0.98       587\n'}
Probabilities: [0.01422786 0.01157046 0.01400227 0.01325636 0.01325636 0.01325636
 0.01146399 0.01379043 0.01325636 0.01325636 0.01325636 0.01325636
 0.05448642 0.01325636 0.01325636 0.01325636 0.01325636 0.01325636
 0.01931597 0.01422786 0.01325636 0.01575194 0.01325636 0.01585191
 0.01325636 0.01375559 0.01325636 0.01936911 0.01277656 0.01325636
 0.01325636 0.01367126 0.01325636 0.01151525 0.01455407 0.01325636
 0.01325636 0.01325636 0.01585191 0.01422786 0.01538146 0.01498338
 0.01325636 0.01325636 0.01325636 0.01325