In [3]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    train_test_split
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    matthews_corrcoef,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    make_scorer
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


# Read Data

In [2]:
data = pd.read_csv(r'telecom_data.csv')

data.drop(['CustomerID'], axis = 1, inplace = True)

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,...,LoyaltyIndicator,ContentDiversity,PremiumCostIndex,DeviceEngagement,SupportNeedIntensity,HighWatchlist,FrequentDownloader,PaymentStability,ParentalControlImpact,SubscriptionContentInteraction
0,119,15.382382,1830.503504,0,1,1,1,1,1,28.563095,...,0,0,1,1,0.05,1,1,1,0,1
1,119,16.104333,1916.415654,0,2,1,1,1,2,13.422144,...,0,0,1,1,0.0,1,1,0,1,1
2,119,15.636604,1860.755831,0,3,0,0,1,3,16.542962,...,0,1,1,1,0.0,0,1,0,1,0
3,119,7.005459,833.649646,2,2,1,1,0,3,3.147423,...,0,0,0,0,0.008333,0,1,0,1,7
4,119,14.713919,1750.956338,1,2,1,2,0,1,31.802896,...,0,0,1,0,0.075,1,1,0,0,5


# Feature Engineering

In [None]:
# 1. Monthly Engagement Ratio
data['MonthlyEngagementRatio'] = data['ViewingHoursPerWeek'] / data['MonthlyCharges']

# 2. Loyalty Indicator
data['LoyaltyIndicator'] = pd.cut(data['AccountAge'], 
                                   bins=[0, 12, 36, float('inf')], 
                                   labels=['New', 'Mid-Level', 'Loyal'])

# 3. Content Diversity
data['ContentDiversity'] = (data['ContentType'] == 'Both').astype(int)

# 4. Premium Cost Index
average_cost = data.groupby('SubscriptionType')['MonthlyCharges'].transform('mean')
data['PremiumCostIndex'] = data['MonthlyCharges'] > average_cost

# 5. Device Engagement
data['DeviceEngagement'] = ((data['DeviceRegistered'] != 'Unknown') & (data['MultiDeviceAccess'] == 'Yes')).astype(int)

# 6. Support Need Intensity
data['SupportNeedIntensity'] = data['SupportTicketsPerMonth'] / (data['AccountAge'] + 1)  # Avoid division by zero

# 7. Churn Risk Indicators
data['HighWatchlist'] = (data['WatchlistSize'] > 20).astype(int)

# 8. FrequentDownloader
data['FrequentDownloader'] = (data['ContentDownloadsPerMonth'] > data['ContentDownloadsPerMonth'].mean()).astype(int)

# 9. Payment Stability
stable_payment_methods = ['Bank transfer', 'Credit card']
data['PaymentStability'] = data['PaymentMethod'].isin(stable_payment_methods).astype(int)

# 10. Parental Control Impact
data['ParentalControlImpact'] = ((data['ParentalControl'] == 'Yes') & (data['SubtitlesEnabled'] == 'Yes')).astype(int)

# 11. Feature Interactions
data['SubscriptionContentInteraction'] = data['SubscriptionType'] + "_" + data['ContentType']

# Label Encoding

In [None]:
le = LabelEncoder()

data['SubscriptionType'] = le.fit_transform(data['SubscriptionType'])
data['PaymentMethod'] = le.fit_transform(data['PaymentMethod'])
data['PaperlessBilling'] = le.fit_transform(data['PaperlessBilling'])
data['ContentType'] = le.fit_transform(data['ContentType'])
data['MultiDeviceAccess'] = le.fit_transform(data['MultiDeviceAccess'])
data['DeviceRegistered'] = le.fit_transform(data['DeviceRegistered'])
data['GenrePreference'] = le.fit_transform(data['GenrePreference'])
data['Gender'] = le.fit_transform(data['Gender'])
data['ParentalControl'] = le.fit_transform(data['ParentalControl'])
data['SubtitlesEnabled'] = le.fit_transform(data['SubtitlesEnabled'])
data['LoyaltyIndicator'] = le.fit_transform(data['LoyaltyIndicator'])
data['PremiumCostIndex'] = le.fit_transform(data['PremiumCostIndex'])
data['SubscriptionContentInteraction'] = le.fit_transform(data['SubscriptionContentInteraction'])

data.drop(['TotalCharges', 'ContentType', 'DeviceEngagement', 'SubscriptionContentInteraction'], axis = 1, inplace = True)

data.head(5)

# DATA SPLIT

In [3]:
data_train_val = data[data['AccountAge']>24]
data_test = data[data['AccountAge'] <=24]

print('data_train_val count:', data_train_val['Churn'].count())
print('data_test count:', data_test['Churn'].count())

# Assume 'Target' is the name of the target column
X = data_train_val.drop('Churn', axis=1)
y = data_train_val['Churn']

# Step 1: Split the dataset into training (75%) and validation (25%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Step 2: Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

data_train_val count: 195001
data_test count: 48786


# Standardize

In [None]:
# Step 3: Standardize the resampled training data and the validation data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)  # Fit and transform on resampled training data
X_val_scaled = scaler.transform(X_val)  # Only transform on the validation data, don't fit again

# Logistic Regression

In [4]:
# Define the logistic regression model with higher max_iter
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Define the hyperparameter grid for GridSearchCV
param_grid = [
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced']
    },
    {
        'solver': ['lbfgs', 'newton-cg'],
        'penalty': ['l2'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced']
    },
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced'],
        'l1_ratio': [0.1, 0.5, 0.7, 1.0]  # Only for elasticnet
    }
]

# Replace CV with Stratified Cross-Validation
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Replace RandomizedSearchCV with GridSearchCV
grid_search = GridSearchCV(
    log_reg,
    param_grid,
    scoring=make_scorer(f1_score, average='binary'),
    cv=stratified_cv,
    verbose=1,
    n_jobs=-1,
    error_score='raise'
)

# Measure training start time
start_time = time.time()

# Fit the model on the resampled and scaled training data
grid_search.fit(X_train_scaled, y_train_resampled)

# Measure end time for training
end_time = time.time()

# Get the best parameters and retrain the model
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val_scaled)
y_val_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]  # For log loss and ROC-AUC

# Evaluate the model's performance on the validation set
val_metrics = {
    "Accuracy": accuracy_score(y_val, y_val_pred),
    "Precision": precision_score(y_val, y_val_pred, average='binary'),  
    "Recall": recall_score(y_val, y_val_pred, average='binary'),        
    "F1-Score": f1_score(y_val, y_val_pred, average='binary'),          
    "Log Loss": log_loss(y_val, y_val_pred_proba),
    "Matthews Correlation Coefficient": matthews_corrcoef(y_val, y_val_pred),
    "ROC-AUC": roc_auc_score(y_val, y_val_pred_proba),
}

# Print the evaluation metrics for validation set
print("\nValidation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

# Confusion matrix and classification report for validation set
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

# Calculate and print training time
train_time = end_time - start_time
print("\nTraining Time:", round(train_time, 2), "seconds")

Fitting 5 folds for each of 112 candidates, totalling 560 fits

Validation Metrics:
Accuracy: 0.7536
Precision: 0.2707
Recall: 0.3647
F1-Score: 0.3108
Log Loss: 0.5168
Matthews Correlation Coefficient: 0.1674
ROC-AUC: 0.6597

Confusion Matrix (Validation):
[[34031  7295]
 [ 4717  2708]]

Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.88      0.82      0.85     41326
           1       0.27      0.36      0.31      7425

    accuracy                           0.75     48751
   macro avg       0.57      0.59      0.58     48751
weighted avg       0.79      0.75      0.77     48751


Training Time: 291.03 seconds


# Feature Importance Logistic Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Extract feature importance for Logistic Regression
feature_importance = np.abs(log_reg.coef_[0])  # Absolute values of coefficients
feature_names = X.columns  # Feature names
importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importance})

#  Convert importance values to percentages
importance_df["Importance (%)"] = (importance_df["Importance"] / importance_df["Importance"].sum()) * 100

#  Sort features by importance in descending order
importance_df = importance_df.sort_values(by="Importance (%)", ascending=False)

#  Plot top feature importances with color gradient
top_features = importance_df.head(15)
colors = plt.cm.Blues(np.linspace(1, 0.4, len(top_features)))

plt.figure(figsize=(12, 8))  # Increased the figure size
bars = plt.barh(
    top_features["Feature"],
    top_features["Importance (%)"],
    color=colors,
    edgecolor="black",
    alpha=0.9,
)

plt.gca().invert_yaxis()  # Highest importance at the top
plt.title("Feature Importances (Logistic Regression)", fontsize=18)
plt.xlabel("Importance (%)", fontsize=14)

# Add importance values on bars (rounded to 2 decimal places)
for bar, value in zip(bars, top_features["Importance (%)"]):
    plt.text(
        bar.get_width() + 0.001,  # Adjusted to provide more spacing
        bar.get_y() + bar.get_height() / 2,
        f"{value:.1f}%",  # Rounded to 2 decimal places
        va="center",
        fontsize=12,
    )

plt.tight_layout()
plt.show()

# Coefficients - Feature Importance Values

In [None]:
# Assuming 'log_reg' is your trained Logistic Regression model and 'X' is your feature DataFrame

# Extract coefficients
coefficients = log_reg.coef_[0]  # For binary classification, take the first row
absolute_coefficients = np.abs(coefficients)  # Use absolute values for importance ranking

# Pair coefficients with feature names
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": absolute_coefficients
})

# Sort by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# (Optional) Convert to percentages
feature_importance_df["Importance (%)"] = (feature_importance_df["Importance"] / feature_importance_df["Importance"].sum()) * 100

# Display the DataFrame with feature importance values
print(feature_importance_df)


# Permutation Importance - Feature Importance Values

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer, f1_score
import pandas as pd

# Calculate Permutation Importance on the validation set using F1 score
perm_importance = permutation_importance(
    log_reg,  # Trained logistic regression model
    X_val_scaled,  # Scaled validation features
    y_val,  # Validation target labels
    n_repeats=10,
    random_state=42,
    scoring=make_scorer(f1_score, average="binary")  # Using F1 score as the scoring metric
)

# Extract feature importances and feature names
feature_importances = perm_importance.importances_mean
feature_names = X.columns

# Convert importance values to percentages
total_importance = feature_importances.sum()
feature_importance_percentages = (feature_importances / total_importance) * 100

# Create a DataFrame to display feature importances
perm_importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance (%)": feature_importance_percentages
}).sort_values(by="Importance (%)", ascending=False)

# Display the feature importance DataFrame
print("\nPermutation Importance for Features (as Percentage):")
print(perm_importance_df)

# Optionally save the results to an Excel file
output_file = r"permutation_importance_f1_score.xlsx"
perm_importance_df.to_excel(output_file, index=False)
print(f"\nPermutation importance percentages have been saved to: {output_file}")

# XGBClassifier

In [4]:
# Define the XGBoost classifier with early_stopping_rounds in the constructor
xgb_model = XGBClassifier(
    use_label_encoder=False,  # To suppress warning in newer versions
    eval_metric='logloss',  # Required by XGBoost
    random_state=42,
    early_stopping_rounds=10  # Set early_stopping_rounds here instead of in fit
)

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 150], 
    'learning_rate': [0.05, 0.1], 
    'max_depth': [4, 8],  
    'min_child_weight': [1, 3],  
    'subsample': [0.8], 
    'colsample_bytree': [0.8],  
    'gamma': [0],  
    'reg_alpha': [0], 
    'reg_lambda': [1]  
}

# Define GridSearchCV with F1-score as the scoring metric and StratifiedKFold for cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring=make_scorer(f1_score, average='binary'),  # Binary F1-score for imbalanced datasets
    cv=stratified_kfold,  # Use StratifiedKFold for cross-validation
    verbose=1,
    n_jobs=-1,
    error_score='raise'  # Set error_score to raise for debugging
)

# Measure training start time
start_time = time.time()

# Fit the model on the resampled and scaled training data
grid_search.fit(
    X_train_scaled, y_train_resampled,
    eval_set=[(X_val_scaled, y_val)], 
    verbose=1  # No need to pass early_stopping_rounds here anymore
)

# Measure end time for training
end_time = time.time()

# Get the best parameters and retrain the model
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val_scaled)
y_val_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]  # For log loss and ROC-AUC

# Evaluate the model's performance on the validation set
val_metrics = {
    "Accuracy": accuracy_score(y_val, y_val_pred),
    "Precision": precision_score(y_val, y_val_pred, average='binary'),  
    "Recall": recall_score(y_val, y_val_pred, average='binary'),        
    "F1-Score": f1_score(y_val, y_val_pred, average='binary'),          
    "Log Loss": log_loss(y_val, y_val_pred_proba),
    "Matthews Correlation Coefficient": matthews_corrcoef(y_val, y_val_pred),
    "ROC-AUC": roc_auc_score(y_val, y_val_pred_proba),
}

# Print the evaluation metrics for validation set
print("\nValidation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

# Confusion matrix and classification report for validation set
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

# Calculate and print training time
train_time = end_time - start_time
print("\nTraining Time:", round(train_time, 2), "seconds")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[0]	validation_0-logloss:0.67304
[1]	validation_0-logloss:0.65621
[2]	validation_0-logloss:0.63442
[3]	validation_0-logloss:0.61577
[4]	validation_0-logloss:0.60035
[5]	validation_0-logloss:0.58792
[6]	validation_0-logloss:0.57845
[7]	validation_0-logloss:0.56873
[8]	validation_0-logloss:0.55990
[9]	validation_0-logloss:0.55402
[10]	validation_0-logloss:0.54703
[11]	validation_0-logloss:0.54153
[12]	validation_0-logloss:0.53709
[13]	validation_0-logloss:0.53326
[14]	validation_0-logloss:0.52943
[15]	validation_0-logloss:0.52326
[16]	validation_0-logloss:0.51804
[17]	validation_0-logloss:0.51428
[18]	validation_0-logloss:0.51153
[19]	validation_0-logloss:0.50709
[20]	validation_0-logloss:0.50475
[21]	validation_0-logloss:0.50090
[22]	validation_0-logloss:0.49727
[23]	validation_0-logloss:0.49545
[24]	validation_0-logloss:0.49437
[25]	validation_0-logloss:0.49308
[26]	validation_0-logloss:0.49188
[27]	validation_0-logloss:0.489

# RandomForestClassifier

In [4]:
# Define the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],  # Fixed deprecated 'auto'
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']
}

# Define GridSearchCV with F1-score as the scoring metric and StratifiedKFold for cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring=make_scorer(f1_score, average="binary"),  # Binary F1-score for imbalanced datasets
    cv=stratified_kfold,  # Use StratifiedKFold for cross-validation
    verbose=1,
    n_jobs=-1,
    error_score='raise'  # Set error_score to raise for debugging
)

# Measure training start time
start_time = time.time()

# Fit the model on the resampled and scaled training data
grid_search.fit(X_train_scaled, y_train_resampled)

# Measure end time for training
end_time = time.time()

# Get the best parameters and retrain the model
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val_scaled)
y_val_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]  # For log loss and ROC-AUC

# Evaluate the model's performance on the validation set
val_metrics = {
    "Accuracy": accuracy_score(y_val, y_val_pred),
    "Precision": precision_score(y_val, y_val_pred, average="binary"),  
    "Recall": recall_score(y_val, y_val_pred, average="binary"),        
    "F1-Score": f1_score(y_val, y_val_pred, average="binary"),          
    "Log Loss": log_loss(y_val, y_val_pred_proba),
    "Matthews Correlation Coefficient": matthews_corrcoef(y_val, y_val_pred),
    "ROC-AUC": roc_auc_score(y_val, y_val_pred_proba)
}

# Print the evaluation metrics for validation set
print("\nValidation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

# Confusion matrix and classification report for validation set
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

# Calculate and print training time
train_time = end_time - start_time
print("\nTraining Time:", round(train_time, 2), "seconds")

Fitting 5 folds for each of 192 candidates, totalling 960 fits

Validation Metrics:
Accuracy: 0.8137
Precision: 0.3105
Recall: 0.1826
F1-Score: 0.2300
Log Loss: 0.4442
Matthews Correlation Coefficient: 0.1381
ROC-AUC: 0.6875

Confusion Matrix (Validation):
[[38315  3011]
 [ 6069  1356]]

Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.86      0.93      0.89     41326
           1       0.31      0.18      0.23      7425

    accuracy                           0.81     48751
   macro avg       0.59      0.55      0.56     48751
weighted avg       0.78      0.81      0.79     48751


Training Time: 13264.58 seconds


# LGBMClassifier

In [4]:
# Define the LGBM classifier
lgbm_model = LGBMClassifier(random_state=42)

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [10, 20], 
    'num_leaves': [31, 40], 
    'min_child_samples': [10], 
    'subsample': [0.8, 1.0],  
    'colsample_bytree': [0.8, 1.0], 
    'reg_alpha': [0, 0.1],  
    'reg_lambda': [1, 1.5],  
    'class_weight': [None, 'balanced'] 
}
# Define StratifiedKFold for cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define GridSearchCV with stratified cross-validation and F1-score as the scoring metric
grid_search = GridSearchCV(
    lgbm_model,
    param_grid,
    scoring=make_scorer(f1_score, average="binary"),  # Binary F1-score for imbalanced datasets
    cv=stratified_kfold,  # Use StratifiedKFold
    verbose=1,
    n_jobs=-1,
    error_score='raise'  # Set error_score to raise for debugging
)

# Measure training start time
start_time = time.time()

# Fit the model on the resampled and scaled training data
grid_search.fit(X_train_scaled, y_train_resampled)

# Measure end time for training
end_time = time.time()

# Get the best parameters and retrain the model
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val_scaled)
y_val_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]  # For log loss and ROC-AUC

# Evaluate the model's performance on the validation set
val_metrics = {
    "Accuracy": accuracy_score(y_val, y_val_pred),
    "Precision": precision_score(y_val, y_val_pred),
    "Recall": recall_score(y_val, y_val_pred),
    "F1-Score": f1_score(y_val, y_val_pred),
    "Log Loss": log_loss(y_val, y_val_pred_proba),
    "Matthews Correlation Coefficient": matthews_corrcoef(y_val, y_val_pred),
    "ROC-AUC": roc_auc_score(y_val, y_val_pred_proba),
}

# Print the evaluation metrics for validation set
print("\nValidation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

# Confusion matrix and classification report for validation set
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

# Calculate and print training time
train_time = end_time - start_time
print("\nTraining Time:", round(train_time, 2), "seconds")

Fitting 5 folds for each of 512 candidates, totalling 2560 fits
[LightGBM] [Info] Number of positive: 123977, number of negative: 123977
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1770
[LightGBM] [Info] Number of data points in the train set: 247954, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Validation Metrics:
Accuracy: 0.8155
Precision: 0.3277
Recall: 0.2011
F1-Score: 0.2492
Log Loss: 0.4254
Matthews Correlation Coefficient: 0.1567
ROC-AUC: 0.7014

Confusion Matrix (Validation):
[[38263  3063]
 [ 5932  1493]]

Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.87      0.93      0.89     41326
           1       0.33      0.20      0.25      7425

    