# Data Preparation

In [2]:
import pandas as pd
import numpy as np

dev_data = pd.read_csv("C:/Users/viina/OneDrive/Desktop/Credit Card Behaviour Score/Dev_data_to_be_shared.csv")
val_data = pd.read_csv("C:/Users/viina/OneDrive/Desktop/Credit Card Behaviour Score/validation_data_to_be_shared.csv")

# Display the first few rows
print("Development Data Preview:")
print(dev_data.head())

print("\nValidation Data Preview:")
print(val_data.head())


Development Data Preview:
   account_number  bad_flag  onus_attribute_1  transaction_attribute_1  \
0               1         0               NaN                      NaN   
1               2         0          221000.0                      0.0   
2               3         0           25000.0                      0.0   
3               4         0           86000.0                      0.0   
4               5         0          215000.0                      0.0   

   transaction_attribute_2  transaction_attribute_3  transaction_attribute_4  \
0                      NaN                      NaN                      NaN   
1                      0.0                      0.0                      0.0   
2                      0.0                      0.0                      0.0   
3                      0.0                      0.0                      0.0   
4                      0.0                      0.0                      0.0   

   transaction_attribute_5  transaction_attribut

# Inspect and Clean Data

In [3]:
# Check for missing values
print("Missing values in development data:")
print(dev_data.isnull().sum())

print("\nMissing values in validation data:")
print(val_data.isnull().sum())

# Separate features and target in development data
X = dev_data.drop(columns=["bad_flag", "account_number"])
y = dev_data["bad_flag"]

# Handle missing values by imputing with the mean
X.fillna(X.mean(), inplace=True)
val_X = val_data.drop(columns=["account_number"])
val_X.fillna(X.mean(), inplace=True)  # Use the same mean from training


Missing values in development data:
account_number                 0
bad_flag                       0
onus_attribute_1           25231
transaction_attribute_1    25231
transaction_attribute_2    25231
                           ...  
onus_attribute_44          85196
onus_attribute_45          85196
onus_attribute_46          85196
onus_attribute_47          85196
onus_attribute_48          85196
Length: 1216, dtype: int64

Missing values in validation data:
account_number                 0
onus_attribute_1           10990
transaction_attribute_1    10990
transaction_attribute_2    10990
transaction_attribute_3    10990
                           ...  
onus_attribute_44          36789
onus_attribute_45          36789
onus_attribute_46          36789
onus_attribute_47          36789
onus_attribute_48          36789
Length: 1215, dtype: int64


# Split Data for Training and Testing

In [4]:
from sklearn.model_selection import train_test_split

# Split development data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


Training set shape: (77444, 1214)
Testing set shape: (19362, 1214)


# Model Development

Train the XGBoost Model

In [6]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
     -------------------------------------- 124.9/124.9 MB 3.3 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.3


In [7]:
import xgboost as xgb

# Initialize and train the XGBoost model
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

print("Model training complete.")


Parameters: { "use_label_encoder" } are not used.



Model training complete.


Evaluate the Model

In [8]:
from sklearn.metrics import roc_auc_score, log_loss

# Predict probabilities on the testing set
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred_proba)
logloss = log_loss(y_test, y_pred_proba)

print(f"Model Performance:\nROC-AUC: {roc_auc:.4f}\nLog Loss: {logloss:.4f}")


Model Performance:
ROC-AUC: 0.8177
Log Loss: 0.2635


#  Validation and Prediction

Predict on Validation Data

In [15]:
# Round the predicted probabilities to 3 decimals
val_data["predicted_probability"] = val_data["predicted_probability"].round(3)

# Save the data to CSV with two columns
val_data[["account_number", "predicted_probability"]].to_csv("validation_predictions.csv", index=False)

print("Predictions saved to 'validation_predictions.csv' with account_number in Column A and predicted_probability in Column B.")


Predictions saved to 'validation_predictions.csv' with account_number in Column A and predicted_probability in Column B.


 Save Predictions

In [16]:
# Save predictions to a CSV file
val_data[["account_number", "predicted_probability"]].to_csv("validation_predictions.csv", index=False)

print("Predictions saved to 'validation_predictions.csv'.")


Predictions saved to 'validation_predictions.csv'.


In [17]:
import os
print("Current Working Directory:", os.getcwd())


Current Working Directory: C:\Users\viina


In [18]:
import os

# Check if the file is in the working directory
if "validation_predictions.csv" in os.listdir():
    print("Prediction file found!")
else:
    print("Prediction file not found. Check your working directory.")


Prediction file found!


In [20]:
from IPython.display import FileLink

# Provide a link to the file
FileLink("validation_predictions.csv")
