## CONVOLVE 2025 - Predicting Credit Card Behaviour Scores

### Exploratory Data Analytics

In [1]:
!pip install matplotlib #installing the necessarey libraries




In [2]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [3]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
#Loading the dataset
data_path = '/Users/anushka/Downloads/Dev_data_to_be_shared.csv'
df = pd.read_csv(data_path) #forming a dataframe from the provided dataset


In [5]:

#making a function to drop the rows that having missing value percentage more than the set threshold (in our case the missing_value_threshold chosen is 0.8)
#therefore all the rows that have missing values more than 80% are being dropped as thy contribute less to the final output

def drop_rows_with_many_missing_values(df, missing_value_threshold=0.8):


  num_columns = df.shape[1]
  threshold = int(num_columns * missing_value_threshold)

  rows_to_drop = df[df.isnull().sum(axis=1) > threshold].index
  df = df.drop(rows_to_drop)
  print(rows_to_drop)

  return df

# Example usage:
# Assuming 'df' is your DataFrame
df = drop_rows_with_many_missing_values(df)

Index([  144,   980,  1706,  1818,  1964,  2019,  2230,  2548,  2666,  2773,
       ...
       93871, 93974, 94339, 95444, 95506, 95524, 95538, 95918, 96052, 96580],
      dtype='int64', length=258)


In [6]:
# Calculate the percentage of missing values in each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Find columns with more than 80% missing values
columns_to_drop = missing_percentage[missing_percentage > 80].index.tolist()

# Print the number of columns to be dropped
print("Number of columns with more than 80% missing values:", len(columns_to_drop))

# Print the names of the columns to be dropped
print("Columns to be dropped:", columns_to_drop)

# Drop the identified columns from the DataFrame
df = df.drop(columns=columns_to_drop)

# Final dataframe shape
print(f"DataFrame shape after dropping columns: {df.shape}")

Number of columns with more than 80% missing values: 14
Columns to be dropped: ['bureau_148', 'bureau_436', 'bureau_438', 'bureau_444', 'bureau_446', 'bureau_447', 'bureau_448', 'bureau_449', 'onus_attribute_43', 'onus_attribute_44', 'onus_attribute_45', 'onus_attribute_46', 'onus_attribute_47', 'onus_attribute_48']
DataFrame shape after dropping columns: (96548, 1202)


In [7]:
#Calculating the number of binary features by first finding out the number of unique features
unique_counts = df.nunique()
binary_features = unique_counts[unique_counts == 2].index.tolist()
print(binary_features)

#printing the number of binary features in our dataset
print(len(binary_features)) 

['bad_flag', 'transaction_attribute_79', 'transaction_attribute_80', 'transaction_attribute_81', 'transaction_attribute_144', 'transaction_attribute_183', 'transaction_attribute_222', 'bureau_67', 'bureau_149', 'bureau_150', 'bureau_151', 'bureau_153', 'bureau_154', 'bureau_155', 'bureau_156', 'bureau_157', 'bureau_158', 'bureau_159', 'bureau_160', 'bureau_161', 'bureau_163', 'bureau_164', 'bureau_165', 'bureau_166', 'bureau_167', 'bureau_168', 'bureau_169', 'bureau_170', 'bureau_171', 'bureau_173', 'bureau_174', 'bureau_175', 'bureau_176', 'bureau_177', 'bureau_178', 'bureau_179', 'bureau_180', 'bureau_181', 'bureau_183', 'bureau_184', 'bureau_185', 'bureau_186', 'bureau_187', 'bureau_188', 'bureau_189', 'bureau_190', 'bureau_191', 'bureau_193', 'bureau_194', 'bureau_195', 'bureau_196', 'bureau_197', 'bureau_198', 'bureau_199', 'bureau_200', 'bureau_201', 'bureau_203', 'bureau_204', 'bureau_205', 'bureau_206', 'bureau_207', 'bureau_208', 'bureau_209', 'bureau_210', 'bureau_211', 'bure

In [9]:
#Imputing the non_binary features using mean values and binary features using mode values
non_binary_features = [col for col in df.columns if col not in binary_features]
for col in binary_features:
  mode_value = df[col].mode()[0]
  df[col].fillna(mode_value, inplace = True)
for col in non_binary_features:
  mean_value = df[col].mean()
  df[col].fillna(mean_value, inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

In [10]:
#installing scikit-learn
!pip install scikit-learn



In [11]:
from sklearn.feature_selection import VarianceThreshold
#printing the number of features before applying VarianceThreshold
print("Number of features before selection:", df.shape[1])

# Create a VarianceThreshold object with a threshold (e.g., 0.01)
selector = VarianceThreshold(threshold=0.01)

# Fit the selector to your data
selector.fit(df)

# Get a boolean mask indicating which features are selected
selected_features = selector.get_support()

# Get the names of the selected features
selected_feature_names = df.columns[selected_features]

# Select only the selected features from the DataFrame
df = df[selected_feature_names]

print("Number of features after selection:", df.shape[1])

Number of features before selection: 1202
Number of features after selection: 849


In [12]:
#making a function that performs feature selection by removing features using high correlation by introducing a threshold
def feature_selection_by_correlation(data, threshold):

    # Compute the correlation matrix
    corr_matrix = data.corr().abs()
    
    # Identify highly correlated pairs (keep only upper triangle to avoid redundancy)
    upper_triangle = corr_matrix.where(~np.tril(np.ones(corr_matrix.shape)).astype(bool))
    
    # Find features to drop
    features_to_drop = [
        column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)
    ]
    
    # Drop the features
    reduced_data = data.drop(columns=features_to_drop)
    
    return reduced_data, features_to_drop

# Perform feature selection
reduced_df, removed_features = feature_selection_by_correlation(df, threshold=0.9)
print(reduced_df.head())


   account_number  bad_flag  onus_attribute_1  transaction_attribute_1  \
0               1         0     154239.119804                 9.570769   
1               2         0     221000.000000                 0.000000   
2               3         0      25000.000000                 0.000000   
3               4         0      86000.000000                 0.000000   
4               5         0     215000.000000                 0.000000   

   transaction_attribute_2  transaction_attribute_3  transaction_attribute_4  \
0                 0.002207                 4.092854                77.306435   
1                 0.000000                 0.000000                 0.000000   
2                 0.000000                 0.000000                 0.000000   
3                 0.000000                 0.000000                 0.000000   
4                 0.000000                 0.000000                 0.000000   

   transaction_attribute_5  transaction_attribute_6  transaction_attribute

In [16]:
#printing the reduced dataframe shape
print(reduced_df.shape[1])

594


In [17]:
df=reduced_df


In [15]:
# Check for missing values in each column
missing_values_per_column = reduced_df.isnull().sum()

# Print the number of missing values in each column
print("Number of missing values per column:\n", missing_values_per_column)

# Check for rows with any missing values
rows_with_missing_values = df[df.isnull().any(axis=1)]
print(f"Number of rows with missing values: {len(rows_with_missing_values)}")

# Check for rows with all missing values
rows_with_all_missing_values = df[df.isnull().all(axis=1)]
print(f"Number of rows with all missing values: {len(rows_with_all_missing_values)}")

Number of missing values per column:
 account_number             0
bad_flag                   0
onus_attribute_1           0
transaction_attribute_1    0
transaction_attribute_2    0
                          ..
bureau_enquiry_38          0
bureau_enquiry_40          0
bureau_enquiry_44          0
bureau_enquiry_46          0
bureau_enquiry_48          0
Length: 594, dtype: int64
Number of rows with missing values: 0
Number of rows with all missing values: 0


In [18]:
#To understand which all features require normalisation or not, we implement Normal test and Anderson test 

from scipy.stats import shapiro, normaltest, anderson
from sklearn.preprocessing import PowerTransformer
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
binary_features = [col for col in numerical_features if df[col].nunique() == 2]
numerical_features = [col for col in numerical_features if col not in binary_features]
from scipy.stats import normaltest

for col in numerical_features:
    stat, p = normaltest(df[col])
    print(f"Feature: {col}, D'Agostino Test p-value: {p}")
non_normal_features = []
for col in numerical_features:
    stat, p = normaltest(df[col])
    if p < 0.05:  # Non-normal if p < 0.05
        non_normal_features.append(col)

print(f"Non-normal features: {non_normal_features}") #printing all the non_normal features
from sklearn.preprocessing import PowerTransformer

# Using yeo-JOhnson tranformation for normalisig the dataset
pt = PowerTransformer(method='yeo-johnson')

# Exclude "acount_number" from transformation as that does not needs to be normalised
excluded_feature = 'account_number'
features_to_transform = [feature for feature in non_normal_features if feature != excluded_feature]

# Appling transformation to rest of the selected features
df_transformed = df.copy() # Create df_transformed before using it
df_transformed[features_to_transform] = pt.fit_transform(df_transformed[features_to_transform]) #use df_transformed instead of df

# Verify transformation
print(df_transformed.head())


Feature: account_number, D'Agostino Test p-value: 0.0
Feature: onus_attribute_1, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_1, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_2, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_3, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_4, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_5, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_6, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_7, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_10, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_11, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_12, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_13, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_14, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_16, D'Agostino Test p-value: 0.0
Feature: transaction_attribute_17, D'Agostino Test p-value: 0.0
F

In [19]:
#printing shape of the transformed dataset
df=df_transformed
dataset_shape = df.shape
print(f"Shape of the dataset: {dataset_shape}")

Shape of the dataset: (96548, 594)


In [20]:
#installing the remaining libraries
!pip install optuna
!pip install tensorflow
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import shapiro
from imblearn.combine import SMOTEENN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import optuna
from lightgbm import LGBMClassifier



In [23]:
#Diving the training and test dataset
X = df.drop('bad_flag', axis=1)
y = df['bad_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#Applying SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_smote = X_train_smote.to_numpy()
y_train_smote = y_train_smote.to_numpy()

# smote_enn = SMOTEENN(random_state=42)
# X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

#Defining the parameters for XGBoost model
params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.01,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 100,
    'random_state': 42
}

#Using K-Fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []

#Applying the XGBoost model and calculating the required accuracy and mean absolute error
from sklearn.metrics import mean_absolute_error
for train_index, test_index in kfold.split(X_train_smote):
    X_train_fold, X_val_fold = X_train_smote[train_index], X_train_smote[test_index]
    y_train_fold, y_val_fold = y_train_smote[train_index], y_train_smote[test_index]

    model = XGBClassifier(**params)
    model.fit(X_train_fold, y_train_fold)

    y_pred = model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred)
    accuracy_scores.append(accuracy)

# Calculating and printing average accuracy
avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average Accuracy across folds: {avg_accuracy}")

# Training the final model on the entire training set (with SMOTE)
model = XGBClassifier(**params)
model.fit(X_train_smote, y_train_smote)

# Evaluate the model on the test set

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))
print(f"Mean absolute Error: {mae}")

Average Accuracy across folds: 0.9218630935272255
Test Accuracy: 0.9250129466597617
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     19036
           1       0.07      0.33      0.11       274

    accuracy                           0.93     19310
   macro avg       0.53      0.63      0.54     19310
weighted avg       0.98      0.93      0.95     19310

Mean absolute Error: 0.07498705334023822


In [26]:
#Defining the parameters for lightGBM Model
params2 = {
    'objective': 'binary',
    'learning_rate': 0.01,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 100,
    'random_state': 42
}
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# accuracy_scores = []

#Applying the lightGBM model and calculating the accuracy and mean absolute error
for train_index, test_index in kfold.split(X_train_smote):
    X_train_fold, X_val_fold = X_train_smote[train_index], X_train_smote[test_index]
    y_train_fold, y_val_fold = y_train_smote[train_index], y_train_smote[test_index]

    model2 = LGBMClassifier(**params2)
    model2.fit(X_train_smote, y_train_smote)

    y_pred = model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred)
    accuracy_scores.append(accuracy)


# Evaluate the model on the test set
y_pred2 = model2.predict(X_test)
accuracy2 = accuracy_score(y_test, y_pred2)
mae2 = mean_absolute_error(y_test, y_pred2)


print(f"Test Accuracy: {accuracy2}")
print(classification_report(y_test, y_pred2))
print(f"Mean absolute Error: {mae2}")




[LightGBM] [Info] Number of positive: 76142, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.111706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149667
[LightGBM] [Info] Number of data points in the train set: 152284, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 76142, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.149124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149667
[LightGBM] [Info] Number of data points in the train set: 152284, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 76142, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149667
[LightGBM] [Info] Number of data points in the train set: 152284, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 76142, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149667
[LightGBM] [Info] Number of data points in the train set: 152284, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 76142, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.126740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149667
[LightGBM] [Info] Number of data points in the train set: 152284, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Test Accuracy: 0.9210771620921803
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     19036
           1       0.06      0.33      0.11       274

    accuracy                           0.92     19310
   macro avg       0.53      0.63      0.53     19310
weighted avg       0.98      0.92      0.95     19310

Mean absolute Error: 0.07892283790781979


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train base models
model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)

# Get the probability predictions from base models
prob_1 = model_1.predict_proba(X_train)  # Train data
prob_2 = model_2.predict_proba(X_train)  # Train data

# Combine the probabilities from both models to create a new feature set for meta-model
stacked_features = np.column_stack((prob_1, prob_2))

# Train a meta-model (Logistic Regression) on the stacked features
meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_train)

# Get the probability predictions from base models on test data
prob_1_test = model_1.predict_proba(X_test)
prob_2_test = model_2.predict_proba(X_test)

# Combine the probabilities from both models to create the final feature set for meta-model
stacked_features_test = np.column_stack((prob_1_test, prob_2_test))

# Predict final probabilities using the meta-model
final_predictions = meta_model.predict_proba(stacked_features_test)

# Example: Printing probabilities for the first instance
print(final_predictions)  # Probabilities for the first test instance


[LightGBM] [Info] Number of positive: 1096, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80062
[LightGBM] [Info] Number of data points in the train set: 77238, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.014190 -> initscore=-4.240933
[LightGBM] [Info] Start training from score -4.240933
[[9.98980159e-01 1.01984125e-03]
 [9.99055539e-01 9.44461428e-04]
 [9.99077057e-01 9.22942544e-04]
 ...
 [9.99076852e-01 9.23148280e-04]
 [9.99069469e-01 9.30531391e-04]
 [9.98943715e-01 1.05628519e-03]]


In [50]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Assuming you already have X_train, X_test, y_train, y_test defined
# Create an instance of SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
from imblearn.over_sampling import ADASYN

adasyn = ADASYN(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)



# Train base models
model_1.fit(X_train_resampled, y_train_resampled)
model_2.fit(X_train_resampled, y_train_resampled)

# Get the probability predictions from base models for the training data
prob_1 = model_1.predict_proba(X_train_resampled)  # Resampled train data
prob_2 = model_2.predict_proba(X_train_resampled)  # Resampled train data

# Combine the probabilities from both models to create a new feature set for the meta-model
stacked_features = np.column_stack((prob_1[:, 1], prob_2[:, 1]))  # We only use the probability of being 1 (positive class)

# Train a meta-model (Logistic Regression) on the stacked features
meta_model = LogisticRegression()
meta_model.fit(stacked_features, y_train_resampled)

# Get the probability predictions from base models on the test data
prob_1_test = model_1.predict_proba(X_test)
prob_2_test = model_2.predict_proba(X_test)

# Combine the probabilities from both models to create the final feature set for the meta-model
stacked_features_test = np.column_stack((prob_1_test[:, 1], prob_2_test[:, 1]))  # Using only the probability of being 1 (positive class)

# Predict final probabilities using the meta-model
final_predictions = meta_model.predict_proba(stacked_features_test)[:, 1]  # We only want the probability of being 1

# Example: Printing probabilities for the first instance
print("Predicted Probabilities of being 1 for the first instance:", final_predictions[0])

# For evaluation, we get the final predicted class labels (by thresholding the probabilities)
final_pred_labels = (final_predictions > 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, final_pred_labels)
print(f"Ensemble Model Accuracy: {accuracy}")

# Confusion Matrix
cm = confusion_matrix(y_test, final_pred_labels)
print(f"Confusion Matrix:\n{cm}")


[LightGBM] [Info] Number of positive: 75740, number of negative: 76142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.093229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149499
[LightGBM] [Info] Number of data points in the train set: 151882, number of used features: 592
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498677 -> initscore=-0.005294
[LightGBM] [Info] Start training from score -0.005294
Predicted Probabilities of being 1 for the first instance: 0.0022752815757336635
Ensemble Model Accuracy: 0.9845675815639565
Confusion Matrix:
[[19003    33]
 [  265     9]]
