In [111]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from tqdm import tqdm

In [None]:
# Location of the cleaned training CSV. Set the CREDITSCORE_TRAIN_CSV environment
# variable to run the notebook on another machine; the fallback is the original
# local path, so existing behaviour is unchanged when the variable is not set.
train_csv_path = os.environ.get(
    "CREDITSCORE_TRAIN_CSV",
    r"C:\Users\rahul\OneDrive\Documents\Hackathon\IndustriAI\Industry-Baby\Creditscore_train_cleaned.csv",
)
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
def value_cnt(df, column_name):
    """Summarise how often each distinct value of one column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value with three columns: the value itself
        (named ``column_name``), its absolute frequency (``counts``) and its
        relative frequency (``norm_counts``), in the order produced by
        ``Series.value_counts``.
    """
    column = df[column_name]

    def _tabulate(value_name, **options):
        # value_counts -> two-column frame: <column_name>, <value_name>
        tally = column.value_counts(**options)
        return tally.rename_axis(column_name).reset_index(name=value_name)

    absolute = _tabulate('counts')
    relative = _tabulate('norm_counts', normalize=True)

    return pd.concat(
        [absolute[column_name], absolute['counts'], relative['norm_counts']],
        axis=1,
    )

value_cnt(df, 'Credit_Score')  # imbalanced

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
numeric_corr = df.select_dtypes('number').corr()

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(numeric_corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax);

In [80]:
# Separate the target column from the predictors.
y = df['Credit_Score']
X = df.drop(columns=['Credit_Score'])

# Hold out 10% of the rows for testing; the split is shuffled, stratified on the
# target, and made repeatable through `seed`.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    shuffle=True,
    stratify=y,
    random_state=seed,
)

In [None]:
# Report the dimensions of each piece of the train/test split.
for shape_label, split_part in (
    ("X_train shape :", X_train),
    ("y_train shape :", y_train),
    ("X_test shape :", X_test),
    ("y_test shape :", y_test),
):
    print(shape_label, split_part.shape)

In [82]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Explicit category order handed to the encoder: a category's position in this
# list becomes its integer code (0 for the first entry, 5 for the last).
order = [
    'Low_spent_Small_value_payments',
    'Low_spent_Medium_value_payments',
    'Low_spent_Large_value_payments',
    'High_spent_Small_value_payments',
    'High_spent_Medium_value_payments',
    'High_spent_Large_value_payments'
]

ordinal_encoder = OrdinalEncoder(categories=[order])

# Fit on the training split only, then reuse the fitted encoder for the test split.
train_codes = np.ravel(ordinal_encoder.fit_transform(X_train[['Payment_Behaviour']]))
test_codes = np.ravel(ordinal_encoder.transform(X_test[['Payment_Behaviour']]))

# Build new frames instead of assigning into / dropping in place on the frames
# returned by train_test_split: that pattern can raise SettingWithCopyWarning.
# The encoded column is appended last and the original column is dropped, so the
# resulting column order is the same as before.
X_train = X_train.assign(Payment_Behaviour_Encoded=train_codes).drop(columns=['Payment_Behaviour'])
X_test = X_test.assign(Payment_Behaviour_Encoded=test_codes).drop(columns=['Payment_Behaviour'])

In [None]:
X_train.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns that are expanded into one indicator column per category.
nominal_columns = ['Occupation', 'Payment_of_Min_Amount', 'Credit_Mix']

# Dense output returned as a pandas DataFrame; categories not seen during fit
# are ignored at transform time instead of raising.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
onehot_encoder.set_output(transform="pandas")

# Learn the categories from the training split, then apply them to both splits.
encoded_features = onehot_encoder.fit_transform(X_train[nominal_columns])
encoded_features_test = onehot_encoder.transform(X_test[nominal_columns])

encoded_features

In [85]:
# Swap the raw categorical columns for their one-hot indicator columns
# (indicator columns are appended to the right of the remaining features).
raw_categorical_columns = ['Occupation', 'Payment_of_Min_Amount', 'Credit_Mix']

X_train = pd.concat([X_train.drop(columns=raw_categorical_columns), encoded_features], axis=1)
X_test = pd.concat([X_test.drop(columns=raw_categorical_columns), encoded_features_test], axis=1)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
X_train.columns

MinMax Scaling

In [90]:
# Scale the features
# The scaler is fitted on the training split only and then reused for the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrame
# Carry over the original row index: without it the frames get a fresh RangeIndex
# while y_train / y_test keep the labels from train_test_split, so any
# label-aligned operation (concat, join, boolean masks) would silently misalign.
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


In [None]:
X_train

In [None]:
X_test

--------------

In [105]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

In [106]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print a confusion matrix and classification report for test and train data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``. The output may be a 2-D array of
        per-class scores (e.g. a softmax layer), which is reduced to class ids
        with ``argmax`` along axis 1, or already-decoded labels (1-D, or a
        single-column 2-D array), which are flattened and used as they are.
    X_train, y_train
        Training features and true labels.
    X_test, y_test
        Test features and true labels.

    Returns
    -------
    None
        The reports are printed, test set first, then train set.
    """
    def _predicted_labels(features):
        # Reduce whatever predict() returns to a flat vector of class labels.
        predictions = np.asarray(model.predict(features))
        if predictions.ndim == 2 and predictions.shape[1] > 1:
            return predictions.argmax(axis=1)
        return predictions.ravel()

    # Predict in the same order as before (train, then test) so any progress
    # output emitted by predict() appears unchanged.
    y_train_pred = _predicted_labels(X_train)
    y_pred = _predicted_labels(X_test)

    sections = (
        ("Test Set:", y_test, y_pred),
        ("\nTrain Set:", y_train, y_train_pred),
    )
    for heading, y_true, y_hat in sections:
        print(heading)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# random_state=seed (the seed defined in the train/test split cell) makes the
# synthetic samples identical from run to run.
smote = SMOTE(random_state=seed)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("X_train shape :", X_train_smote.shape)
print("y_train shape :", y_train_smote.shape)
print("X_test shape :", X_test.shape)
print("y_test shape :", y_test.shape)

In [None]:
# Feed-forward classifier: four ReLU hidden layers (128, 128, 64, 64 units),
# each followed by 30% dropout, and a 3-unit softmax output layer.
model = Sequential()

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(3, activation="softmax"))

# sparse_categorical_crossentropy takes integer class ids as targets
# (assumes Credit_Score is already encoded as 0/1/2 -- TODO confirm).
opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt,
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

# Stop once val_accuracy has not improved for 30 epochs and roll the model back
# to the weights of the best epoch. Without restore_best_weights the model would
# keep the weights of the final epoch, i.e. 30 epochs past its best score.
es = EarlyStopping(monitor="val_accuracy",
                   patience=30,
                   mode="auto",
                   verbose=1,
                   restore_best_weights=True)

# NOTE(review): the hold-out test split doubles as the validation set here, so
# early stopping is tuned on the same rows that are later reported as test
# performance. Consider carving a separate validation split out of the training
# data (before oversampling) to keep the test estimate unbiased.
history = model.fit(x=X_train_smote,
                    y=y_train_smote,
                    validation_data=(X_test, y_test),
                    batch_size=512,
                    epochs=300,
                    verbose=1,
                    callbacks=[es])

In [None]:
eval_metric(model, X_train_smote, y_train_smote, X_test, y_test)

In [113]:
# Define models
# The stochastic learners share one fixed seed so reruns give comparable scores.
cv_random_state = 42
models = {
    "Gradient Boosting": GradientBoostingClassifier(random_state=cv_random_state),
    # "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(random_state=cv_random_state),
    "Logistic Regression": LogisticRegression(),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=cv_random_state),
    "Random Forest": RandomForestClassifier(random_state=cv_random_state)
}

# Train and evaluate models
# The fold splitter does not depend on the model, so it is built once; with a
# fixed random_state every candidate is scored on the same five folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=cv_random_state)
results = {}
for name, model in tqdm(models.items()):
    # Perform cross-validation
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results[name] = cv_results
    print(f"{name}: Mean Accuracy = {cv_results.mean()}, Std = {cv_results.std()}")

# Compare model performance
print("\nModel Performance Comparison (Accuracy):")
for name, scores in results.items():
    print(f"{name}: Mean Accuracy = {scores.mean()}, Std = {scores.std()}")

# Plotting the results (optional)
plt.figure(figsize=(10, 6))
plt.boxplot(list(results.values()))
# Boxes are drawn at positions 1..N by default; label them through xticks
# rather than boxplot's `labels=` keyword, which newer matplotlib deprecates.
plt.xticks(range(1, len(results) + 1), list(results.keys()), rotation=45)
plt.title("Model Performance Comparison (Accuracy)")
plt.ylabel("Accuracy")
plt.show()

  0%|          | 0/5 [03:42<?, ?it/s]


KeyboardInterrupt: 

# Final Model

In [None]:
X.shape

In [47]:
# encoding
# Refit the encoders on the full labelled data set for the final model.
# X is rebuilt from df first so that this cell can be re-run: the steps below
# replace and drop columns, which would otherwise fail on a second execution.
X = df.drop(['Credit_Score'], axis=1)

# Explicit category order: position in the list becomes the integer code.
order = [
    'Low_spent_Small_value_payments',
    'Low_spent_Medium_value_payments',
    'Low_spent_Large_value_payments',
    'High_spent_Small_value_payments',
    'High_spent_Medium_value_payments',
    'High_spent_Large_value_payments'
]

ordinal_encoder = OrdinalEncoder(categories=[order])

# The encoded values overwrite the original column in place (same name, same position).
X['Payment_Behaviour'] = ordinal_encoder.fit_transform(X[['Payment_Behaviour']])

#########################################################

# `sparse_output` replaces the old `sparse` keyword, which current scikit-learn
# releases no longer accept; this also matches the encoder built earlier in the notebook.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False).set_output(transform="pandas")

encoded_features = onehot_encoder.fit_transform(X[['Occupation', 'Payment_of_Min_Amount', 'Credit_Mix']])

X = pd.concat([X, encoded_features], axis=1).drop(columns= ['Occupation', 'Payment_of_Min_Amount', 'Credit_Mix'])

In [None]:
X.columns

In [49]:
scaler = MinMaxScaler().fit(X)

In [50]:
import pickle

# Persist the fitted scaler. The context manager guarantees the file handle is
# flushed and closed, which a bare open(...) passed to pickle.dump does not.
# NOTE(review): ordinal_encoder and onehot_encoder are not saved anywhere in the
# visible notebook; scoring in a fresh process would need them as well.
with open("scaler_credit_score", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [51]:
X_scaled = scaler.transform(X)

In [None]:
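# NOTE: `final_model` is only assigned in this commented-out cell, but the
# prediction cells further down call it. Train or load a model here first.
# final_model = LGBMClassifier()
# final_model.fit(X_scaled, y)
# pickle.dump(final_model, open("credit_score_model", 'wb'))

# print("Model saved successfully!")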

In [53]:
# final_model.save('final_model_credit_score.h5') 

# Predictions with test data

In [None]:
# Location of the cleaned test CSV. Set the CREDITSCORE_TEST_CSV environment
# variable to run the notebook on another machine; the fallback is the original
# local path, so existing behaviour is unchanged when the variable is not set.
test_csv_path = os.environ.get(
    "CREDITSCORE_TEST_CSV",
    r"C:\Users\rahul\OneDrive\Documents\Hackathon\IndustriAI\Industry-Baby\Creditscore_test_cleaned.csv",
)
test = pd.read_csv(test_csv_path)
# Preview only the first rows, as done for the training frame.
test.head()

In [55]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler

# Category order for 'Payment_Behaviour'; same entries, in the same sequence, as
# the `order` list used when the ordinal encoder was fitted.
# NOTE(review): nothing in the visible notebook reads this variable --
# preprocess_new_data relies on the already-fitted ordinal_encoder instead.
payment_behaviour_order = [
    'Low_spent_Small_value_payments', 
    'Low_spent_Medium_value_payments', 
    'Low_spent_Large_value_payments', 
    'High_spent_Small_value_payments', 
    'High_spent_Medium_value_payments', 
    'High_spent_Large_value_payments'
]

# Define preprocessing function for new data
def preprocess_new_data(new_data, ordinal_encoder, onehot_encoder, scaler):
    """Apply the fitted encoders and scaler to unseen data.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Raw feature frame containing 'Payment_Behaviour', 'Occupation',
        'Payment_of_Min_Amount' and 'Credit_Mix'. It is NOT modified, so the
        function can be called repeatedly on the same frame.
    ordinal_encoder
        Fitted encoder; its ``transform`` is applied to 'Payment_Behaviour'.
    onehot_encoder
        Fitted encoder exposing ``transform`` and ``get_feature_names_out``;
        applied to the three nominal columns.
    scaler
        Fitted scaler; its ``transform`` is applied to the assembled frame.

    Returns
    -------
    pandas.DataFrame
        Encoded and scaled features, carrying the row index of ``new_data``.
    """
    onehot_columns = ['Occupation', 'Payment_of_Min_Amount', 'Credit_Mix']

    # Work on a copy: writing the encoded column into the caller's frame would
    # make a second call fail, because the raw categories would be gone.
    prepared = new_data.copy()

    # Ordinal encoding for 'Payment_Behaviour' (flattened to one value per row)
    prepared['Payment_Behaviour'] = np.asarray(
        ordinal_encoder.transform(prepared[['Payment_Behaviour']])
    ).ravel()

    # One-hot encoding for 'Occupation', 'Payment_of_Min_Amount', 'Credit_Mix'
    encoded_features = onehot_encoder.transform(prepared[onehot_columns])
    if hasattr(encoded_features, "toarray"):
        # Sparse encoder output: densify before building the frame.
        encoded_features = encoded_features.toarray()
    # Reuse the input's row index; a default RangeIndex would misalign in the
    # concat below whenever new_data is not indexed 0..n-1.
    encoded_df = pd.DataFrame(
        np.asarray(encoded_features),
        columns=onehot_encoder.get_feature_names_out(onehot_columns),
        index=prepared.index,
    )
    prepared = pd.concat([prepared.drop(columns=onehot_columns), encoded_df], axis=1)

    # Min-max scaling for all (now numeric) features
    scaled_values = np.asarray(scaler.transform(prepared))
    return pd.DataFrame(scaled_values, columns=prepared.columns, index=prepared.index)


In [None]:
# Apply preprocessing function to new data using fitted encoders and scaler
test_df = preprocess_new_data(test, ordinal_encoder, onehot_encoder, scaler)
test_df

In [None]:
# Make predictions
# The following cells expect one probability column per class. Estimators with
# a scikit-learn style API return hard labels from predict(), so prefer
# predict_proba() when the model offers it; models whose predict() already
# returns class probabilities keep working through the fallback.
if hasattr(final_model, "predict_proba"):
    probabilities = final_model.predict_proba(test_df)
else:
    probabilities = final_model.predict(test_df)
probabilities

In [None]:
# Wrap the per-class probabilities in a labelled frame, one column per class.
probability_columns = ['Probability_Class_0', 'Probability_Class_1', 'Probability_Class_2']
probabilities_df = pd.DataFrame(data=probabilities, columns=probability_columns)
# Display the DataFrame
probabilities_df

In [None]:
# For every row, pick the index of the class with the highest probability.
predicted_labels = np.asarray(probabilities).argmax(axis=1)
# Display the predicted class labels
predicted_labels

In [None]:
# Single-column frame holding the predicted class id of every row.
predictions_df = pd.DataFrame({'Predicted_Label': predicted_labels})
predictions_df

In [None]:
# Place the class probabilities and the predicted label side by side (column-wise).
side_by_side = [probabilities_df, predictions_df]
pred_proba_df = pd.concat(side_by_side, axis="columns")
pred_proba_df


In [None]:
pred_proba_df['Predicted_Label'].value_counts()