<a href="https://colab.research.google.com/github/Sai937593/Kaggle-Projects/blob/main/loan-approval-ann.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import tensorflow as tf

# Check if TPU is already initialized
if 'tpu_strategy' not in globals():  # If 'tpu_strategy' does not exist, TPU is not initialized
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.tpu.experimental.initialize_tpu_system(tpu)
        tpu_strategy = tf.distribute.TPUStrategy(tpu)
        print("TPU initialized successfully.")
    except Exception as e:
        print("TPU initialization failed:", e)
else:
    print("TPU is already initialized.")


TPU is already initialized.


In [5]:
import os

# Create the directory explicitly with the absolute path
os.makedirs("/root/.kaggle", exist_ok=True)

# Copy the kaggle.json file from your Google Drive to the required location
!cp "/content/drive/MyDrive/Colab Notebooks/kaggle.json" /root/.kaggle/

# Set the correct permissions for the Kaggle API token
!chmod 600 /root/.kaggle/kaggle.json


In [6]:
!kaggle competitions download -c playground-series-s4e10

playground-series-s4e10.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
from zipfile import ZipFile

with ZipFile('/content/playground-series-s4e10.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [8]:
import pandas as pd

print(f'reading the csv files into pandas dataframes.')
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
sub_df = pd.read_csv('/content/sample_submission.csv')


reading the csv files into pandas dataframes.


In [9]:
# train_df.info()

In [10]:
# test_df.info()

In [11]:
# train_df.isnull().any(), test_df.isnull().any()

In [12]:
cat_cols = train_df.select_dtypes('object').columns
num_cols = []
for col in train_df.select_dtypes(exclude='object').columns:
    if col not in ('id', 'loan_status'):
        num_cols.append(col)
# num_cols, cat_cols

In [13]:
# train_df['loan_status'].value_counts(normalize=True)

In [14]:
import matplotlib.pyplot as plt

def box_plots_num_cols(df, columns):
    df = df.copy()
    base_width = 10
    base_height = 5
    rows = len(columns)
    cols = 1
    fig_width = cols * base_width
    fig_height = rows * base_height
    fig, axes = plt.subplots(rows, cols, figsize=(fig_width, fig_height))
    axes = axes.flatten() if rows > 1 else [axes]
    for i, col in enumerate(columns):
        axes[i].boxplot(df[col])
        axes[i].set_title(col)
    for j in range(i+1, len(axes)):
        axes[j].axis('off')
    plt.tight_layout()

In [15]:
# box_plots_num_cols(train_df, num_cols)

In [16]:
from scipy import stats
import numpy as np

def remove_outliers(df:pd.DataFrame(), cols, beta):
    df = df.copy()
    for col in cols:
#         print(f'removing {col} outliers:\n')
        data = df[col]
        iqr = stats.iqr(data)
        q1 = np.percentile(data, 25)
        q3 = np.percentile(data, 75)
        outlier_low = q1 - beta * iqr
        outlier_high = q1 + beta * iqr
#         print(f'outlier_low:{outlier_low}, outlier_high: {outlier_high}')
        df = df[(data >= outlier_low) & (data <= outlier_high)]
    return df


In [17]:
print(f'removing outliers before oversampling')
df_clean = remove_outliers(train_df, num_cols, 1.5)
# box_plots_num_cols(df_clean, num_cols)
df_clean.shape, train_df.shape

removing outliers before oversampling


((24802, 13), (58645, 13))

In [18]:
def hist_plot_cat_cols(df:pd.DataFrame(), cat_cols=cat_cols):
    df = df.copy()
    fig, axes_cat = plt.subplots(1, len(cat_cols), figsize=(20, 10))
    for i, col in enumerate(cat_cols):
        axes_cat[i].hist(df[col])
        axes_cat[i].set_title(col)

In [19]:
# hist_plot_cat_cols(train_df)

In [20]:
# hist_plot_cat_cols(df_clean)

In [21]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

print('encoding the categorical features')
df_clean = df_clean.drop(columns=['id'])

ord_enc = OrdinalEncoder()
one_hot_enc = OneHotEncoder(sparse_output=False)

ordinal_cols = ['loan_grade']
one_hot_cols = [col for col in cat_cols if col != 'loan_grade']
remaining_cols = [col for col in df_clean.columns if col not in cat_cols]

encoder = ColumnTransformer(
            transformers=[
                ('ordinal_encoder', ord_enc, ordinal_cols),
                ('one_hot_encoder', one_hot_enc, one_hot_cols),
                ('passthrough', 'passthrough', remaining_cols )
            ]
)

encoded_data = encoder.fit_transform(df_clean)
one_hot_enc.fit(df_clean[one_hot_cols])
one_hot_encoded_cols = one_hot_enc.get_feature_names_out(one_hot_cols)
all_cols = ordinal_cols + list(one_hot_encoded_cols) + remaining_cols
df_encoded = pd.DataFrame(encoded_data, columns=all_cols)
df_encoded.info()

encoding the categorical features
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24802 entries, 0 to 24801
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_grade                      24802 non-null  float64
 1   person_home_ownership_MORTGAGE  24802 non-null  float64
 2   person_home_ownership_OTHER     24802 non-null  float64
 3   person_home_ownership_OWN       24802 non-null  float64
 4   person_home_ownership_RENT      24802 non-null  float64
 5   loan_intent_DEBTCONSOLIDATION   24802 non-null  float64
 6   loan_intent_EDUCATION           24802 non-null  float64
 7   loan_intent_HOMEIMPROVEMENT     24802 non-null  float64
 8   loan_intent_MEDICAL             24802 non-null  float64
 9   loan_intent_PERSONAL            24802 non-null  float64
 10  loan_intent_VENTURE             24802 non-null  float64
 11  cb_person_default_on_file_N     24802 non-null  float64
 12

In [22]:
!pip install imblearn
from imblearn.over_sampling import SMOTE, SVMSMOTE

print(f'oversampling using SVMSMOTE')
df_enc_copy = df_encoded.copy()

smote = SMOTE(random_state=32,  k_neighbors=10)
svm_smote = SVMSMOTE(random_state=32, k_neighbors=10,m_neighbors=10)

X = df_enc_copy.drop(columns=['loan_status'])
y = df_enc_copy.loc[:, 'loan_status']

X_smote, y_smote = smote.fit_resample(X, y)
X_svm_smote, y_svm_smote = svm_smote.fit_resample(X, y)
X_smote.shape, y_smote.shape, X_svm_smote.shape, y_svm_smote.shape

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0
oversampling using SVMSMOTE


((46192, 20), (46192,), (46192, 20), (46192,))

In [23]:
smote_df = X_smote
smote_df['loan_status'] = y_smote

svm_smote_df = X_svm_smote
svm_smote_df['loan_status'] = y_svm_smote


In [24]:
# c = [col for col in smote_df.columns if col not in one_hot_encoded_cols]
# box_plots_num_cols(smote_df, c)

In [25]:
# c2 = [col for col in svm_smote_df.columns if col not in one_hot_encoded_cols]
# box_plots_num_cols(svm_smote_df, c2)

In [26]:
# smote_corr = smote_df.corr()
# svm_smote_corr = svm_smote_df.corr()

In [27]:
import seaborn as sns

def heatmap_corr(corr):
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title('Correlation Heatmap')
    plt.show()

In [28]:
# heatmap_corr(smote_corr)

In [29]:
# heatmap_corr(svm_smote_corr)

In [30]:
# high_smote_corr = smote_corr[(abs(smote_corr) > 0.7) & (abs(smote_corr) != 1.0)]
# heatmap_corr(high_smote_corr)

In [31]:
# high_svm_smote_corr = svm_smote_corr[(abs(svm_smote_corr) > 0.7) & (abs(svm_smote_corr) != 1.0)]
# heatmap_corr(high_svm_smote_corr)

In [32]:
print('dropping the column person home ownership rent')
df_final = svm_smote_df.drop(columns=[ 'person_home_ownership_RENT'])
df_final.info()

dropping the column person home ownership rent
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46192 entries, 0 to 46191
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_grade                      46192 non-null  float64
 1   person_home_ownership_MORTGAGE  46192 non-null  float64
 2   person_home_ownership_OTHER     46192 non-null  float64
 3   person_home_ownership_OWN       46192 non-null  float64
 4   loan_intent_DEBTCONSOLIDATION   46192 non-null  float64
 5   loan_intent_EDUCATION           46192 non-null  float64
 6   loan_intent_HOMEIMPROVEMENT     46192 non-null  float64
 7   loan_intent_MEDICAL             46192 non-null  float64
 8   loan_intent_PERSONAL            46192 non-null  float64
 9   loan_intent_VENTURE             46192 non-null  float64
 10  cb_person_default_on_file_N     46192 non-null  float64
 11  cb_person_default_on_file_Y     46192 non-null

In [33]:
from sklearn.model_selection import StratifiedShuffleSplit

print('splitting the dataset into train and test')

sss = StratifiedShuffleSplit(n_splits=1, random_state=32, test_size=0.1)
for tr, te in sss.split(X, y):
    df_train = df_final.iloc[tr]
    df_test  = df_final.iloc[te]
target_col = 'loan_status'
features_cols = [col for col in df_train.columns if col != target_col]
df_train.shape, df_test.shape

splitting the dataset into train and test


((22321, 20), (2481, 20))

In [71]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

def prepare_data_for_tpu(df:pd.DataFrame, features_cols:list[str], target_col:str, batch_size:int=32, poly_degree:int=2, poly:bool=False, valid_split:bool=False,valid_split_size:float=0.2):

    df = df.copy()
    features = df[features_cols]
    target = df.loc[:, target_col]

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=poly_degree, include_bias=False, interaction_only=True)) if poly else None
    ])
    pipe.steps = [step for step in pipe.steps if step is not None]

    features_final = pipe.fit_transform(features)
    features_final_df = pd.DataFrame(features_final, columns=[f'feature_{i}' for i in range(features_final.shape[1])])


    features_tensor = tf.convert_to_tensor(features_final_df.values.astype('float32'), dtype=tf.float32)
    target_tensor = tf.convert_to_tensor(target.values.astype('int64'), dtype=tf.int64)

    dataset = tf.data.Dataset.from_tensor_slices((features_tensor, target_tensor))

    dataset_size = len(features)
    val_size = int(valid_split_size * dataset_size)
    if valid_split and (valid_split_size > 0.0):
        valid_dataset = dataset.take(val_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
        train_dataset = dataset.skip(val_size).shuffle(buffer_size=dataset_size-val_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

        result = (train_dataset, valid_dataset)
    else:
        dataset = dataset.shuffle(buffer_size=dataset_size).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

        result = (dataset, )
    return result

train_dataset, valid_dataset = prepare_data_for_tpu(df_train, features_cols=features_cols, target_col=target_col, poly=True, poly_degree=3, valid_split=True, valid_split_size=0.2)

In [72]:
import tensorflow as tf
import keras
from keras import activations, layers, initializers, regularizers, optimizers, losses, metrics

class ann_layer_gen:
    def __init__(self, input_shape, batch_norm:bool=False, drop_out:bool=False, regularize:bool=False,  initializers :bool=False,drop_out_rate:float=0.0) -> None:
        self.input_shape = input_shape
        self.batch_norm = batch_norm
        self.drop_out = drop_out
        self.regularize = regularize
        self.initializers = initializers
        self.drop_out_rate = drop_out_rate
        self.model = keras.Sequential()
        self.model.add(keras.Input(shape=(input_shape, )))

    def __call__(self,  out_act_fun:activations, out_unit:int, units:int=100, n_hidden_layers:int=5,  hidden_act_fun:activations=activations.relu) -> keras.Sequential :
        for n_layer in range(1, n_hidden_layers+1):
            layer_params = {'units':units, 'activation':hidden_act_fun}

            if self.regularize:
                layer_params['kernel_regularizer'] = regularizers.L1()
            if self.initializers:
                layer_params['kernel_initializer'] = initializers.glorot_normal

            self.model.add(layers.Dense(**layer_params))

            if self.drop_out:
                self.model.add(layers.Dropout(rate=self.drop_out_rate))

            if self.batch_norm:
                self.model.add(layers.BatchNormalization())

            units = round(units / 2)
        self.model.add(layers.Dense(out_unit, activation=out_act_fun))
        return self.model

    def compile(model, loss:losses.Loss, metrics:list=['accuracy'], optimizer:optimizers.Optimizer=optimizers.Adam, valid_split:float=0.2, learning_rate:float=0.01 ) -> keras.Sequential:
        try:
            model.compile(optimizer=optimizer(learning_rate=learning_rate), loss=loss, metrics=metrics)
        except Exception as e:
            print('Could not compile the model with given params.')
        return model

In [92]:
# !pip install optuna
import optuna
from keras import optimizers, activations, losses
import tensorflow as tf
from keras import backend as K

import tensorflow as tf

# Check if TPU is already initialized
if 'tpu_strategy' not in globals():  # If 'tpu_strategy' does not exist, TPU is not initialized
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.tpu.experimental.initialize_tpu_system(tpu)
        tpu_strategy = tf.distribute.TPUStrategy(tpu)
        print("TPU initialized successfully.")
    except Exception as e:
        print("TPU initialization failed:", e)
else:
    print("TPU is already initialized.")

input_shape = train_dataset.element_spec[0].shape[1]  # Assuming the first element in the dataset is the feature tensor

# Objective function for Optuna
def objective(trial: optuna.Trial):
    # print('working')
    epochs = trial.suggest_int('epochs', 10, 100, step=10)
    units = trial.suggest_int('units', 100, 500, step=50)
    n_hidden_layers = trial.suggest_int('n_hidden_layers', 3, 19, step=2)
    batch_norm = trial.suggest_categorical('batch_norm', [True, False])
    drop_out = trial.suggest_categorical('drop_out', [True, False])
    regularize = trial.suggest_categorical('regularize', [True, False])
    initializers = trial.suggest_categorical('initializers', [True, False])
    drop_out_rate = trial.suggest_float('drop_out_rate', 0, 1)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'RMSprop', 'SGD', 'Nadam'])

    # Create optimizer object based on the selected name
    if optimizer_name == 'Adam':
        optimizer = optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == 'RMSprop':
        optimizer = optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer_name == 'SGD':
        optimizer = optimizers.SGD(learning_rate=learning_rate)
    elif optimizer_name == 'Nadam':
        optimizer = optimizers.Nadam(learning_rate=learning_rate)
    # Initialization parameters
    init_params = {
        'batch_norm': batch_norm,
        'drop_out': drop_out,
        'drop_out_rate': drop_out_rate,
        'regularize': regularize,
        'initializers': initializers
    }

    # Model creation within TPU scope
    with tpu_strategy.scope():
        ann_layer_generator = ann_layer_gen(input_shape=input_shape, **init_params)
        ann_model = ann_layer_generator(out_act_fun=activations.sigmoid, out_unit=1, units=units, n_hidden_layers=n_hidden_layers)
        ann_model.compile(optimizer=optimizer, loss=losses.binary_crossentropy, metrics=['accuracy'])

    batch_size = 8 * tpu_strategy.num_replicas_in_sync

    # Model training
    history = ann_model.fit(train_dataset, validation_data=valid_dataset, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=0)

    val_accuracy = history.history['val_accuracy'][-1]
    train_accuracy = history.history['accuracy'][-1]
    val_loss  = history.history['val_loss'][-1]
    train_loss = history.history['loss'][-1]
    return (train_loss, train_accuracy)


TPU is already initialized.


In [None]:
# !rm -rf loan_approval_ann_study.db
loan_approval_ann_study = optuna.create_study(
    storage='sqlite:///loan_approval_ann_study.db',
    study_name='loan_approval_ann_study',
    directions=['minimize', 'maximize'],
    load_if_exists=True
)

loan_approval_ann_study.optimize(objective,  n_trials=10, show_progress_bar=True)

[I 2024-10-25 15:42:09,438] Using an existing study with name 'loan_approval_ann_study' instead of creating a new one.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2024-10-25 15:44:42,517] Trial 6 finished with values: [0.689157247543335, 0.9297754168510437] and parameters: {'epochs': 10, 'units': 400, 'n_hidden_layers': 15, 'batch_norm': True, 'drop_out': True, 'regularize': False, 'initializers': True, 'drop_out_rate': 0.3510723847441686, 'learning_rate': 1.765449524063833e-06, 'optimizer': 'Adam'}.
[I 2024-10-25 15:55:00,729] Trial 7 finished with values: [0.25426751375198364, 0.9297754168510437] and parameters: {'epochs': 70, 'units': 150, 'n_hidden_layers': 13, 'batch_norm': False, 'drop_out': True, 'regularize': False, 'initializers': False, 'drop_out_rate': 0.37719713820015044, 'learning_rate': 0.08191664108041778, 'optimizer': 'SGD'}.
[I 2024-10-25 16:14:21,955] Trial 8 finished with values: [8.083690643310547, 0.9297754168510437] and parameters: {'epochs': 100, 'units': 150, 'n_hidden_layers': 11, 'batch_norm': True, 'drop_out': True, 'regularize': True, 'initializers': True, 'drop_out_rate': 0.27312928286645444, 'learning_rate': 0.04

ValueError: Record does not exist.

In [39]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


print('encoding the final test data set')
ord_enc_test = OrdinalEncoder()
one_hot_enc_test = OneHotEncoder(sparse_output=False)

test_cat_cols = test_df.select_dtypes('object').columns
ordinal_cols = ['loan_grade']
one_hot_cols = [col for col in test_cat_cols if col != 'loan_grade']
remaining_cols = [col for col in test_df.columns if col not in list(test_cat_cols)]

encoder_test = ColumnTransformer(
            transformers=[
                ('ordinal_encoder', ord_enc_test, ordinal_cols),
                ('one_hot_encoder', one_hot_enc_test, one_hot_cols),
                ('passthrough', 'passthrough', remaining_cols )
            ]
)

encoded_data = encoder_test.fit_transform(test_df)
one_hot_enc_test.fit(test_df[one_hot_cols])
one_hot_encoded_cols = one_hot_enc_test.get_feature_names_out(one_hot_cols)
all_cols = ordinal_cols + list(one_hot_encoded_cols) + remaining_cols
test_df_encoded = pd.DataFrame(encoded_data, columns=all_cols)
test_df_encoded.info()

encoding the final test data set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_grade                      39098 non-null  float64
 1   person_home_ownership_MORTGAGE  39098 non-null  float64
 2   person_home_ownership_OTHER     39098 non-null  float64
 3   person_home_ownership_OWN       39098 non-null  float64
 4   person_home_ownership_RENT      39098 non-null  float64
 5   loan_intent_DEBTCONSOLIDATION   39098 non-null  float64
 6   loan_intent_EDUCATION           39098 non-null  float64
 7   loan_intent_HOMEIMPROVEMENT     39098 non-null  float64
 8   loan_intent_MEDICAL             39098 non-null  float64
 9   loan_intent_PERSONAL            39098 non-null  float64
 10  loan_intent_VENTURE             39098 non-null  float64
 11  cb_person_default_on_file_N     39098 non-null  float64
 12 

In [40]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

def prepare_test_data_for_tpu(df: pd.DataFrame, features_cols: list[str], target_col: str, batch_size: int = 32, poly_degree: int = 2, poly: bool = False):

    df = df.copy()
    features = df[features_cols]
    target = df[target_col]

    # Define pipeline for scaling and optional polynomial features
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=poly_degree, include_bias=False, interaction_only=True)) if poly else None
    ])
    pipe.steps = [step for step in pipe.steps if step is not None]

    # Transform features
    features_final = pipe.fit_transform(features)
    features_final_df = pd.DataFrame(features_final, columns=[f'feature_{i}' for i in range(features_final.shape[1])])

    # Create tf.data.Dataset without shuffling, batching only
    dataset = tf.data.Dataset.from_tensor_slices((dict(features_final_df), target))
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
