In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Define features (X) and target (y)
X = train.drop(columns=['target'])  # Replace 'target_column' with your actual target column name
y = train['target']  # Replace 'target_column' with the actual name of your target column

# Train-test split (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Define the preprocessing pipeline
def create_refactored_preprocessing_pipeline():
    # Columns to be encoded
    categorical_columns = ['loan_type', 'New_versus_Repeat','lender_id']  # Update with actual categorical column names
    
    # Columns to be scaled
    numerical_columns = ['Total_Amount', 'Total_Amount_to_Repay', 'Amount_Funded_By_Lender', 
                         'Lender_portion_Funded', 'Lender_portion_to_be_repaid']  # Update with actual numerical column names
    
    # Categorical transformer: handle missing values and one-hot encode
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent category
        ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode categorical variables
    ])
    # Numerical transformer: handle missing values and scale
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),            # Impute missing numerical values with the mean
        ('scaler', MinMaxScaler())                            # Standardize numerical features
    ])

    # Combine transformers in a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_columns),
            ('num', numerical_transformer, numerical_columns)
        ])
    
    return preprocessor

In [4]:
# Create and fit the preprocessing pipeline
preprocessor = create_refactored_preprocessing_pipeline()

# Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)  # Apply the same preprocessing to validation data

In [5]:
# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor3.pkl')

['preprocessor3.pkl']

In [6]:
# Print the preprocessed train dataset
print("\nPreprocessed Train Dataset:")
print(X_train_preprocessed)

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical  

# Convert target to categorical if it's a classification task
y_train_encoded = to_categorical(y_train)
y_val_encoded = to_categorical(y_val)           


Preprocessed Train Dataset:
  (0, 0)	1.0
  (0, 23)	1.0
  (0, 27)	1.0
  (0, 28)	5.8956526865784944e-05
  (0, 29)	5.343301200078694e-05
  (1, 0)	1.0
  (1, 23)	1.0
  (1, 27)	1.0
  (1, 28)	5.208696105104009e-05
  (1, 29)	4.863269722604761e-05
  (1, 30)	0.00022500000000000002
  (1, 31)	0.25682311293822113
  (1, 32)	0.0002036964034133148
  (2, 0)	1.0
  (2, 23)	1.0
  (2, 27)	1.0
  (2, 28)	0.00023852175987145737
  (2, 29)	0.0002172339169781625
  (3, 0)	1.0
  (3, 23)	1.0
  (3, 27)	1.0
  (3, 28)	9.426087776181546e-05
  (3, 29)	8.538264804249459e-05
  (3, 30)	0.000406875
  (3, 31)	0.25682311293822113
  :	:
  (54919, 32)	0.00018722499612921928
  (54920, 0)	1.0
  (54920, 23)	1.0
  (54920, 27)	1.0
  (54920, 28)	0.0006828261463327083
  (54920, 29)	0.0006180208538264805
  (54920, 30)	0.0029450625000000006
  (54920, 31)	0.25682311293822113
  (54920, 32)	0.002587109037421939
  (54921, 0)	1.0
  (54921, 23)	1.0
  (54921, 27)	1.0
  (54921, 28)	0.00033834785550850916
  (54921, 29)	0.00031414518984851467
  

In [8]:
# Build the feed-forward neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_preprocessed.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(y_train_encoded.shape[1], activation='softmax')  # Use softmax for multi-class classification
])           # Compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])            # Train the model
history = model.fit(
    X_train_preprocessed, y_train_encoded, 
    validation_data=(X_val_preprocessed, y_val_encoded),
    epochs=20, 
    batch_size=32
)          

Epoch 1/20
[1m1717/1717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9798 - loss: 0.1002 - val_accuracy: 0.9831 - val_loss: 0.0740
Epoch 2/20
[1m1717/1717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9832 - loss: 0.0756 - val_accuracy: 0.9832 - val_loss: 0.0719
Epoch 3/20
[1m1717/1717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9825 - loss: 0.0784 - val_accuracy: 0.9832 - val_loss: 0.0719
Epoch 4/20
[1m1717/1717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9832 - loss: 0.0750 - val_accuracy: 0.9829 - val_loss: 0.0716
Epoch 5/20
[1m1717/1717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9828 - loss: 0.0754 - val_accuracy: 0.9831 - val_loss: 0.0719
Epoch 6/20
[1m1717/1717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9820 - loss: 0.0769 - val_accuracy: 0.9832 - val_loss: 0.0723
Epoch 7/20
[1m1

In [9]:
# Save the model after training
model.save('fnn_model3.h5')
print("Model saved as 'fnn_model3.h5'")

# Load the saved model
from tensorflow.keras.models import load_model

model = load_model('fnn_model3.h5')
print("Model loaded successfully.")

# Evaluate using F1 Score on test data
y_val_pred_probs = model.predict(X_val_preprocessed)
y_val_pred = np.argmax(y_val_pred_probs, axis=1)

# Calculate F1 Score
f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Test F1 Score: {f1}")

Model saved as 'fnn_model3.h5'
Model loaded successfully.
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Test F1 Score: 0.5936704228947385


In [11]:
# Load the saved model
from tensorflow.keras.models import load_model

model = load_model('/kaggle/working/fnn_model3.h5')
print("Model loaded successfully.")

import joblib
# Load the preprocessor from the saved file
preprocessor = joblib.load('/kaggle/working/preprocessor3.pkl')  

import pandas as pd
# Load your test data from test.csv
test_data = pd.read_csv('/kaggle/input/zindidataset/Test.csv')


Model loaded successfully.


In [12]:
# Apply the preprocessor to the test data
X_test = test_data  
X_test_preprocessed = preprocessor.transform(X_test) 

# Make predictions on the test data
import numpy as np 
y_test_pred_probs = model.predict(X_test_preprocessed)  # Get prediction probabilities
y_test_pred = np.argmax(y_test_pred_probs, axis=1)  # Get the class with highest probability (predicted class)

[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [13]:
# Prepare the submission DataFrame
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Replace 'ID' with the correct column name from your test dataset
    'Target': y_test_pred  # The predicted target values (0 or 1)
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been saved.")

Submission file 'submission.csv' has been saved.
