In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV

In [35]:
data = pd.read_csv(r"C:\Users\user\OneDrive\Desktop\ML work\Loan_elegibility_Problem\loan_train.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [37]:

def fill_missing(data):
    # Fill numeric columns with mean
    numeric_cols = data.select_dtypes(include=['number']).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Fill categorical columns with mode
    categorical_cols = data.select_dtypes(include=['object']).columns
    data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

    return data
data = fill_missing(data)

In [38]:
data.isnull().any()

Loan_ID              False
Gender               False
Married              False
Dependents           False
Education            False
Self_Employed        False
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
Property_Area        False
Loan_Status          False
dtype: bool

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [40]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

def encode_categorical_data(data):
    encoded_data = data.copy()
    label_encoder = LabelEncoder()
    for col in encoded_data.columns:
        if encoded_data[col].dtype == 'object':
            encoded_data[col] = label_encoder.fit_transform(encoded_data[col])
    return encoded_data

def one_hot_encode(data):
    encoded_data = data.copy()
    for col in encoded_data.columns:
        if encoded_data[col].dtype == 'object':
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            encoded_col = pd.DataFrame(encoder.fit_transform(encoded_data[[col]]))
            encoded_col.columns = [col + '_' + str(i) for i in range(encoded_col.shape[1])]
            encoded_data = pd.concat([encoded_data, encoded_col], axis=1)
            encoded_data.drop(col, axis=1, inplace=True)
    return encoded_data


# First, encode categorical data using LabelEncoder
encoded_data = encode_categorical_data(data)

# Then, apply one-hot encoding
encoded_data = one_hot_encode(encoded_data)

encoded_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [41]:
encoded_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    int32  
 1   Gender             614 non-null    int32  
 2   Married            614 non-null    int32  
 3   Dependents         614 non-null    int32  
 4   Education          614 non-null    int32  
 5   Self_Employed      614 non-null    int32  
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    int32  
 12  Loan_Status        614 non-null    int32  
dtypes: float64(4), int32(8), int64(1)
memory usage: 43.3 KB


In [42]:
# Separate features and target variable
X = encoded_data.drop(columns=['Loan_Status'])
y = encoded_data['Loan_Status']

In [43]:
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# Standardizing features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
from sklearn.ensemble import RandomForestRegressor
# Initialize and fit the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importances
importances = rf_model.feature_importances_
# Sort indices based on importance
sorted_indices = np.argsort(importances)[::-1]

# Calculate total importance for normalization
total_importance = np.sum(importances)

In [46]:
# Print feature importance in percentage
for i in sorted_indices:
    print(f"{X.columns[i]}: {importances[i] / total_importance * 100:.2f}%")

Credit_History: 30.69%
LoanAmount: 14.79%
ApplicantIncome: 14.60%
Loan_ID: 14.18%
CoapplicantIncome: 8.07%
Loan_Amount_Term: 4.72%
Property_Area: 4.39%
Dependents: 2.94%
Married: 1.99%
Education: 1.31%
Gender: 1.19%
Self_Employed: 1.13%


In [47]:
# Standardizing features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [48]:
# Initialize the CNN model
model = Sequential()

In [49]:
# Build the model
model = Sequential([
    Dense(128, activation='sigmoid', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='sigmoid'),
    Dense(1)  # Output layer with single neuron for regression
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [50]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - loss: 0.2596 - val_loss: 0.2009
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.2144 - val_loss: 0.1681
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.2053 - val_loss: 0.1853
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2041 - val_loss: 0.1859
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.1957 - val_loss: 0.1531
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2017 - val_loss: 0.1492
Epoch 7/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.1823 - val_loss: 0.1483
Epoch 8/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.1677 - val_loss: 0.1487
Epoch 9/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━

In [53]:
# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
# Evaluate the model
train_mse = history.history['loss'][-1]  # Final training MSE
val_mse = history.history['val_loss'][-1]  # Final validation MSE

print("Final Training MSE:", train_mse)
print("Final Validation MSE:", val_mse)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1510 
Test Loss: 0.15590068697929382
Final Training MSE: 0.1531374454498291
Final Validation MSE: 0.13524477183818817
