### Study the back-propagation algorithm. Implement a classifier for the loan data with Decision as the output attribute. Prepare the data as needed. Submit the notebook file.

In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

num_samples = 200000

# Generate Gender with some missing values
gender = np.random.choice(['Male', 'Female', np.nan], size=num_samples, p=[0.59, 0.39, 0.02])

# Generate Married status dependent on Gender, with missing values
married = []
for g in gender:
    if g == 'Male':
        married.append(np.random.choice(['Yes', 'No', np.nan], p=[0.74, 0.24, 0.02]))
    elif g == 'Female':
        married.append(np.random.choice(['Yes', 'No', np.nan], p=[0.64, 0.34, 0.02]))
    else:
        married.append(np.nan)
married = np.array(married)

# Generate Dependents based on Married status, with missing values
dependents = []
for m in married:
    if m == 'Yes':
        dependents.append(np.random.choice(['0', '1', '2', '3+', np.nan], p=[0.28, 0.28, 0.24, 0.14, 0.06]))
    elif m == 'No':
        dependents.append(np.random.choice(['0', '1', '2', '3+', np.nan], p=[0.88, 0.05, 0.03, 0.02, 0.02]))
    else:
        dependents.append(np.nan)
dependents = np.array(dependents)

# Generate Education with missing values
education = np.random.choice(['Graduate', 'Not Graduate', np.nan], size=num_samples, p=[0.79, 0.19, 0.02])

# Generate Self_Employed based on Education, with missing values
self_employed = []
for edu in education:
    if edu == 'Graduate':
        self_employed.append(np.random.choice(['No', 'Yes', np.nan], p=[0.78, 0.20, 0.02]))
    elif edu == 'Not Graduate':
        self_employed.append(np.random.choice(['No', 'Yes', np.nan], p=[0.88, 0.10, 0.02]))
    else:
        self_employed.append(np.nan)
self_employed = np.array(self_employed)

# Generate ApplicantIncome with dependencies and missing values
applicant_income = []
for i in range(num_samples):
    mean_income = 5000
    std_income = 2000
    if education[i] == 'Graduate':
        mean_income += 1000
    if self_employed[i] == 'Yes':
        mean_income += 500
        std_income += 1000  # Higher variance for self-employed
    if gender[i] == 'Male':
        mean_income += 500
    if married[i] == 'Yes':
        mean_income += 500
    income = np.random.normal(loc=mean_income, scale=std_income)
    income = max(income, 1500)  # Minimum income of 1500
    # Introduce missing values
    if np.random.rand() < 0.01:
        applicant_income.append(np.nan)
    else:
        applicant_income.append(int(income))
applicant_income = np.array(applicant_income)

# Generate CoapplicantIncome based on Married status, with missing values
coapplicant_income = []
for i in range(num_samples):
    if married[i] == 'Yes':
        prob_coapplicant_works = 0.6
    else:
        prob_coapplicant_works = 0.2
    if np.random.rand() < prob_coapplicant_works:
        mean_co_income = 2000
        std_co_income = 1000
        income = np.random.normal(loc=mean_co_income, scale=std_co_income)
        income = max(income, 0)
        # Introduce missing values
        if np.random.rand() < 0.01:
            coapplicant_income.append(np.nan)
        else:
            coapplicant_income.append(int(income))
    else:
        coapplicant_income.append(0)
coapplicant_income = np.array(coapplicant_income)

# Handle missing values in ApplicantIncome or CoapplicantIncome when calculating TotalIncome
total_income = np.where(
    np.isnan(applicant_income), 
    coapplicant_income, 
    np.where(
        np.isnan(coapplicant_income), 
        applicant_income, 
        applicant_income + coapplicant_income
    )
)

# Generate LoanAmount based on TotalIncome, with missing values
loan_amount = []
for i in range(num_samples):
    if np.isnan(total_income[i]):
        loan_amount.append(np.nan)
        continue
    mean_loan = (total_income[i] / 1000) * np.random.uniform(4, 6)
    std_loan = mean_loan * 0.1  # 10% of mean
    loan = np.random.normal(loc=mean_loan, scale=std_loan)
    loan = max(loan, 50)  # Minimum loan amount of 50
    # Introduce missing values
    if np.random.rand() < 0.02:
        loan_amount.append(np.nan)
    else:
        loan_amount.append(int(loan))
loan_amount = np.array(loan_amount)

# Generate Loan_Amount_Term based on LoanAmount, with missing values
loan_amount_term = []
for amt in loan_amount:
    if np.isnan(amt):
        loan_amount_term.append(np.nan)
    else:
        if amt <= 100:
            term = np.random.choice([60, 120, 180, 360, np.nan], p=[0.38, 0.28, 0.18, 0.08, 0.08])
        elif amt <= 200:
            term = np.random.choice([120, 180, 360, np.nan], p=[0.09, 0.28, 0.55, 0.08])
        else:
            term = np.random.choice([180, 360, np.nan], p=[0.18, 0.74, 0.08])
        loan_amount_term.append(term)
loan_amount_term = np.array(loan_amount_term)

# Generate Credit_History based on TotalIncome, with missing values
credit_history = []
for income in total_income:
    if np.isnan(income):
        credit_history.append(np.nan)
        continue
    if income >= 7000:
        prob_good_credit = 0.89
    elif income >= 4000:
        prob_good_credit = 0.78
    else:
        prob_good_credit = 0.58
    credit = np.random.choice([1, 0, np.nan], p=[prob_good_credit, 1 - prob_good_credit - 0.02, 0.02])
    credit_history.append(credit)
credit_history = np.array(credit_history)

# Generate Property_Area based on TotalIncome, with missing values
property_area = []
for income in total_income:
    if np.isnan(income):
        property_area.append(np.nan)
    else:
        if income >= 7000:
            area = np.random.choice(['Urban', 'Semiurban', 'Rural', np.nan], p=[0.48, 0.28, 0.18, 0.06])
        elif income >= 4000:
            area = np.random.choice(['Urban', 'Semiurban', 'Rural', np.nan], p=[0.28, 0.48, 0.18, 0.06])
        else:
            area = np.random.choice(['Urban', 'Semiurban', 'Rural', np.nan], p=[0.18, 0.38, 0.38, 0.06])
        property_area.append(area)
property_area = np.array(property_area)

# Generate Decision based on multiple factors, with missing values
decision = []
for i in range(num_samples):
    # If any of the key fields are missing, the decision is missing
    if (np.isnan(applicant_income[i]) or np.isnan(loan_amount[i]) or
        np.isnan(loan_amount_term[i]) or np.isnan(credit_history[i]) or
        np.isnan(total_income[i])):
        decision.append(np.nan)
        continue
    approved = 1  # Assume approval
    if credit_history[i] == 0:
        approved = 0  # Bad credit history
    dti = (loan_amount[i] * 1000) / loan_amount_term[i] / total_income[i]
    if dti > 0.4:
        approved = 0  # Debt-to-income ratio too high
    if loan_amount[i] > total_income[i] * 10:
        approved = 0  # Loan amount too high compared to income
    if applicant_income[i] < 2000:
        approved = 0  # Applicant income too low
    decision.append(approved)
decision = np.array(decision)

# Create DataFrame
loan_data = pd.DataFrame({
    'Gender': gender,
    'Married': married,
    'Dependents': dependents,
    'Education': education,
    'Self_Employed': self_employed,
    'ApplicantIncome': applicant_income,
    'CoapplicantIncome': coapplicant_income,
    'LoanAmount': loan_amount,
    'Loan_Amount_Term': loan_amount_term,
    'Credit_History': credit_history,
    'Property_Area': property_area,
    'Decision': decision
})

# Introduce random missing values in the dataset (additional step)
missingness_prob = {
    'Gender': 0.01,
    'Married': 0.01,
    'Dependents': 0.02,
    'Education': 0.005,
    'Self_Employed': 0.02,
    'ApplicantIncome': 0.01,
    'CoapplicantIncome': 0.01,
    'LoanAmount': 0.002,
    'Loan_Amount_Term': 0.002,
    'Credit_History': 0.001,
    'Property_Area': 0.005,
    'Decision': 0.005
}

for column, prob in missingness_prob.items():
    loan_data.loc[loan_data.sample(frac=prob).index, column] = np.nan

loan_data.to_csv('Generated_loan_data.csv', index=False)


In [2]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Gender             198000 non-null  object 
 1   Married            198000 non-null  object 
 2   Dependents         196000 non-null  object 
 3   Education          199000 non-null  object 
 4   Self_Employed      196000 non-null  object 
 5   ApplicantIncome    195966 non-null  float64
 6   CoapplicantIncome  197040 non-null  float64
 7   LoanAmount         195474 non-null  float64
 8   Loan_Amount_Term   179892 non-null  float64
 9   Credit_History     195757 non-null  float64
 10  Property_Area      199000 non-null  object 
 11  Decision           173936 non-null  float64
dtypes: float64(6), object(6)
memory usage: 18.3+ MB


In [3]:
data = loan_data.copy()

In [4]:
# Check for missing values
missing_values = loan_data.isnull().sum()
print("Missing values in each column:")
print(missing_values)


Missing values in each column:
Gender                2000
Married               2000
Dependents            4000
Education             1000
Self_Employed         4000
ApplicantIncome       4034
CoapplicantIncome     2960
LoanAmount            4526
Loan_Amount_Term     20108
Credit_History        4243
Property_Area         1000
Decision             26064
dtype: int64


In [5]:
# Replace '3+' with 3 and convert to integer
loan_data['Dependents'] = loan_data['Dependents'].replace('3+', 3).astype(int)
print(loan_data.dtypes)

ValueError: invalid literal for int() with base 10: 'nan'

In [None]:
import numpy as np

# For 'ApplicantIncome'
income_q1 = loan_data['ApplicantIncome'].quantile(0.25)
income_q3 = loan_data['ApplicantIncome'].quantile(0.75)
income_iqr = income_q3 - income_q1

income_lower_bound = income_q1 - 1.5 * income_iqr
income_upper_bound = income_q3 + 1.5 * income_iqr

# Identify outliers
income_outliers = loan_data[(loan_data['ApplicantIncome'] < income_lower_bound) | (loan_data['ApplicantIncome'] > income_upper_bound)]
print(f"Number of outliers in 'ApplicantIncome': {income_outliers.shape[0]}")

# Optionally, cap the outliers
loan_data['ApplicantIncome'] = np.where(loan_data['ApplicantIncome'] > income_upper_bound, income_upper_bound,np.where(loan_data['ApplicantIncome'] < income_lower_bound, income_lower_bound, loan_data['ApplicantIncome']))

# For 'CoapplicantIncome'
CoapplicantIncome_q1 = loan_data['CoapplicantIncome'].quantile(0.25)
CoapplicantIncome_q3 = loan_data['CoapplicantIncome'].quantile(0.75)
CoapplicantIncome_iqr = income_q3 - income_q1

CoapplicantIncome_lower_bound = CoapplicantIncome_q1 - 1.5 * CoapplicantIncome_iqr
CoapplicantIncome_upper_bound = CoapplicantIncome_q3 + 1.5 * CoapplicantIncome_iqr

# Identify outliers
CoapplicantIncome_outliers = loan_data[(loan_data['CoapplicantIncome'] < CoapplicantIncome_lower_bound) | (loan_data['CoapplicantIncome'] > CoapplicantIncome_upper_bound)]
print(f"Number of outliers in 'CoapplicantIncome': {CoapplicantIncome_outliers.shape[0]}")

# Optionally, cap the outliers
loan_data['CoapplicantIncome'] = np.where(loan_data['CoapplicantIncome'] > CoapplicantIncome_upper_bound, CoapplicantIncome_upper_bound,np.where(loan_data['CoapplicantIncome'] < CoapplicantIncome_lower_bound, CoapplicantIncome_lower_bound, loan_data['CoapplicantIncome']))


# For 'LoanAmount'
LoanAmount_q1 = loan_data['LoanAmount'].quantile(0.25)
LoanAmount_q3 = loan_data['LoanAmount'].quantile(0.75)
LoanAmount_iqr = income_q3 - income_q1

LoanAmount_lower_bound = LoanAmount_q1 - 1.5 * LoanAmount_iqr
LoanAmount_upper_bound = LoanAmount_q3 + 1.5 * LoanAmount_iqr

# Identify outliers
LoanAmount_outliers = loan_data[(loan_data['LoanAmount'] < LoanAmount_lower_bound) | (loan_data['LoanAmount'] > LoanAmount_upper_bound)]
print(f"Number of outliers in 'LoanAmount': {LoanAmount_outliers.shape[0]}")

# Optionally, cap the outliers
loan_data['LoanAmount'] = np.where(loan_data['LoanAmount'] > LoanAmount_upper_bound, LoanAmount_upper_bound,np.where(loan_data['CoapplicantIncome'] < LoanAmount_lower_bound, LoanAmount_lower_bound, loan_data['LoanAmount']))



Number of outliers in 'ApplicantIncome': 0
Number of outliers in 'CoapplicantIncome': 0
Number of outliers in 'LoanAmount': 0
