In [None]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import association_rules as generate_association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the loan training CSV into a DataFrame.
DATASET_PATH = '/content/loan_train.csv'
dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
dataset

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,Male,No,0,Graduate,No,584900,0.0,15000000,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,458300,150800.0,12800000,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,300000,0.0,6600000,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,258300,235800.0,12000000,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,600000,0.0,14100000,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,290000,0.0,7100000,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,410600,0.0,4000000,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,807200,24000.0,25300000,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,758300,0.0,18700000,360.0,1.0,Urban,Y


# **Preprocessing**

In [None]:
# Count the missing (NaN) entries in each column.
dataset.isna().sum()

Gender                13
Married                3
Dependents            15
Education              0
Self_Employed         32
Applicant_Income       0
Coapplicant_Income     0
Loan_Amount            0
Term                  14
Credit_History        50
Area                   0
Status                 0
dtype: int64

In [None]:
# List the distinct values stored in Credit_History (NaN is reported as a value).
dataset["Credit_History"].unique()

array([ 1.,  0., nan])

In [None]:
# Impute missing Credit_History with the column's most frequent value.
# The filled column is assigned back rather than using fillna(inplace=True) on
# the selected column: the in-place form works on an intermediate object and
# does not update `dataset` under pandas Copy-on-Write.
mode_credit_hist = dataset["Credit_History"].mode()[0]
dataset["Credit_History"] = dataset["Credit_History"].fillna(mode_credit_hist)

In [None]:
# Impute missing Term with the column's most frequent value.
# Assign the result back; fillna(inplace=True) on a selected column does not
# reliably write through to `dataset` (pandas Copy-on-Write).
mode_term = dataset["Term"].mode()[0]
dataset["Term"] = dataset["Term"].fillna(mode_term)

In [None]:
# Substitute the column mean for every Loan_Amount that is exactly zero.
# The mean is taken over the column as it currently stands (zeros included).
loan_amount = dataset["Loan_Amount"]
mean_loan_amount = loan_amount.mean()
dataset["Loan_Amount"] = loan_amount.replace([0.0], [mean_loan_amount])

In [None]:
# Substitute the column mean for every Coapplicant_Income that is exactly zero.
# The replacement value must be the co-applicant income mean; using the
# loan-amount mean here would inject loan-sized values into the income column.
mean_coapp_income = dataset["Coapplicant_Income"].mean()
dataset["Coapplicant_Income"] = dataset["Coapplicant_Income"].replace([0.0], [mean_coapp_income])

In [None]:
# Locate the positions where Applicant_Income equals zero.
# An empty index array in the result means no such rows exist.
zero_income_mask = dataset["Applicant_Income"].eq(0.0)
np.where(zero_income_mask)

(array([], dtype=int64),)

In [None]:
# Impute missing Self_Employed with the column's most frequent value.
# Assign the result back; fillna(inplace=True) on a selected column does not
# reliably write through to `dataset` (pandas Copy-on-Write).
mode_self_employed = dataset["Self_Employed"].mode()[0]
dataset["Self_Employed"] = dataset["Self_Employed"].fillna(mode_self_employed)

In [None]:
# Impute missing Dependents with the column's most frequent value.
# Assign the result back; fillna(inplace=True) on a selected column does not
# reliably write through to `dataset` (pandas Copy-on-Write).
mode_dependents = dataset["Dependents"].mode()[0]
dataset["Dependents"] = dataset["Dependents"].fillna(mode_dependents)

In [None]:
# Impute missing Married with the column's most frequent value.
# Assign the result back; fillna(inplace=True) on a selected column does not
# reliably write through to `dataset` (pandas Copy-on-Write).
mode_married = dataset["Married"].mode()[0]
dataset["Married"] = dataset["Married"].fillna(mode_married)

In [None]:
# Impute missing Gender with the column's most frequent value.
# Assign the result back; fillna(inplace=True) on a selected column does not
# reliably write through to `dataset` (pandas Copy-on-Write).
mode_gender = dataset["Gender"].mode()[0]
dataset["Gender"] = dataset["Gender"].fillna(mode_gender)

In [None]:
# Re-count missing values per column to confirm the imputation left none.
dataset.isna().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
Applicant_Income      0
Coapplicant_Income    0
Loan_Amount           0
Term                  0
Credit_History        0
Area                  0
Status                0
dtype: int64

In [None]:
# Display the DataFrame after the missing-value and zero-value replacements.
dataset

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,Male,No,0,Graduate,No,584900,1.414104e+07,15000000.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,458300,1.508000e+05,12800000.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,300000,1.414104e+07,6600000.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,258300,2.358000e+05,12000000.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,600000,1.414104e+07,14100000.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,290000,1.414104e+07,7100000.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,410600,1.414104e+07,4000000.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,807200,2.400000e+04,25300000.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,758300,1.414104e+07,18700000.0,360.0,1.0,Urban,Y


In [None]:
# Label-encode the categorical columns: every distinct value in a column is
# mapped to an integer code by a LabelEncoder fitted on that column alone.
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Term', 'Area']
encoded_columns = {
    name: LabelEncoder().fit_transform(dataset[name])
    for name in categorical_cols
}
dataset = dataset.assign(**encoded_columns)
dataset

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,1,0,0,0,0,584900,1.414104e+07,15000000.0,8,1.0,2,Y
1,1,1,1,0,0,458300,1.508000e+05,12800000.0,8,1.0,0,N
2,1,1,0,0,1,300000,1.414104e+07,6600000.0,8,1.0,2,Y
3,1,1,0,1,0,258300,2.358000e+05,12000000.0,8,1.0,2,Y
4,1,0,0,0,0,600000,1.414104e+07,14100000.0,8,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,290000,1.414104e+07,7100000.0,8,1.0,0,Y
610,1,1,3,0,0,410600,1.414104e+07,4000000.0,5,1.0,0,Y
611,1,1,1,0,0,807200,2.400000e+04,25300000.0,8,1.0,2,Y
612,1,1,2,0,0,758300,1.414104e+07,18700000.0,8,1.0,2,Y


In [None]:
# Discretize the continuous columns into quantile bins.
# q=4 requests quartiles, labels=False yields integer bin indices, and
# duplicates='drop' merges bins whose edges coincide.
cont_cols = ['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount']
binned_columns = {
    name: pd.qcut(dataset[name], q=4, labels=False, duplicates='drop')
    for name in cont_cols
}
dataset = dataset.assign(**binned_columns)
dataset

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,1,0,0,0,0,3,2,2,8,1.0,2,Y
1,1,1,1,0,0,2,0,1,8,1.0,0,N
2,1,1,0,0,1,1,2,0,8,1.0,2,Y
3,1,1,0,1,0,0,1,1,8,1.0,2,Y
4,1,0,0,0,0,3,2,2,8,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,1,2,0,8,1.0,0,Y
610,1,1,3,0,0,2,2,0,5,1.0,0,Y
611,1,1,1,0,0,3,0,3,8,1.0,2,Y
612,1,1,2,0,0,3,2,3,8,1.0,2,Y


In [None]:
# Turn every row into a transaction: a list of "column=value" item strings.
# Series.items() is used because Series.iteritems() is deprecated and was
# removed in pandas 2.0.
transactions = [
    [f"{column}={value}" for column, value in row.items()]
    for _, row in dataset.iterrows()
]
# Preview only the first few transactions instead of dumping the whole list.
transactions[:3]

  transaction = [f"{column}={value}" for column, value in row.iteritems()]


[['Gender=1',
  'Married=0',
  'Dependents=0',
  'Education=0',
  'Self_Employed=0',
  'Applicant_Income=3',
  'Coapplicant_Income=2',
  'Loan_Amount=2',
  'Term=8',
  'Credit_History=1.0',
  'Area=2',
  'Status=Y'],
 ['Gender=1',
  'Married=1',
  'Dependents=1',
  'Education=0',
  'Self_Employed=0',
  'Applicant_Income=2',
  'Coapplicant_Income=0',
  'Loan_Amount=1',
  'Term=8',
  'Credit_History=1.0',
  'Area=0',
  'Status=N'],
 ['Gender=1',
  'Married=1',
  'Dependents=0',
  'Education=0',
  'Self_Employed=1',
  'Applicant_Income=1',
  'Coapplicant_Income=2',
  'Loan_Amount=0',
  'Term=8',
  'Credit_History=1.0',
  'Area=2',
  'Status=Y'],
 ['Gender=1',
  'Married=1',
  'Dependents=0',
  'Education=1',
  'Self_Employed=0',
  'Applicant_Income=0',
  'Coapplicant_Income=1',
  'Loan_Amount=1',
  'Term=8',
  'Credit_History=1.0',
  'Area=2',
  'Status=Y'],
 ['Gender=1',
  'Married=0',
  'Dependents=0',
  'Education=0',
  'Self_Employed=0',
  'Applicant_Income=3',
  'Coapplicant_Income=2

In [None]:
# Convert the transactions to a binary (one column per item) transaction format
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
transactions_dataset = pd.DataFrame(te_ary, columns=te.columns_)

# Split the transaction data into training and test sets (70% / 30%, fixed seed)
train_dataset, test_dataset = train_test_split(transactions_dataset, test_size=0.3, random_state=42)

# Generate frequent itemsets from the training transactions only.
# NOTE(review): min_support=0.6 keeps only itemsets present in at least 60% of
# the training rows, so any item rarer than that can never appear in a rule.
frequent_itemsets = apriori(train_dataset, min_support=0.6, use_colnames=True)

# Generate association rules with confidence >= 0.7.
# The generator is called through its `generate_association_rules` alias
# because the result is bound to the name `association_rules` (later cells read
# it under that name); calling the function by that same name would make this
# cell fail on a second run, once the name refers to a DataFrame.
association_rules = generate_association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Display the association rules
association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Education=0),(Credit_History=1.0),0.778555,0.86014,0.675991,0.868263,1.009445,0.006325,1.061666
1,(Credit_History=1.0),(Education=0),0.86014,0.778555,0.675991,0.785908,1.009445,0.006325,1.034345
2,(Gender=1),(Credit_History=1.0),0.804196,0.86014,0.692308,0.86087,1.000848,0.000587,1.005245
3,(Credit_History=1.0),(Gender=1),0.86014,0.804196,0.692308,0.804878,1.000848,0.000587,1.003497
4,(Credit_History=1.0),(Self_Employed=0),0.86014,0.862471,0.74359,0.864499,1.002351,0.001744,1.014965
5,(Self_Employed=0),(Credit_History=1.0),0.862471,0.86014,0.74359,0.862162,1.002351,0.001744,1.014672
6,(Status=Y),(Credit_History=1.0),0.703963,0.86014,0.692308,0.983444,1.143353,0.086801,8.447552
7,(Credit_History=1.0),(Status=Y),0.86014,0.703963,0.692308,0.804878,1.143353,0.086801,1.517191
8,(Term=8),(Credit_History=1.0),0.841492,0.86014,0.734266,0.872576,1.014458,0.010465,1.097598
9,(Credit_History=1.0),(Term=8),0.86014,0.841492,0.734266,0.853659,1.014458,0.010465,1.083139


In [None]:
# Split the mined rules by the loan status found in their consequents
def _consequent_contains(item):
    """Return a predicate that is true for consequent sets containing `item`."""
    return lambda consequents: item in consequents


rule_consequents = association_rules['consequents']
loan_status_Yes_rules = association_rules[rule_consequents.apply(_consequent_contains('Status=Y'))]
loan_status_No_rules = association_rules[rule_consequents.apply(_consequent_contains('Status=N'))]

In [None]:
# Display the rules whose consequents contain 'Status=Y'.
loan_status_Yes_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7,(Credit_History=1.0),(Status=Y),0.86014,0.703963,0.692308,0.804878,1.143353,0.086801,1.517191
21,(Self_Employed=0),(Status=Y),0.862471,0.703963,0.608392,0.705405,1.002049,0.001244,1.004897
25,(Term=8),(Status=Y),0.841492,0.703963,0.610723,0.725762,1.030966,0.018344,1.07949
34,"(Term=8, Credit_History=1.0)",(Status=Y),0.734266,0.703963,0.601399,0.819048,1.163482,0.084503,1.635996
36,(Term=8),"(Status=Y, Credit_History=1.0)",0.841492,0.692308,0.601399,0.714681,1.032318,0.018827,1.078417


In [None]:
# Display the rules whose consequents contain 'Status=N'.
loan_status_No_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [None]:
def predict_with_rules(instance, rules, min_confidence=0.7):
    """Predict the consequents of the most confident rule that matches an instance.

    A rule matches when every item of its antecedent is present in `instance`.
    Among matching rules, the one with the highest confidence wins; on a tie
    the rule that appears first in `rules` is kept.

    Parameters
    ----------
    instance : iterable of hashable
        The items present in the instance (e.g. "column=value" strings).
    rules : pandas.DataFrame
        Rule table with 'antecedents', 'consequents' and 'confidence' columns.
    min_confidence : float, default 0.7
        A rule is considered only if its confidence is strictly greater than
        this value.

    Returns
    -------
    list
        The winning rule's consequents in sorted order (so the result does not
        depend on set iteration order), or an empty list if no rule qualifies.
    """
    # A set gives constant-time membership checks for the antecedent test.
    present_items = set(instance)
    best_consequents = []
    best_confidence = min_confidence
    for antecedents, consequents, confidence in zip(
        rules['antecedents'], rules['consequents'], rules['confidence']
    ):
        # Strict comparison keeps the earliest rule when confidences are equal.
        if confidence > best_confidence and present_items.issuperset(antecedents):
            best_consequents = sorted(consequents)
            best_confidence = confidence
    return best_consequents

# Apply the 'Status=Y' rules to every row of the test split
def _present_items(row):
    """Return the column names whose value in this one-hot row is truthy."""
    return [name for name, present in row.items() if present]


test_instances = test_dataset.apply(_present_items, axis=1)
predictions = test_instances.apply(lambda itemset: predict_with_rules(itemset, loan_status_Yes_rules))

In [None]:
# Fetch the true status from the test data and store it in a list.
# Each variable reads the one-hot column that matches its name; reading them
# from swapped columns would invert every ground-truth label.
status_Y_test = test_dataset['Status=Y']
status_N_test = test_dataset['Status=N']
y_true = ['Status=Y' if is_yes else 'Status=N' for is_yes in status_Y_test]

In [None]:
# Fetch the predicted status for every test instance in a list
def _status_label(consequents, default='Status=N'):
    """Pick the 'Status=...' item out of a predicted consequent list.

    A consequent may hold several items, so the status item is selected
    explicitly instead of taking position 0. When no status item is present
    (including an empty prediction because no rule matched), `default` is
    returned instead of raising IndexError.
    """
    return next((item for item in consequents if item.startswith('Status=')), default)


# Only rules concluding 'Status=Y' are applied to build `predictions`, so an
# instance that no rule matched falls back to 'Status=N'.
y_pred = [_status_label(consequents) for consequents in predictions]

In [None]:
# Summarize per-class precision, recall and F1 of the rule-based predictions
classification_repo = classification_report(y_true=y_true, y_pred=y_pred)
print(classification_repo)

              precision    recall  f1-score   support

    Status=N       0.00      0.00      0.00       120
    Status=Y       0.35      1.00      0.52        65

    accuracy                           0.35       185
   macro avg       0.18      0.50      0.26       185
weighted avg       0.12      0.35      0.18       185



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
