# Import the created functions

In [1]:
import numpy as np
import pandas as pd

from src.prepare_data import prepare
from src.preprocess_data import preprocess
from src.train_models import model_1, model_2
from src.train_models import model_1 as train_model_1, model_2 as train_model_2

# Retrieve and prepare the data

In [2]:
# Retrieve data
df = prepare('raw_data.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appended a stray "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB
None


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Preprocess the data

In [3]:
# Preprocess the data
preprocessed_df = preprocess(df)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appended a stray "None" line.
preprocessed_df.info()
preprocessed_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 371 entries, 1 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             371 non-null    object 
 1   Married            371 non-null    object 
 2   Dependents         371 non-null    object 
 3   Education          371 non-null    object 
 4   Self_Employed      371 non-null    object 
 5   ApplicantIncome    371 non-null    int64  
 6   CoapplicantIncome  371 non-null    float64
 7   LoanAmount         371 non-null    float64
 8   Loan_Amount_Term   371 non-null    float64
 9   Credit_History     371 non-null    float64
 10  Property_Area      371 non-null    object 
 11  Loan_Status        371 non-null    int32  
dtypes: float64(4), int32(1), int64(1), object(6)
memory usage: 36.2+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Loan_Status'] = df['Loan_Status'].map({'Y' : 1, 'N' : 0}).astype('int')


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
6,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,1


# Model 1

## Create and fit Model 1

In [4]:
# Train through the `train_model_1` alias of src.train_models.model_1. The
# notebook variable `model_1` (used by the prediction and save cells below) is
# still bound to the returned model, but the training callable is no longer
# overwritten, so re-running this cell trains again instead of calling the
# previously returned model object.
model_1 = train_model_1(preprocessed_df)

Logistic Regression: 86.6667
Training MAE: 0.18
Test data MAE: 0.13


In [5]:
def make_prediction_model_1(row, model=None):
    """Predict the loan decision for a single applicant with Model 1.

    Parameters
    ----------
    row : pandas.Series or sequence
        The applicant's feature values in this positional order: Gender,
        Married, Dependents, Education, Self_Employed, ApplicantIncome,
        CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History,
        Property_Area. Values are read by position, so the index labels of a
        Series do not matter. Values after the eleventh are ignored.
    model : object with a ``predict`` method, optional
        Estimator used for the prediction. Defaults to the notebook-level
        ``model_1``.

    Returns
    -------
    str
        "Approved" when the predicted label is 1, "Rejected" when it is 0 and
        "Error" for any other label.

    Raises
    ------
    ValueError
        If ``row`` holds fewer values than there are feature columns.
    """
    feature_columns = (
        "Gender",
        "Married",
        "Dependents",
        "Education",
        "Self_Employed",
        "ApplicantIncome",
        "CoapplicantIncome",
        "LoanAmount",
        "Loan_Amount_Term",
        "Credit_History",
        "Property_Area",
    )

    # Read the values by position. ``row[i]`` on a Series is a label lookup,
    # so it only worked when the index happened to be exactly 0..n-1.
    values = row.tolist() if isinstance(row, pd.Series) else list(row)
    if len(values) < len(feature_columns):
        raise ValueError(
            f"expected {len(feature_columns)} feature values, got {len(values)}"
        )

    if model is None:
        model = model_1

    # zip() stops after the last feature column, so surplus values are dropped.
    data = dict(zip(feature_columns, values))
    df_predict = pd.DataFrame(data, index=[0])
    prediction = model.predict(df_predict)[0]

    if prediction == 0:
        return "Rejected"
    if prediction == 1:
        return "Approved"
    return "Error"

In [6]:
# Smoke-test Model 1 on three applicants; values are passed in the positional
# order that make_prediction_model_1 reads them.
# NOTE(review): the first applicant has the same features as index 1 of the
# frames displayed above, whose Loan_Status is N / 0, so the "Approved" printed
# for it does not match that row's recorded label.
print(make_prediction_model_1(pd.Series(['Male', 'Yes', '1', 'Graduate', 'No', 4583, 1508.0, 128.0, 360.0, 1.0, 'Rural']))) # Prints "Approved" (1); recorded label for these features is 0 - see note above
print(make_prediction_model_1(pd.Series(['Male', 'No', '0', 'Graduate', 'No', 6000, 0.0, 141.0, 360.0, 1.0, 'Urban']))) # Same features as index 4 above (Loan_Status 1): should output 1: "Approved"
print(make_prediction_model_1(pd.Series(['Female', 'No', '4', 'Not Graduate', 'Yes', 200, 0.0, 1000.0, 360.0, 0.0, 'Rural']))) # Not among the rows shown above (presumably hand-made): should output 0: "Rejected"

Approved
Approved
Rejected


## Save Model 1

In [7]:
import joblib

# Write the Model 1 object to the artifacts folder. The dump call stays the
# last expression of the cell so its return value is still displayed.
joblib.dump(value=model_1, filename='../artifacts/model_1.pkl')

['../artifacts/model_1.pkl']

# Model 2 (With Feature Engineering)

In [8]:
# Train through the `train_model_2` alias of src.train_models.model_2. The
# notebook variable `model_2` (used by the prediction and save cells below) is
# still bound to the returned model, but the training callable is no longer
# overwritten, so re-running this cell trains again instead of calling the
# previously returned model object.
# NOTE(review): the metrics recorded for this cell are identical to Model 1's;
# confirm that the engineered Income feature actually reaches the estimator.
model_2 = train_model_2(preprocessed_df)

Logistic Regression: 86.6667
Training MAE: 0.18
Test data MAE: 0.13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['ApplicantIncome', 'CoapplicantIncome'], inplace=True)


In [9]:
def make_prediction_model_2(row, model=None):
    """Predict the loan decision for a single applicant with Model 2.

    Parameters
    ----------
    row : pandas.Series or sequence
        The applicant's feature values in this positional order: Gender,
        Married, Dependents, Education, Self_Employed, LoanAmount,
        Loan_Amount_Term, Credit_History, Property_Area, Income. Values are
        read by position, so the index labels of a Series do not matter.
        Values after the tenth are ignored.
    model : object with a ``predict`` method, optional
        Estimator used for the prediction. Defaults to the notebook-level
        ``model_2``.

    Returns
    -------
    str
        "Approved" when the predicted label is 1, "Rejected" when it is 0 and
        "Error" for any other label.

    Raises
    ------
    ValueError
        If ``row`` holds fewer values than there are feature columns.
    """
    feature_columns = (
        "Gender",
        "Married",
        "Dependents",
        "Education",
        "Self_Employed",
        "LoanAmount",
        "Loan_Amount_Term",
        "Credit_History",
        "Property_Area",
        "Income",
    )

    # Read the values by position. ``row[i]`` on a Series is a label lookup,
    # so it only worked when the index happened to be exactly 0..n-1.
    values = row.tolist() if isinstance(row, pd.Series) else list(row)
    if len(values) < len(feature_columns):
        raise ValueError(
            f"expected {len(feature_columns)} feature values, got {len(values)}"
        )

    if model is None:
        model = model_2

    # zip() stops after the last feature column, so surplus values are dropped.
    data = dict(zip(feature_columns, values))
    df_predict = pd.DataFrame(data, index=[0])
    prediction = model.predict(df_predict)[0]

    if prediction == 0:
        return "Rejected"
    if prediction == 1:
        return "Approved"
    return "Error"

In [10]:
# Smoke-test Model 2 on three applicants; values are passed in the positional
# order that make_prediction_model_2 reads them, with Income given as
# ApplicantIncome + CoapplicantIncome.
# NOTE(review): the first applicant has the same features as index 1 of the
# frames displayed above, whose Loan_Status is N / 0, so the "Approved" printed
# for it does not match that row's recorded label.
print(make_prediction_model_2(pd.Series(['Male', 'Yes', '1', 'Graduate', 'No', 128.0, 360.0, 1.0, 'Rural', (4583 + 1508)]))) # Prints "Approved" (1); recorded label for these features is 0 - see note above
print(make_prediction_model_2(pd.Series(['Male', 'No', '0', 'Graduate', 'No', 141.0, 360.0, 1.0, 'Urban', (6000 + 0)]))) # Same features as index 4 above (Loan_Status 1): should output 1
print(make_prediction_model_2(pd.Series(['Female', 'No', '4', 'Not Graduate', 'Yes', 1000.0, 360.0, 0.0, 'Rural', (200 + 0)]))) # Not among the rows shown above (presumably hand-made): should output 0

Approved
Approved
Rejected


## Save Model 2

In [11]:
import joblib

# Write the Model 2 object to the artifacts folder. The dump call stays the
# last expression of the cell so its return value is still displayed.
joblib.dump(value=model_2, filename='../artifacts/model_2.pkl')

['../artifacts/model_2.pkl']