In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt

from sklearn.preprocessing  import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/loan-prediction-with-3-problem-statement/sample_submission.csv
/kaggle/input/loan-prediction-with-3-problem-statement/training_set.csv
/kaggle/input/loan-prediction-with-3-problem-statement/data_description.txt
/kaggle/input/loan-prediction-with-3-problem-statement/testing_set.csv
/kaggle/input/loan-prediction-with-3-problem-statement/Problem_statment.txt


# Project description

The objective of this project was to use **Object Oriented Programming** for a **data science** problem, specifically for loan prediction.

The dataset consists with data for each loan applicant with information such as marriage status, income, gender, and others. 

For this project three different csv files are used:
   1. training set
   2. testing set
   3. testing set loan classes

# Datasets of Use

In [2]:
df_sample_submission = pd.read_csv('/kaggle/input/loan-prediction-with-3-problem-statement/sample_submission.csv')
df_sample_submission.head(3)

Unnamed: 0.1,Unnamed: 0,Loan_ID,Loan_Status
0,0,LP001015,Y
1,1,LP001022,Y
2,2,LP001031,Y


In [3]:
df_training_set = pd.read_csv('/kaggle/input/loan-prediction-with-3-problem-statement/training_set.csv')
df_training_set.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849.0,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban,Y


In [4]:
df_testing_set = pd.read_csv('/kaggle/input/loan-prediction-with-3-problem-statement/testing_set.csv')
df_testing_set.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban


# Classes

## ExploratoryDataAnalysis Class

In [5]:
class ExploratoryDataAnalysis():
    
    def __init__(self, data):
        print('Exploring your dataset!')
        self.data = data
     
    def missing_values_method(self, data):
        # return the total number of missing values of the dataset
        return data.isnull().sum()
    
    def data_description_method(self, data):

        # shape of the dataset
        rows, columns = data.shape
        
        # column names  
        features = data.columns
        
        # data types
        data_types = data.dtypes
        
        # missing values
        missing_values = self.missing_values_method(data)
        
        
        print('Information about your dataset!')
        print()
        print('This dataset has {} rows and {} columns'.format(rows, columns))
        print()
        print('The features of the dataset are:')
        print(features)
        print()
        print('The following is information about the data type of the dataset:')
        print(data_types)
        print()
        print('The following is information about any missing values of the dataset:')
        print(missing_values)


## PreProcessing Class

In [6]:
class PreProcessing():
    
    def __init__(self, data):
        print('Pre-processing your dataset!')
        self.data = data
    
    # dropna
    def drop(self, data, columns):
        data = data.dropna(subset=columns)
        data.reset_index(drop=True, inplace=True)
        return data
        
    # fillna numerical 
    def fill(self, data, column, filling):
        data_ = data.copy()
        data_[column].fillna(filling, inplace=True)
        return data_
    
    # label encoding
    def labelEncoding(self, data, column):
        le = LabelEncoder()        
        data[column] = le.fit_transform(data[column])

        return data

## Processing Class

In [18]:
class Processing():
    def __init__(self, data):
        self.data = data
        self.preprocessing = PreProcessing(data)
        print('Processing your dataset!')

    def processing(self, data):
        # drop
        data = self.preprocessing.drop(data, ['Gender', 'ApplicantIncome', 'LoanAmount'])
        
        # fill
        data = self.preprocessing.fill(data, 'CoapplicantIncome', 0)
        data = self.preprocessing.fill(data, 'Credit_History', 0)
        data = self.preprocessing.fill(data, 'Loan_Amount_Term', 360)
        data = self.preprocessing.fill(data, 'Dependents', '0')
        data = self.preprocessing.fill(data, 'Married', 'No')
        data = self.preprocessing.fill(data, 'Self_Employed', 'No')
        data = self.preprocessing.fill(data, 'Education', 'Not Graduate')
       
        # encode       
        data = self.preprocessing.labelEncoding(data, 'Gender')
        data = self.preprocessing.labelEncoding(data, 'Married')
        data = self.preprocessing.labelEncoding(data, 'Dependents')
        data = self.preprocessing.labelEncoding(data, 'Education')
        data = self.preprocessing.labelEncoding(data, 'Self_Employed')
        data = self.preprocessing.labelEncoding(data, 'property_Area')
        
        return data

## MachineLearning Class

In [8]:
class MachineLearning():
    def __init__(self, train_data, test_data, test_classes):
        print('Your dataset is ready for some Machine Learning algorithms!')
        self.train_data = train_data
        self.test_data = test_data
        self.test_classes = test_classes[test_classes['Loan_ID'].isin(test_data['Loan_ID'])]
        
    def models(self):
        x_train = self.train_data.drop(['Loan_Status', 'Loan_ID'], axis=1)
        x_test = self.test_data.loc[:, self.test_data.columns != 'Loan_ID']
        y_train = self.train_data['Loan_Status']
        y_test = self.test_classes['Loan_Status']
        
        x_train_ss = StandardScaler().fit_transform(x_train)
        x_test_ss = StandardScaler().fit_transform(x_test)
        
        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "Gaussian Process",
            "Decision Tree",
            "Random Forest",
            "Neural Net",
            "AdaBoost",
            "Naive Bayes",
            "QDA",
            "Logistic Regression"]

        classifiers = [
            KNeighborsClassifier(),
            SVC(kernel="linear", C=0.025),
            GaussianProcessClassifier(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            MLPClassifier(alpha=1, max_iter=1000),
            AdaBoostClassifier(),
            GaussianNB(),
            QuadraticDiscriminantAnalysis(),
            LogisticRegression()]

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            clf.fit(x_train_ss, y_train)
            y_pred = clf.predict(x_test_ss)
            print(name)
            print('---'*20)
            print(classification_report(y_test, y_pred))
            print('---'*20)
            print('---'*20)

# Usage of Classes

In [9]:
# define ExploratoryDataAnalysis class
eda_step = ExploratoryDataAnalysis(df_training_set)

Exploring your dataset!


In [10]:
# use data_description_method method from ExploratoryDataAnalysis class
eda_step.data_description_method(df_training_set)

Information about your dataset!

This dataset has 614 rows and 13 columns

The features of the dataset are:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'property_Area', 'Loan_Status'],
      dtype='object')

The following is information about the data type of the dataset:
Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
property_Area         object
Loan_Status           object
dtype: object

The following is information about any missing values of the dataset:
Loan_ID               0
Gender               15
Married               3
Dependents           15
Education             1


In [23]:
# define Processing class
processing_step = Processing(df_training_set)

Pre-processing your dataset!
Processing your dataset!


In [24]:
# use processing method from Processing class to process training data
mydata_train = processing_step.processing(df_training_set)

In [21]:
# explore the processed training data 
eda_step.data_description_method(mydata_train)

Information about your dataset!

This dataset has 575 rows and 13 columns

The features of the dataset are:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'property_Area', 'Loan_Status'],
      dtype='object')

The following is information about the data type of the dataset:
Loan_ID               object
Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
property_Area          int64
Loan_Status           object
dtype: object

The following is information about any missing values of the dataset:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_

In [25]:
# process test data
mydata_test = processing_step.processing(df_testing_set)

In [26]:
# define MachineLearning class
machinelearning_step = MachineLearning(mydata_train, mydata_test, df_sample_submission)

Your dataset is ready for some Machine Learning algorithms!


In [27]:
# use models method from MachineLearning class 
machinelearning_step.models()

Nearest Neighbors
------------------------------------------------------------
              precision    recall  f1-score   support

           N       0.68      0.71      0.70        56
           Y       0.95      0.94      0.94       295

    accuracy                           0.90       351
   macro avg       0.81      0.82      0.82       351
weighted avg       0.90      0.90      0.90       351

------------------------------------------------------------
------------------------------------------------------------
Linear SVM
------------------------------------------------------------
              precision    recall  f1-score   support

           N       0.66      1.00      0.79        56
           Y       1.00      0.90      0.95       295

    accuracy                           0.92       351
   macro avg       0.83      0.95      0.87       351
weighted avg       0.95      0.92      0.92       351

------------------------------------------------------------
------------