In [47]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

In [48]:
TrainData = pd.read_csv("Train Data.csv", delimiter = ',', header = 0, index_col = 0)
TrainData

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [59]:
# Data Cleaning Pipeline
# 1) Imputing missing values (mean, median, most_frequent, knn, dropna)
# 2) Normalizing numerical features (optional, depends on model) (minmaxscaler, standardscaler, robustscaler)
# 2) Encoding categorical features (optional, depends on model) (labelencoder, one-hotencoder)
# 3) Remove outliers (IQR, Z-score, Multilinear Regression, IsolationForest)

# 5) Train a machine learning model
# 6) Evaluate and optimise the model
# 7) Clean new data (steps 1-4)
# 8) Fit the model on new data

numeric_features = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
categorical_features = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Credit_History", "Property_Area", "Loan_Status"]

In [60]:
print(TrainData.isnull().sum())

Gender               13
Married               0
Dependents           12
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [61]:
# Using conclusions from data visualization section, decide for each feature how to fill the missing values.

# Gender: most_frequent (male), since there is a huge variance between the two.
# Married: we can drop the 3 tuples, knowing, their effect over 614 tuples is not significant anyway.
TrainData.drop(index=[row for row in TrainData.index 
                    if pd.isna(TrainData.loc[row, 'Married'])], inplace=True)
# Dependents: median
# Self_Employed: most_frequent
# Credit_History: most_frequent
# LoanAmount: (normally distributed variable, since mean=342, median=360, mode=360) impute with mean
# Loan_Amount_Term: impute with median

numeric_null_features = ["LoanAmount", "Loan_Amount_Term"]
categorical_null_features = ["Gender", #"Married", 
                             "Dependents", "Self_Employed", "Credit_History"]

# Note: In regards to imputing missing data: "Mean is most useful when the original data is not skewed, 
# while the median is more robust, not sensitive to outliers, and thus used when data is skewed."

# next steps: use RobustScaler on numerical + LabelEncoder on[], OneHotEncoder on[] 
TrainData.isnull().sum()

Gender               13
Married               0
Dependents           12
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [62]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',MinMaxScaler())
])
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('label', LabelEncoder())
])

In [63]:
clean_col_trans = ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline,numeric_features),
    ('cat_pipeline',cat_pipeline,categorical_features)
    ],
    #remainder='passthrough',
    remainder='drop', # the specified columns in transformers are transformed and combined in the output, and the non-specified columns are dropped.
    n_jobs=-1) # Number of jobs to run in parallel. -1 -> all processors


In [64]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression()
clf_pipeline = Pipeline(steps=[
    ('clean_col_trans', clean_col_trans),
    ('linear_reg_model', clf)
])

In [65]:
from sklearn import set_config

set_config(display='diagram')
display(clf_pipeline)

In [66]:
from sklearn.model_selection import train_test_split

X= TrainData.iloc[:, :-1]
y= TrainData.iloc[:, -1] 
# train test split: 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
print("X: ", X)
print("y: ", y)

X:            Gender Married Dependents     Education Self_Employed  \
Loan_ID                                                           
LP001002    Male      No          0      Graduate            No   
LP001003    Male     Yes          1      Graduate            No   
LP001005    Male     Yes          0      Graduate           Yes   
LP001006    Male     Yes          0  Not Graduate            No   
LP001008    Male      No          0      Graduate            No   
...          ...     ...        ...           ...           ...   
LP002978  Female      No          0      Graduate            No   
LP002979    Male     Yes         3+      Graduate            No   
LP002983    Male     Yes          1      Graduate            No   
LP002984    Male     Yes          2      Graduate            No   
LP002990  Female      No          0      Graduate           Yes   

          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
Loan_ID                                       

In [67]:
clf_pipeline.fit(X_train, y_train)
# preds = clf_pipeline.predict(X_test)
score = clf_pipeline.score(X_test, y_test)
print("Model score: ", score) # model accuracy

ValueError: A given column is not a column of the dataframe

In [None]:
from sklearn.base import BaseEstimator
from sklearn.linear_model import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = DecisionTreeClassifier()):
            self.estimator = estimator
            
    def fit(self, X, y=None, **kwargs):
            self.estimator.fit(X, y)
            return self
            
    def predict(self, X, y=None):
            return self.estimator.predict(X)
            
    def predict_proba(self, X):
            return self.estimator.predict_proba(X)
            
    def score(self, X, y):
            return self.estimator.score(X, y)

In [None]:
from sklearn.model_selection import train_test_split

X = TrainData["Gender", "Married", "Dependents",
"Education"             ,
"Self_Employed"        ,
"ApplicantIncome"       ,
"CoapplicantIncome"     ,
"LoanAmount"           ,
"Loan_Amount_Term"     ,
"Credit_History"       ,
"Property_Area"]
y = TrainData['Loan_Status']
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y) #stratification. 70/30 split

In [None]:
clf_pipeline = Pipeline(steps=[
    ('clean_col_trans', clean_col_trans),
    ('model', ClfSwitcher())  # class to switch between models
])


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

grid_params = [
    {'model__estimator': [DecisionTreeClassifier(criterion="entropy")]},
    {'model__estimator': [KNeighborsClassifier()]},
    {'model__estimator': [GaussianNB()]}
]

gs = GridSearchCV(clf_pipeline, grid_params, scoring='accuracy')
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
gs.fit(X_train, y_train)

print("Best Score of train set: "+str(gs.best_score_))
print("Best parameter set: "+str(gs.best_params_))
print("Test Score: "+str(gs.score(X_test,y_test)))