In [4]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np
# Import Data Visualization Libraries
import seaborn as sns 
import matplotlib.pyplot as plt 
# Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')
# Import Logging
import logging
logging.basicConfig(level = logging.INFO,
                    format = '%(asctime)s - %(levelname)s - %(message)s',
                    filemode = 'w',
                    filename = 'model.log',force = True)
# Import Scikit Learn Libraries for Machine Learning Model Building
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,learning_curve,KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost
from xgboost import XGBRegressor
from sklearn.cluster import KMeans


# Multicolinearity test and treatment libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

#3. Import OrderDict()
from collections import OrderedDict

In [5]:
filepath = "Data/credit-risk-data.csv"
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,CreditScore,AnnualIncome,LoanAmount,LoanDuration,Age,EmploymentStatus,MaritalStatus,NumberOfDependents,EducationLevel,HomeOwnershipStatus,...,JobTenure,MonthlySavings,AnnualBonuses,AnnualExpenses,MonthlyHousingCosts,MonthlyTransportationCosts,MonthlyFoodCosts,MonthlyHealthcareCosts,MonthlyEntertainmentCosts,LoanApproved
0,402,63295,18830,13,29,Self-Employed,Widowed,2,Doctorate,Other,...,24,378,3741,40058,977,412,399,136,124,0
1,735,55936,23729,1,42,Self-Employed,Divorced,3,Master,Own,...,10,575,4115,16745,695,206,898,252,131,0
2,570,62547,19660,7,54,Self-Employed,Single,3,Doctorate,Mortgage,...,16,691,4105,23273,627,266,392,73,36,0
3,406,46129,21674,23,25,Self-Employed,Divorced,3,High School,Other,...,6,452,4559,42163,397,307,250,378,-32,0
4,371,57725,12189,26,42,Employed,Widowed,4,Master,Own,...,2,690,7856,30087,723,315,114,88,68,0


In [6]:
# Descriptive stats

def descriptive_stats():
    numerical_col = df.select_dtypes(exclude = 'object').columns
    categorica_col = df.select_dtypes(include = 'object').columns
    num_stats = []
    cat_stats = []
    data_info = []

    for i in numerical_col:
        Q1 = df[i].quantile(0.25)
        Q3 = df[i].quantile(0.75)
        IQR = Q3 - Q1 
        LF = Q1 - 1.5*IQR
        UF = Q3 + 1.5*IQR

        outlier_count = len(df[(df[i] < LF) | (df[i] > UF)])
        outlier_percentage = outlier_count / len(df[i]) * 100

        numerical_stats = OrderedDict({
            "Feature " : i ,
            "Q1" : Q1,
            "Q3" : Q3,
            "IQR" : IQR,
            "LF" : LF,
            "UF" : UF,
            "Mean" : df[i].mean(),
            "Median" : df[i].median(),
            "Min" : df[i].min(),
            "Max" : df[i].max(),
            "Outlier count" : outlier_count,
            "outlier percentage" : outlier_percentage,
            "standard derivation": df[i].std(),
            "variance" : df[i].var(),
            "skewness" : df[i].skew(),
            "kurtosis" : df[i].kurtosis()
        })
        num_stats.append(numerical_stats)
    numerical_stats_report = pd.DataFrame(num_stats)

    for i in categorica_col:
        categorical_stats = OrderedDict({
            "Feature" : i , 
            "Unquie count" : df[i].nunique(),
            "Value count" : df[i].value_counts(),
            "mode" : df[i].mode()
        })
        cat_stats.append(categorical_stats)
    categorical_stats_report = pd.DataFrame(cat_stats)


    for i in df.columns : 
        data1 = OrderedDict({
            "Feature" : i ,
            "Missing value" : df[i].isnull().sum(),
            "Unqiue value" : df[i].nunique(),
            "value count " : df[i].value_counts().to_dict()
        })
        data_info.append(data1)
    data_info_report = pd.DataFrame(data_info)

    return categorical_stats_report,numerical_stats_report,data_info_report

categorical_stats_report,numerical_stats_report,data_info_report = descriptive_stats()

In [7]:
numerical_stats_report

Unnamed: 0,Feature,Q1,Q3,IQR,LF,UF,Mean,Median,Min,Max,Outlier count,outlier percentage,standard derivation,variance,skewness,kurtosis
0,CreditScore,437.0,712.0,275.0,24.5,1124.5,574.42352,574.0,300.0,849.0,0,0.0,158.751086,25201.91,0.001973,-1.198059
1,AnnualIncome,49874.0,70114.0,20240.0,19514.0,100474.0,59998.98444,59957.0,1263.0,118054.0,737,0.737,14995.177544,224855300.0,1.4e-05,-0.01202
2,LoanAmount,16614.0,23369.0,6755.0,6481.5,33501.5,19996.25248,19986.0,-348.0,40423.0,694,0.694,5017.365491,25173960.0,0.001516,-0.016734
3,LoanDuration,8.0,22.0,14.0,-13.0,43.0,15.02664,15.0,1.0,29.0,0,0.0,8.376726,70.16953,-0.004112,-1.20591
4,Age,30.0,56.0,26.0,-9.0,95.0,43.41745,43.0,18.0,69.0,0,0.0,14.982679,224.4807,0.010035,-1.198233
5,NumberOfDependents,1.0,3.0,2.0,-2.0,6.0,1.99724,2.0,0.0,4.0,0,0.0,1.417841,2.010272,5.1e-05,-1.305371
6,MonthlyDebtPayments,366.0,635.0,269.0,-37.5,1038.5,499.91136,499.0,-402.0,1378.0,726,0.726,199.988611,39995.44,0.005237,0.016336
7,CreditCardUtilizationRate,0.251575,0.750752,0.499176,-0.497189,1.499517,0.500778,0.501393,1.4e-05,0.999997,0,0.0,0.288493,0.08322814,-0.002672,-1.198389
8,NumberOfOpenCreditLines,3.0,11.0,8.0,-9.0,23.0,6.99887,7.0,0.0,14.0,0,0.0,4.319747,18.66022,-0.001191,-1.21423
9,NumberOfCreditInquiries,2.0,7.0,5.0,-5.5,14.5,4.51075,5.0,0.0,9.0,0,0.0,2.872679,8.252287,-0.003135,-1.227823


In [8]:
categorical_stats_report

Unnamed: 0,Feature,Unquie count,Value count,mode
0,EmploymentStatus,3,EmploymentStatus Employed 33488 Self-E...,"0 Employed Name: EmploymentStatus, dtype: o..."
1,MaritalStatus,4,MaritalStatus Married 25226 Divorced 24...,"0 Married Name: MaritalStatus, dtype: object"
2,EducationLevel,5,EducationLevel Associate 20153 High Schoo...,"0 Associate Name: EducationLevel, dtype: ob..."
3,HomeOwnershipStatus,4,HomeOwnershipStatus Mortgage 25168 Own ...,"0 Mortgage Name: HomeOwnershipStatus, dtype..."
4,LoanPurpose,5,LoanPurpose Education 20120 Home ...,"0 Education Name: LoanPurpose, dtype: object"
5,HealthInsuranceStatus,2,HealthInsuranceStatus Insured 50042 Unins...,"0 Insured Name: HealthInsuranceStatus, dtyp..."
6,LifeInsuranceStatus,2,LifeInsuranceStatus Uninsured 50031 Insured...,"0 Uninsured Name: LifeInsuranceStatus, dtyp..."
7,CarInsuranceStatus,2,CarInsuranceStatus Insured 50163 Uninsure...,"0 Insured Name: CarInsuranceStatus, dtype: ..."
8,HomeInsuranceStatus,2,HomeInsuranceStatus Uninsured 50214 Insured...,"0 Uninsured Name: HomeInsuranceStatus, dtyp..."
9,EmployerType,4,EmployerType Private 25240 Other ...,"0 Private Name: EmployerType, dtype: object"


In [9]:
data_info_report

Unnamed: 0,Feature,Missing value,Unqiue value,value count
0,CreditScore,0,550,"{675: 221, 656: 216, 829: 216, 584: 215, 448: ..."
1,AnnualIncome,0,48065,"{56031: 12, 58552: 11, 56208: 10, 58777: 10, 6..."
2,LoanAmount,0,22395,"{21448: 21, 21369: 21, 20813: 20, 19360: 18, 1..."
3,LoanDuration,0,29,"{16: 3528, 28: 3523, 4: 3521, 14: 3506, 23: 34..."
4,Age,0,52,"{33: 2020, 31: 2011, 26: 2009, 21: 1982, 57: 1..."
5,EmploymentStatus,0,3,"{'Employed': 33488, 'Self-Employed': 33347, 'U..."
6,MaritalStatus,0,4,"{'Married': 25226, 'Divorced': 24988, 'Single'..."
7,NumberOfDependents,0,5,"{0: 20273, 4: 20044, 3: 19970, 2: 19925, 1: 19..."
8,EducationLevel,0,5,"{'Associate': 20153, 'High School': 20122, 'Ba..."
9,HomeOwnershipStatus,0,4,"{'Mortgage': 25168, 'Own': 25037, 'Rent': 2498..."
