In [1]:
#Load modules and import data
import pandas as pd
import numpy as np
# Location of the training CSV, kept as a single named constant so the notebook
# can be pointed at another copy of the data set by editing one line.
# NOTE(review): this is an absolute path on one machine; adjust it before
# running the notebook anywhere else.
DATA_PATH = "/Users/bburns/Desktop/Analytics/Data Sets/Loan_Prediction_Training.csv"

# Use Loan_ID as the row index so each application is addressed by its ID.
data = pd.read_csv(DATA_PATH, index_col="Loan_ID")

In [2]:
# Print the loaded frame as plain text for a first look at its columns and values.
print(data)

          Gender Married Dependents     Education Self_Employed  \
Loan_ID                                                           
LP001002    Male      No          0      Graduate            No   
LP001003    Male     Yes          1      Graduate            No   
LP001005    Male     Yes          0      Graduate           Yes   
LP001006    Male     Yes          0  Not Graduate            No   
LP001008    Male      No          0      Graduate            No   
LP001011    Male     Yes          2      Graduate           Yes   
LP001013    Male     Yes          0  Not Graduate            No   
LP001014    Male     Yes         3+      Graduate            No   
LP001018    Male     Yes          2      Graduate            No   
LP001020    Male     Yes          1      Graduate            No   
LP001024    Male     Yes          2      Graduate            No   
LP001027    Male     Yes          2      Graduate           NaN   
LP001028    Male     Yes          2      Graduate            N

#1 Boolean Indexing

In [3]:
# Boolean indexing works like a SQL WHERE clause: build one mask per condition,
# combine them with &, and let .loc keep only the matching rows.
# Here: female applicants who are not graduates and whose loan was granted.
is_female = data["Gender"] == "Female"
is_not_graduate = data["Education"] == "Not Graduate"
got_loan = data["Loan_Status"] == "Y"

# Show just the three columns that took part in the filter.
shown_columns = ["Gender", "Education", "Loan_Status"]
data.loc[is_female & is_not_graduate & got_loan, shown_columns]

Unnamed: 0_level_0,Gender,Education,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LP001155,Female,Not Graduate,Y
LP001669,Female,Not Graduate,Y
LP001692,Female,Not Graduate,Y
LP001908,Female,Not Graduate,Y
LP002300,Female,Not Graduate,Y
LP002314,Female,Not Graduate,Y
LP002407,Female,Not Graduate,Y
LP002489,Female,Not Graduate,Y
LP002502,Female,Not Graduate,Y
LP002534,Female,Not Graduate,Y


#2 Apply Function
`apply` is one of the most commonly used functions for manipulating data and creating new variables. It passes each row or column of a data frame to a function and returns the result. The function can be either built-in or user-defined.


In [4]:
# Count the missing values in each row and in each column.
#Create a new function:
def num_missing(x):
    """Count the missing entries in ``x``.

    Parameters
    ----------
    x : pandas.Series
        One column or one row of a data frame, as handed over by
        ``DataFrame.apply``.

    Returns
    -------
    int
        Number of entries in ``x`` for which ``isnull()`` is true.
    """
    # Series.sum() adds up the boolean mask in vectorised code; the builtin
    # sum() yields the same count but walks the Series one element at a time.
    return x.isnull().sum()

# Column-wise pass: axis=0 hands every column to num_missing in turn.
print("Missing values per column:")
missing_per_column = data.apply(num_missing, axis=0)
print(missing_per_column)


# Row-wise pass: axis=1 hands every row to num_missing in turn.
# Only the first rows are printed to keep the output short.
print("\nMissing values per row:")
missing_per_row = data.apply(num_missing, axis=1)
print(missing_per_row.head())

Missing values per column:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Missing values per row:
Loan_ID
LP001002    1
LP001003    0
LP001005    0
LP001006    0
LP001008    0
dtype: int64


#3 Imputing Values

‘fillna()’ fills in missing values in a single step. It is typically used to replace the missing values in a column with that column's overall mean, mode, or median.

In [5]:
#Goal of this section: impute the ‘Gender’, ‘Married’ and ‘Self_Employed’ columns with their respective modes.
#First we import a function to determine the mode and try it out on 'LoanAmount'.
from scipy.stats import mode
# 'LoanAmount' has missing values (see the per-column counts above). Drop them
# before calling mode(): with SciPy's default nan_policy='propagate', current
# releases report NaN as the mode whenever the input contains NaN, so the most
# frequent observed amount would not be returned.
mode(data['LoanAmount'].dropna())

ModeResult(mode=array([ 120.]), count=array([20]))

In [7]:
# Average LoanAmount for every (Gender, Married, Self_Employed) combination.
# The aggregation is named as the string "mean" instead of passing np.mean:
# recent pandas releases emit a FutureWarning when a NumPy callable is given
# here, and the string selects the same group-wise mean.
impute_grps = data.pivot_table(
    values=["LoanAmount"],
    index=["Gender", "Married", "Self_Employed"],
    aggfunc="mean",
)
print(impute_grps)

                              LoanAmount
Gender Married Self_Employed            
Female No      No             110.596774
               Yes            125.800000
       Yes     No             135.480000
               Yes            282.250000
Male   No      No             128.137255
               Yes            173.625000
       Yes     No             151.709220
               Yes            169.355556
