In [1]:
# --------------------------------------------------
# Data processing template
# --------------------------------------------------

# Import libraries
import pandas as pd

In [2]:
# Load the loan data from the comma-separated file into a DataFrame.
# The path is relative, so the file must sit in the working directory.
dataset = pd.read_csv('loan_small.csv')

In [3]:
# Read data from the TSV (tab-separated) file; sep='\t' makes read_csv
# split fields on tab characters instead of commas
dataset_tab = pd.read_csv('loan_small_tsv.txt', sep='\t')

In [4]:
# Access the data by integer position using iloc (end of each slice is exclusive).
# Example - rows at positions 0-2 from the columns at positions 1-2,
# i.e. the first three rows of the second and third columns
subset = dataset.iloc[0:3, 1:3]

In [5]:
# Access the data using column names.
# Indexing with a list of names returns a DataFrame holding all rows of
# the columns Gender and ApplicantIncome
subsetN = dataset[['Gender', 'ApplicantIncome']]

In [6]:
# Keep only the first three rows of the Gender and ApplicantIncome columns.
# Note: this rebinds subsetN, replacing the all-rows selection made above.
subsetN = dataset.loc[:, ['Gender', 'ApplicantIncome']].head(3)

In [7]:
# Display only the first 10 rows as a quick sanity check, rather than
# dumping the whole frame (useful when the data is large)
dataset.head(10)

Unnamed: 0,Loan_ID,Gender,ApplicantIncome,CoapplicantIncome,LoanAmount,Area,Loan_Status
0,LP001002,,5849.0,0.0,,urban,Y
1,LP001003,Male,4583.0,,128.0,semi,N
2,LP001005,Male,3000.0,0.0,66.0,,Y
3,LP001006,Female,2583.0,2358.0,120.0,semi,
4,LP001008,Male,,0.0,141.0,urban,Y
5,LP001011,Male,5417.0,4196.0,267.0,semi,Y
6,LP001013,Male,2333.0,1516.0,,rural,Y
7,LP001014,Female,3036.0,2504.0,158.0,semi,N
8,LP001018,Male,4006.0,1526.0,168.0,rural,Y
9,LP001020,Male,12841.0,10968.0,349.0,semi,N


In [8]:
# Get the shape of the dataframe as a (rows, columns) tuple
dataset.shape

(16, 7)

In [9]:
# Get the column names of the dataframe (returned as a pandas Index)
dataset.columns

Index(['Loan_ID', 'Gender', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Area', 'Loan_Status'],
      dtype='object')

In [10]:
# Materialise the column labels as a plain Python list
column_list = list(dataset.columns)

In [11]:
# -------------------------------------------------------------
# Handling missing values
# -------------------------------------------------------------

# Count the missing (NaN) entries in every column; columns with a
# non-zero count are the ones that need cleaning
dataset.isna().sum()

Loan_ID              0
Gender               1
ApplicantIncome      2
CoapplicantIncome    1
LoanAmount           3
Area                 1
Loan_Status          1
dtype: int64

In [12]:
# Drop every row that has at least one missing value in any column.
# dropna() returns a new frame; dataset itself is left unchanged.
dataset_clean = dataset.dropna()

In [13]:
# Drop only the rows where the Loan_Status column is missing.
# NOTE: this rebinds dataset_clean, replacing the result of the previous
# cell; rows with missing values in other columns are kept.
dataset_clean = dataset.dropna(subset=["Loan_Status"])

In [14]:
# Replace missing categorical values using column names.
# Work on a copy so the imputation does not modify dataset;
# cols lists the columns that get a categorical fill.
dt = dataset.copy()
cols = ['Gender', 'Area', 'Loan_Status']

In [15]:
# Impute the categorical columns with each column's most frequent value:
# the first row of mode() holds one mode per column, and fillna aligns
# it with the selected columns by name. Then re-count the remaining NaNs.
most_frequent = dt.mode().iloc[0]
dt[cols] = dt[cols].fillna(most_frequent)
dt.isna().sum()

Loan_ID              0
Gender               0
ApplicantIncome      2
CoapplicantIncome    1
LoanAmount           3
Area                 0
Loan_Status          0
dtype: int64

In [16]:
# Replace missing numerical values using column names.
# cols2 lists the columns that get a numeric fill.
cols2 = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [17]:
# Impute each numeric column in cols2 with that column's own mean.
# The mean is computed on dt[cols2] only: dt still holds string columns
# (e.g. Loan_ID), and DataFrame.mean() over a frame with non-numeric
# columns raises TypeError on pandas >= 2.0 (older versions only warned).
dt[cols2] = dt[cols2].fillna(dt[cols2].mean())

# Re-count missing values per column to confirm the fill
dt.isnull().sum(axis=0)

Loan_ID              0
Gender               0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Area                 0
Loan_Status          0
dtype: int64

In [18]:
# ---------------------------------------------------------
# label encoding - Convert Categorical to Numerical values
# ---------------------------------------------------------

# Get the datatypes of all the columns of the dataframe; the columns
# reported as object are the ones that need encoding
dt.dtypes

Loan_ID               object
Gender                object
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Area                  object
Loan_Status           object
dtype: object

In [19]:
# Convert the string/object columns listed in cols to the pandas
# 'category' dtype, which exposes integer codes through the .cat accessor
dt[cols] = dt[cols].astype('category')

In [20]:
# Replace each column listed in cols with its integer category codes.
# The conversion to 'category' is repeated here so the cell can be
# re-run: once a column holds codes it is no longer categorical and a bare
# .cat access would raise AttributeError. Re-encoding a column of codes
# yields the same codes, provided it has no missing values (code -1).
for columns in cols:
    dt[columns] = dt[columns].astype('category').cat.codes

In [21]:
# ---------------------------------------------------------
# One-hot encoding (dummy variable creation)
# ---------------------------------------------------------

# Remove the Loan_ID column by name before encoding; drop() returns a
# new frame, so dataset is not modified
df2 = dataset.drop(columns=['Loan_ID'])

In [22]:
# One-hot encode with the pandas get_dummies function: by default the
# object/category columns are expanded into indicator columns and the
# other columns are passed through. df2 is rebound to the encoded frame.
df2 = pd.get_dummies(df2)

In [23]:
# Same encoding, but drop_first=True leaves out the first level of every
# encoded column so the indicators are not perfectly collinear
# (the "dummy variable trap")
df3 = pd.get_dummies(dataset.drop(columns=['Loan_ID']), drop_first=True)

In [24]:
# ---------------------------------------------------------
# Data Normalization using StandardScaler and MinMax
# ---------------------------------------------------------

# Extract the numeric columns to scale, selected by name rather than by
# position so the selection survives column reordering.
# dataset_clean only had rows with a missing Loan_Status removed, so these
# columns can still contain NaN; the scalers would pass those NaNs straight
# through to their output. Drop the incomplete rows first.
data_to_scale = dataset_clean[
    ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
].dropna()

In [25]:
# Import the StandardScaler class
from sklearn.preprocessing import StandardScaler

In [26]:
# Create an object of the class StandardScaler (default settings)
scaler = StandardScaler()

In [27]:
# Fit the scaler on data_to_scale and transform it in one step.
# Despite its name, ss_scaler holds the standardized data returned by
# fit_transform, not a scaler object.
ss_scaler = scaler.fit_transform(data_to_scale)

In [28]:
# MinMax Normalization of the data
from sklearn.preprocessing import minmax_scale

In [29]:
# Apply MinMax normalization to the same data with the minmax_scale function.
# Despite its name, mm_scaler holds the scaled data returned by
# minmax_scale, not a scaler object.
mm_scaler = minmax_scale(data_to_scale)

In [30]:
# ----------------------------------------------------------
# Split the Data by rows and columns
# ----------------------------------------------------------
# Start from a fresh copy of the raw dataset (not the imputed or encoded
# frames built above), so missing values are still present in df
df = dataset.copy()

In [31]:
# Split by column for X (independent) and Y (dependent) variables:
# X is every column except the last, Y is the last column (Loan_Status).
# NOTE(review): X still includes the Loan_ID column - confirm whether it
# should be dropped before modelling.
X = df.iloc[:, :-1]
Y = df.iloc[:,  -1]

In [32]:
# Split by rows: 30% of the rows are held out as the test set and the
# rest form the training set. random_state pins the shuffle so the same
# split is produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234
)