# 1. DATA CLEANSING 

### Data Cleansing Notebook to generate "clean_data.csv"

### Libraries

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Notebook-wide settings: suppress every warning and remove pandas' row/column
# display limits so frames are rendered without truncation.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Importing the data

In [3]:
# Read the raw application records into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='Application_Data.csv')

In [4]:
# Peek at the first three rows of the freshly loaded frame.
df.head(n=3)

Unnamed: 0,Applicant_ID,Applicant_Gender,Owned_Car,Owned_Realty,Total_Children,Total_Income,Income_Type,Education_Type,Family_Status,Housing_Type,Owned_Mobile_Phone,Owned_Work_Phone,Owned_Phone,Owned_Email,Job_Title,Total_Family_Members,Applicant_Age,Years_of_Working,Total_Bad_Debt,Total_Good_Debt,Status
0,5008806,M,1,1,0,112500,Working ...,Secondary / secondary special ...,Married ...,House / apartment ...,1,0,0,0,Security staff ...,2,59,4,0,30,1
1,5008808,F,0,1,0,270000,Commercial associate ...,Secondary / secondary special ...,Single / not married ...,House / apartment ...,1,0,1,1,Sales staff ...,1,53,9,0,5,1
2,5008809,F,0,1,0,270000,Commercial associate ...,Secondary / secondary special ...,Single / not married ...,House / apartment ...,1,0,1,1,Sales staff ...,1,53,9,0,5,1


### Info and data description

In [5]:
# Print the frame's index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25128 entries, 0 to 25127
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Applicant_ID          25128 non-null  int64 
 1   Applicant_Gender      25128 non-null  object
 2   Owned_Car             25128 non-null  int64 
 3   Owned_Realty          25128 non-null  int64 
 4   Total_Children        25128 non-null  int64 
 5   Total_Income          25128 non-null  int64 
 6   Income_Type           25128 non-null  object
 7   Education_Type        25128 non-null  object
 8   Family_Status         25128 non-null  object
 9   Housing_Type          25128 non-null  object
 10  Owned_Mobile_Phone    25128 non-null  int64 
 11  Owned_Work_Phone      25128 non-null  int64 
 12  Owned_Phone           25128 non-null  int64 
 13  Owned_Email           25128 non-null  int64 
 14  Job_Title             25128 non-null  object
 15  Total_Family_Members  25128 non-null

In [6]:
# Summary statistics of the numeric columns, transposed so that each column
# of the data becomes one row of the summary table.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Applicant_ID,25128.0,5078835.0,41943.777736,5008806.0,5042225.75,5079004.0,5115603.25,5150487.0
Owned_Car,25128.0,0.4183779,0.493303,0.0,0.0,0.0,1.0,1.0
Owned_Realty,25128.0,0.6549268,0.475402,0.0,0.0,1.0,1.0,1.0
Total_Children,25128.0,0.5094715,0.762937,0.0,0.0,0.0,1.0,5.0
Total_Income,25128.0,194836.5,104521.1233,27000.0,135000.0,180000.0,225000.0,1575000.0
Owned_Mobile_Phone,25128.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Owned_Work_Phone,25128.0,0.2737584,0.445895,0.0,0.0,0.0,1.0,1.0
Owned_Phone,25128.0,0.2927412,0.45503,0.0,0.0,0.0,1.0,1.0
Owned_Email,25128.0,0.1006845,0.300916,0.0,0.0,0.0,0.0,1.0
Total_Family_Members,25128.0,2.291309,0.928871,1.0,2.0,2.0,3.0,7.0


### Null value analysis

In [20]:
# Count the missing values in each column.
df.isna().sum()

Applicant_ID            0
Applicant_Gender        0
Owned_Car               0
Owned_Realty            0
Total_Children          0
Total_Income            0
Income_Type             0
Education_Type          0
Family_Status           0
Housing_Type            0
Owned_Mobile_Phone      0
Owned_Work_Phone        0
Owned_Phone             0
Owned_Email             0
Job_Title               0
Total_Family_Members    0
Applicant_Age           0
Years_of_Working        0
Total_Bad_Debt          0
Total_Good_Debt         0
Status                  0
dtype: int64

### Understanding the data

In [60]:
# For every column print its name, the number of distinct values, and the
# distinct values themselves, followed by blank lines as a separator.
for column in df.columns:
    values = df[column]
    print(column)
    print(column, " : ", values.nunique())
    print(values.unique())
    print('\n\n')

Applicant_ID
Applicant_ID  :  25128
[5008806 5008808 5008809 ... 5150484 5150485 5150487]



Applicant_Gender
Applicant_Gender  :  2
['M' 'F']



Owned_Car
Owned_Car  :  2
[1 0]



Owned_Realty
Owned_Realty  :  2
[1 0]



Total_Children
Total_Children  :  6
[0 3 1 2 4 5]



Total_Income
Total_Income  :  195
[ 112500  270000  135000  130500  157500  405000  211500  360000  126000
  247500  297000  166500  225000  315000  148500  202500  450000  180000
   90000  765000  229500  292500   74250  144000  337500  193500  267750
  139500   67500  252000  900000  279000   76500  234000   81000  108000
  198000  216000   45000  238500  117000 1350000   99000  445500  427500
  306000  328500  310500  540000  171000  675000  121500   72000  184500
  189000  243000   94500  256500  697500  196650   40500  432000  162000
  103500  257625   58500  207000  720000  261000  175500  382500  119250
   63000  274500   90900  495000  220500   49500  630000  585000  145350
  173250  531000  387000 1125000  

### We can see there is some unwanted whitespace in the data. Let's clean it!

In [46]:
## Gender
# Trim leading/trailing whitespace with the vectorised string accessor instead
# of building a temporary Python list one element at a time.
# NOTE: .str.strip() propagates missing values as NaN rather than raising.
df['Applicant_Gender'] = df['Applicant_Gender'].str.strip()

In [50]:
## Income Type
# Trim leading/trailing whitespace with the vectorised string accessor instead
# of building a temporary Python list one element at a time.
# NOTE: .str.strip() propagates missing values as NaN rather than raising.
df['Income_Type'] = df['Income_Type'].str.strip()

In [52]:
## Education Type
# Trim leading/trailing whitespace with the vectorised string accessor instead
# of building a temporary Python list one element at a time.
# NOTE: .str.strip() propagates missing values as NaN rather than raising.
df['Education_Type'] = df['Education_Type'].str.strip()

In [54]:
## Family Status
# Trim leading/trailing whitespace with the vectorised string accessor instead
# of building a temporary Python list one element at a time.
# NOTE: .str.strip() propagates missing values as NaN rather than raising.
df['Family_Status'] = df['Family_Status'].str.strip()

In [57]:
## Housing type
# Trim leading/trailing whitespace with the vectorised string accessor instead
# of building a temporary Python list one element at a time.
# NOTE: .str.strip() propagates missing values as NaN rather than raising.
df['Housing_Type'] = df['Housing_Type'].str.strip()

In [59]:
## Job title
# Trim leading/trailing whitespace with the vectorised string accessor instead
# of building a temporary Python list one element at a time.
# NOTE: .str.strip() propagates missing values as NaN rather than raising.
df['Job_Title'] = df['Job_Title'].str.strip()

In [61]:
# Show the first five rows to confirm the text columns no longer carry padding.
df.head(n=5)

Unnamed: 0,Applicant_ID,Applicant_Gender,Owned_Car,Owned_Realty,Total_Children,Total_Income,Income_Type,Education_Type,Family_Status,Housing_Type,Owned_Mobile_Phone,Owned_Work_Phone,Owned_Phone,Owned_Email,Job_Title,Total_Family_Members,Applicant_Age,Years_of_Working,Total_Bad_Debt,Total_Good_Debt,Status
0,5008806,M,1,1,0,112500,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,2,59,4,0,30,1
1,5008808,F,0,1,0,270000,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1,53,9,0,5,1
2,5008809,F,0,1,0,270000,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1,53,9,0,5,1
3,5008810,F,0,1,0,270000,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1,53,9,0,27,1
4,5008811,F,0,1,0,270000,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1,53,9,0,39,1


### Saving the clean data!

In [62]:
# Write the cleaned frame to disk, omitting the row index from the file.
df.to_csv(path_or_buf='clean_data.csv', index=False)