# DATA PREPROCESSING SAMPLE FOR LOAN PREDICTION

## DATA CLEANING
## Handle Missing Data

In [11]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [15]:
TrainData = pd.read_csv("Train Data.csv", delimiter = ',', header = 0, index_col = 0)
print(TrainData)

          Gender Married Dependents     Education Self_Employed  \
Loan_ID                                                           
LP001002    Male      No          0      Graduate            No   
LP001003    Male     Yes          1      Graduate            No   
LP001005    Male     Yes          0      Graduate           Yes   
LP001006    Male     Yes          0  Not Graduate            No   
LP001008    Male      No          0      Graduate            No   
...          ...     ...        ...           ...           ...   
LP002978  Female      No          0      Graduate            No   
LP002979    Male     Yes         3+      Graduate            No   
LP002983    Male     Yes          1      Graduate            No   
LP002984    Male     Yes          2      Graduate            No   
LP002990  Female      No          0      Graduate           Yes   

          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
Loan_ID                                           

In [16]:
#Identify missing data of both types, numerical and categorical data
NumericData = TrainData.select_dtypes(exclude = ['object']).columns.tolist()
print(NumericData)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


In [17]:
#Numerical data with loan id
TrainData_n = TrainData[NumericData]
print(TrainData_n, '\n')

          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
Loan_ID                                                                      
LP001002             5849                0.0         NaN             360.0   
LP001003             4583             1508.0       128.0             360.0   
LP001005             3000                0.0        66.0             360.0   
LP001006             2583             2358.0       120.0             360.0   
LP001008             6000                0.0       141.0             360.0   
...                   ...                ...         ...               ...   
LP002978             2900                0.0        71.0             360.0   
LP002979             4106                0.0        40.0             180.0   
LP002983             8072              240.0       253.0             360.0   
LP002984             7583                0.0       187.0             360.0   
LP002990             4583                0.0       133.0        

In [18]:
#make sure there are no missing values
#if there are missing values replace with 0
#eliminate any tuple with an attribute marked as 0
MissingZero = SimpleImputer(missing_values = np.NaN , strategy = 'constant', fill_value = 0)
MissingZero.fit(TrainData_n)
TrainData_n = MissingZero.transform(TrainData_n)
print(TrainData_n)

[[5.849e+03 0.000e+00 0.000e+00 3.600e+02 1.000e+00]
 [4.583e+03 1.508e+03 1.280e+02 3.600e+02 1.000e+00]
 [3.000e+03 0.000e+00 6.600e+01 3.600e+02 1.000e+00]
 ...
 [8.072e+03 2.400e+02 2.530e+02 3.600e+02 1.000e+00]
 [7.583e+03 0.000e+00 1.870e+02 3.600e+02 1.000e+00]
 [4.583e+03 0.000e+00 1.330e+02 3.600e+02 0.000e+00]]


In [19]:
CategoricData = TrainData.select_dtypes(include = ['object']).columns.tolist()
print(CategoricData)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']


In [20]:
TrainData_c = TrainData[CategoricData]
print(TrainData_c, '\n')

          Gender Married Dependents     Education Self_Employed Property_Area  \
Loan_ID                                                                         
LP001002    Male      No          0      Graduate            No         Urban   
LP001003    Male     Yes          1      Graduate            No         Rural   
LP001005    Male     Yes          0      Graduate           Yes         Urban   
LP001006    Male     Yes          0  Not Graduate            No         Urban   
LP001008    Male      No          0      Graduate            No         Urban   
...          ...     ...        ...           ...           ...           ...   
LP002978  Female      No          0      Graduate            No         Rural   
LP002979    Male     Yes         3+      Graduate            No         Rural   
LP002983    Male     Yes          1      Graduate            No         Urban   
LP002984    Male     Yes          2      Graduate            No         Urban   
LP002990  Female      No    

In [21]:
MissingStatus = SimpleImputer(missing_values = np.NaN , strategy = 'constant', fill_value = 'Not Applicable')
MissingStatus.fit(TrainData_c)
TrainData_c = MissingStatus.transform(TrainData_c)
print(TrainData_c)

[['Male' 'No' '0' ... 'No' 'Urban' 'Y']
 ['Male' 'Yes' '1' ... 'No' 'Rural' 'N']
 ['Male' 'Yes' '0' ... 'Yes' 'Urban' 'Y']
 ...
 ['Male' 'Yes' '1' ... 'No' 'Urban' 'Y']
 ['Male' 'Yes' '2' ... 'No' 'Urban' 'Y']
 ['Female' 'No' '0' ... 'Yes' 'Semiurban' 'N']]


In [22]:
TrainData[NumericData] = TrainData_n
TrainData[CategoricData] = TrainData_c
print(TrainData)

          Gender Married Dependents     Education Self_Employed  \
Loan_ID                                                           
LP001002    Male      No          0      Graduate            No   
LP001003    Male     Yes          1      Graduate            No   
LP001005    Male     Yes          0      Graduate           Yes   
LP001006    Male     Yes          0  Not Graduate            No   
LP001008    Male      No          0      Graduate            No   
...          ...     ...        ...           ...           ...   
LP002978  Female      No          0      Graduate            No   
LP002979    Male     Yes         3+      Graduate            No   
LP002983    Male     Yes          1      Graduate            No   
LP002984    Male     Yes          2      Graduate            No   
LP002990  Female      No          0      Graduate           Yes   

          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
Loan_ID                                           

## Handle Noisy Data
## Binning By pd.cut

In [23]:
#Bin by Applicant Income, Coapplicant Income, Loan Amount
print(TrainData['ApplicantIncome'].describe())
#ApplicantIncomeCut = pd.cut(TrainData.iloc[:, 5], 4)
#print(ApplicantIncomeCut)

count      614.000000
mean      5403.459283
std       6109.041673
min        150.000000
25%       2877.500000
50%       3812.500000
75%       5795.000000
max      81000.000000
Name: ApplicantIncome, dtype: float64


In [24]:
Bins = 10
ApplicantIncomeBinSize = (81000 - 150)/Bins
print(pd.cut(TrainData['ApplicantIncome'], Bins, precision = 0).value_counts(sort=False))

(69.0, 8235.0]        533
(8235.0, 16320.0]      59
(16320.0, 24405.0]     15
(24405.0, 32490.0]      0
(32490.0, 40575.0]      4
(40575.0, 48660.0]      0
(48660.0, 56745.0]      1
(56745.0, 64830.0]      1
(64830.0, 72915.0]      0
(72915.0, 81000.0]      1
Name: ApplicantIncome, dtype: int64


In [25]:
TrainData['ApplicantIncome_Bin'] = pd.cut(TrainData['ApplicantIncome'], Bins, labels = False)
#print(TrainData.head())                                          

In [47]:
print(TrainData['CoapplicantIncome'].describe())

count      614.000000
mean      1621.245798
std       2926.248369
min          0.000000
25%          0.000000
50%       1188.500000
75%       2297.250000
max      41667.000000
Name: CoapplicantIncome, dtype: float64


In [48]:
print(TrainData['LoanAmount'].describe())

count    614.000000
mean     141.166124
std       88.340630
min        0.000000
25%       98.000000
50%      125.000000
75%      164.750000
max      700.000000
Name: LoanAmount, dtype: float64


## Binning By Feature Engine

In [26]:
from feature_engine.discretisation import EqualWidthDiscretiser

In [43]:
#Bin by Applicant Income, Coapplicant Income, Loan Amount
ApplicantIncomeFE = EqualWidthDiscretiser(bins=10, return_object = True, return_boundaries = True)
#ApplicantIncomeFE.fit(TrainData)
#ApplicantIncomeFE.transform(TrainData)["ApplicantIncome_b"].value_counts()
#ApplicantIncomeFE = EqualWidthDiscretiser()
#print(ApplicantIncomeFE)
ApplicantIncomeFE.fit(TrainData)
ApplicantIncomeFE.transform(TrainData)["ApplicantIncome"].value_counts()

(-inf, 8235.0]        533
(8235.0, 16320.0]      59
(16320.0, 24405.0]     15
(32490.0, 40575.0]      4
(48660.0, 56745.0]      1
(56745.0, 64830.0]      1
(72915.0, inf]          1
Name: ApplicantIncome, dtype: int64

In [46]:
CoapplicantIncomeFE = EqualWidthDiscretiser(bins=10, return_object = True, return_boundaries = True)
CoapplicantIncomeFE.fit(TrainData)
CoapplicantIncomeFE.transform(TrainData)["CoapplicantIncome"].value_counts()

(-inf, 4166.7]        561
(4166.7, 8333.4]       46
(8333.4, 12500.1]       3
(16666.8, 20833.5]      2
(33333.6, 37500.3]      1
(37500.3, inf]          1
Name: CoapplicantIncome, dtype: int64

In [49]:
LoanAmountFE = EqualWidthDiscretiser(bins=10, return_object = True, return_boundaries = True)
LoanAmountFE.fit(TrainData)
LoanAmountFE.transform(TrainData)["LoanAmount"].value_counts()

(70.0, 140.0]     313
(140.0, 210.0]    152
(-inf, 70.0]       78
(210.0, 280.0]     37
(280.0, 350.0]     15
(350.0, 420.0]      6
(420.0, 490.0]      5
(560.0, 630.0]      3
(490.0, 560.0]      3
(630.0, inf]        2
Name: LoanAmount, dtype: int64

## Binning By KBinsDiscretizer 

In [50]:
from sklearn.preprocessing import KBinsDiscretizer

In [57]:
#Default bins
TrainDataAmounts = TrainData['LoanAmount']
#print(TrainDataAmounts)
TrainDataDefBin = KBinsDiscretizer(n_bins = 10, strategy = 'uniform', encode = 'ordinal')
n = TrainDataDefBin.fit(TrainDataAmounts)
print(n.bin_edges_)

ValueError: Expected 2D array, got 1D array instead:
array=[  0. 128.  66. 120. 141. 267.  95. 158. 168. 349.  70. 109. 200. 114.
  17. 125. 100.  76. 133. 115. 104. 315. 116. 112. 151. 191. 122. 110.
  35. 120. 201.  74. 106. 114. 320.   0. 100. 144. 184. 110.  80.  47.
  75. 134.  96.  88.  44. 144. 120. 144. 100. 120. 112. 134. 286.  97.
  96. 135. 180. 144. 120.  99. 165.   0. 116. 258. 126. 312. 125. 136.
 172.  97.  81.  95. 187. 113. 176. 110. 180. 130. 111.   0. 167. 265.
  50. 136.  99. 104. 210. 175. 131. 188.  81. 122.  25.   0. 137.  50.
 115. 131. 133. 151.   0.   0. 160. 100. 225. 120. 216.  94. 136. 139.
 152.   0. 118. 185. 154.  85. 175. 259. 180.  44. 137.  81. 194.  93.
 370.   0. 160. 182. 650.  74.  70.  25. 102. 290.  84.  88. 242. 129.
 185. 168. 175. 122. 187. 100.  70.  30. 225. 125. 118. 152. 244. 113.
  50. 600. 160. 187. 120. 255.  98. 275. 121. 158.  75. 182. 112. 129.
  63. 200.  95. 700.  81. 187.  87. 116. 101. 495. 116. 102. 180.  67.
  73. 260. 108. 120.  66.  58. 168. 188.  48. 164. 160.  76. 120. 170.
 187. 120. 113.  83.  90. 166.   0. 135. 124. 120.  80.  55.  59. 127.
 214. 128. 240. 130. 137. 100. 135. 131.  72. 127.  60. 116. 144. 175.
 128. 170. 138. 210. 158. 200. 104.  42. 120. 280. 140. 170. 255. 122.
 112.  96. 120. 140. 155. 108. 123. 120. 112. 137. 123.  90. 201. 138.
 104. 279. 192. 255. 115.  94. 304. 128. 330. 134. 155. 120. 128. 151.
 150. 160. 135.  90.  30. 136. 126. 150.  90. 115. 207.  80. 436. 124.
 158. 112.  78.  54.   0.  89.  99. 120. 115. 187. 139. 127. 134. 143.
 172. 110. 200. 135. 151. 113.  93. 105. 132.  96. 140.   0. 135. 104.
 480. 185.  84. 111.  56. 144. 159. 111. 120.  88. 112. 155. 115. 124.
   0. 132. 300. 376. 130. 184. 110.  67. 117.  98.  71. 490. 182.  70.
 160. 176.   0.  71. 173.  46. 158.  74. 125. 160. 152. 126. 259. 187.
 228. 308.  95. 105. 130. 116. 165.  67. 100. 200.  81. 236. 130.  95.
 141. 133.  96. 124. 175. 570.  55. 155. 380. 111. 110. 120. 130. 130.
  71. 130. 128. 296. 156. 128. 100. 113. 132.   0. 136. 125. 185. 275.
 120. 113. 113. 135.  71.  95. 109. 103.  45.  65. 103.  53. 194. 115.
 115.  66. 152. 360.  62. 160. 218. 110. 178.  60. 160. 239. 112. 138.
 138.  80. 100. 110.  96. 121.  81. 133.  87.  60. 150. 105. 405. 143.
 100.   0.  50.   0. 187. 138. 187. 180. 148. 152. 175. 130. 110.  55.
 150. 190. 125.  60. 149.  90.  84.  96. 118. 173. 136. 160. 160. 128.
 153. 132.  98. 140.  70. 110.  98. 110. 162. 113. 100.  93. 162. 150.
 230. 132.  86.   0. 154. 113. 128. 234. 246. 131.  80. 500. 160.  75.
  96. 186. 110. 225. 119. 105. 107. 111.  95. 209. 113. 100. 208. 138.
 124. 243. 480.  96. 188.  40. 100. 250. 148.  70. 311. 150. 113. 123.
 185.  95.  45.  55. 100. 480.   0. 400. 110. 161.  94. 130. 216. 100.
 110. 196. 125. 126. 324. 107.  66. 157. 140.  99.  95. 128. 102. 155.
  80. 145. 103. 110.   0.   0. 158. 181. 132.  26.  84. 260. 162. 182.
 108. 600. 211. 132. 258. 120.  70. 123.   9. 104. 186. 165. 275. 187.
 150. 108. 136. 110. 107. 161. 205.  90.  36.  61. 146. 172. 104.  70.
  94. 106.  56. 205. 292. 142. 260. 110. 187.  88. 180. 192. 350. 155.
 128. 172. 496.   0. 173. 157. 108.  71.  40. 253. 187. 133.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## Regression