In [None]:
## Import packages
library(ggplot2)
library(repr)
library(hexbin)
library(dplyr)
library(GGally) ## Adds pair-wise scatter plots to ggplot2
# Set the initial plot area dimensions used by repr when rendering plots.
options(
    repr.plot.width = 4,
    repr.plot.height = 4
)

In [None]:
# Load the raw test set from disk. Text columns are kept as character vectors
# (not factors) and the first line of the file is used as the column names.
test_values <- read.csv(
    file = "test_values.csv",
    header = TRUE,
    stringsAsFactors = FALSE
)

# Preview the first 20 rows.
head(test_values, n = 20)

In [None]:
# Display the structure (column names, types, sample values) of the table.
str(test_values)

# Per-column summary statistics for test_values.
summary(test_values)

# Print the standard deviation of every numeric column, rounded to 2 decimals.
# NA entries are excluded (na.rm = TRUE); without that, a single missing value
# would make sd() return NA for the whole column.
numeric_cols <- names(test_values)[vapply(test_values, is.numeric, logical(1))]
for (col in numeric_cols) {
    col_sd <- sd(test_values[[col]], na.rm = TRUE)
    cat(col, as.character(round(col_sd, 2)), "\n")
}

In [None]:
# For each column, report whether it contains a given placeholder value.
# NA entries are ignored (na.rm = TRUE) so that every column yields a definite
# TRUE/FALSE; without it, a column holding NAs but no match would report NA.

# Columns containing an empty string.
lapply(test_values, function(x) any(x == "", na.rm = TRUE))

# Columns containing a '?' placeholder.
lapply(test_values, function(x) any(x == "?", na.rm = TRUE))

# Columns containing the value -1 (compared as the string '-1').
lapply(test_values, function(x) any(x == "-1", na.rm = TRUE))

In [None]:
# Compare the table's dimensions with those of its de-duplicated version; a
# smaller row count in the second result means fully duplicated rows exist.
# NOTE(review): every column, including row_id, takes part in the comparison.
# If row_id is unique per row this can never detect duplicates -- confirm.
dim(test_values)
test_values %>%
    distinct() %>%
    dim()

In [6]:
# Label tables for the coded categorical columns. Each table is a named
# character vector: the name is the code as it appears in the raw data (as a
# string) and the value is the human-readable label.
loan_type <- c(
    "1" = "Conventional",
    "2" = "FHA-insured",
    "3" = "VA-guaranteed",
    "4" = "FSA/RHS"
)
property_type <- c(
    "1" = "One to four-family",
    "2" = "Manufactured housing",
    "3" = "Multifamily"
)
loan_purpose <- c(
    "1" = "Home purchase",
    "2" = "Home improvement",
    "3" = "Refinancing"
)
occupancy <- c(
    "1" = "Owner-occupied as a principal dwelling",
    "2" = "Not owner-occupied",
    "3" = "Not applicable"
)
preapproval <- c(
    "1" = "Preapproval was requested",
    "2" = "Preapproval was not requested",
    "3" = "Not applicable"
)
applicant_ethnicity <- c(
    "1" = "Hispanic or Latino",
    "2" = "Not Hispanic or Latino",
    "3" = "Information not provided by applicant in mail, Internet, or telephone application",
    "4" = "Not applicable",
    "5" = "No co-applicant"
)
applicant_race <- c(
    "1" = "American Indian or Alaska Native",
    "2" = "Asian",
    "3" = "Black or African American",
    "4" = "Native Hawaiian or Other Pacific Islander",
    "5" = "White",
    "6" = "Information not provided by applicant in mail, Internet, or telephone application",
    "7" = "Not applicable",
    "8" = "No co-applicant"
)
# NOTE(review): code 5 previously duplicated "Not applicable"; it is set to
# "No co-applicant" to match the last two entries of the ethnicity and race
# tables above -- confirm against the data dictionary.
applicant_sex <- c(
    "1" = "Male",
    "2" = "Female",
    "3" = "Information not provided by applicant in mail, Internet, or telephone application",
    "4" = "Not applicable",
    "5" = "No co-applicant"
)

# One label table per categorical column, keyed by column name.
code_tables <- list(
    loan_type = loan_type,
    property_type = property_type,
    loan_purpose = loan_purpose,
    occupancy = occupancy,
    preapproval = preapproval,
    applicant_ethnicity = applicant_ethnicity,
    applicant_race = applicant_race,
    applicant_sex = applicant_sex
)

# Flat lookup whose names have the form "<column>.<code>" (e.g. "loan_type.1").
codes <- unlist(code_tables)

# Columns to recode, in the same order as the label tables.
cat_cols <- names(code_tables)

# Fail early with a clear message if an expected column is absent.
missing_cols <- setdiff(cat_cols, names(test_values))
if (length(missing_cols) > 0) {
    stop("Columns not found in test_values: ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
}

# Replace each code with its label using a vectorized lookup on the whole
# column. NA codes stay NA; any other code without a label stops with a message
# naming the column and the offending values (this also triggers if the cell is
# re-run on data that has already been recoded).
for (col in cat_cols) {
    raw_codes <- test_values[[col]]
    labels <- unname(code_tables[[col]][as.character(raw_codes)])
    unmapped <- !is.na(raw_codes) & is.na(labels)
    if (any(unmapped)) {
        stop("Unmapped code(s) in column '", col, "': ",
             paste(unique(raw_codes[unmapped]), collapse = ", "),
             call. = FALSE)
    }
    test_values[[col]] <- labels
}

# Preview the recoded table.
head(test_values, 20)

row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner.occupied_units,number_of_1_to_4_family_units,lender,co_applicant
<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<int>,<int>,<int>,...,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
0,Conventional,Manufactured housing,Home purchase,Owner-occupied as a principal dwelling,46,Not applicable,262,37,83,...,Female,50.0,10761,43.415,61732,100.0,2900,3603,1119,False
1,Conventional,One to four-family,Refinancing,Owner-occupied as a principal dwelling,170,Not applicable,282,11,267,...,Male,155.0,2285,20.115,50373,100.0,539,873,4216,False
2,Conventional,One to four-family,Home purchase,Owner-occupied as a principal dwelling,174,Not applicable,274,12,162,...,Male,102.0,7765,78.621,75175,54.165,1559,1712,2467,False
3,Conventional,One to four-family,Home purchase,Not owner-occupied,116,Preapproval was not requested,254,37,282,...,Male,51.0,3387,96.208,48677,74.476,938,784,2773,True
4,Conventional,One to four-family,Refinancing,Owner-occupied as a principal dwelling,108,Not applicable,194,41,17,...,Male,88.0,3106,6.37,68070,100.0,1119,1301,3110,False
5,Conventional,One to four-family,Home purchase,Owner-occupied as a principal dwelling,53,Not applicable,215,11,39,...,Male,26.0,6338,89.621,68957,88.605,1303,1837,356,False
6,Conventional,One to four-family,Refinancing,Owner-occupied as a principal dwelling,175,Not applicable,294,31,101,...,Male,55.0,4488,4.468,84698,100.0,1290,1444,1657,False
7,FHA-insured,One to four-family,Home purchase,Owner-occupied as a principal dwelling,135,Preapproval was not requested,339,14,230,...,Male,40.0,6896,22.123,65203,81.272,1933,2772,809,False
8,FHA-insured,One to four-family,Home purchase,Owner-occupied as a principal dwelling,290,Not applicable,29,30,174,...,Male,72.0,4587,90.579,59894,100.0,884,1173,4243,False
9,Conventional,One to four-family,Home purchase,Owner-occupied as a principal dwelling,4,Not applicable,154,33,232,...,Male,57.0,7436,52.746,65855,88.008,1892,3039,3038,False


In [8]:
# Persist the recoded table as CSV.
# NOTE(review): write.csv() defaults to row.names = TRUE, so the file gets an
# extra unnamed leading column of row names in addition to row_id -- confirm
# whether downstream readers expect it.
write.csv(
    x = test_values,
    file = "TEST_DATA.csv"
)