In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw application table. The location is an absolute, machine-specific
# path, so it is named up front to make it easy to spot and change.
train_csv_path = "D:/project/homecredit/application_train.csv"
train = pd.read_csv(train_csv_path)

In [3]:
# Columns removed from the working frame, assembled group by group.
# The concatenation order below reproduces the original flat list exactly.
flag_cols = [
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
]
region_city_cols = [
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]

# Each base name below exists with an _AVG, _MODE and _MEDI suffix; all three are dropped.
building_bases = [
    'APARTMENTS', 'BASEMENTAREA', 'YEARS_BEGINEXPLUATATION', 'YEARS_BUILD',
    'COMMONAREA', 'ELEVATORS', 'ENTRANCES', 'FLOORSMAX', 'FLOORSMIN',
    'LANDAREA', 'LIVINGAPARTMENTS', 'LIVINGAREA', 'NONLIVINGAPARTMENTS',
    'NONLIVINGAREA',
]
building_stat_cols = [
    f'{base}_{suffix}'
    for suffix in ('AVG', 'MODE', 'MEDI')
    for base in building_bases
]
building_other_cols = [
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
    'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
]

social_obs_cols = ['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE']

# FLAG_DOCUMENT_2 through FLAG_DOCUMENT_21 inclusive.
document_flag_cols = [f'FLAG_DOCUMENT_{doc_no}' for doc_no in range(2, 22)]

bureau_cols = [
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
]

columns_to_drop = [
    *flag_cols,
    *region_city_cols,
    *building_stat_cols,
    *building_other_cols,
    *social_obs_cols,
    *document_flag_cols,
    *bureau_cols,
]

train = train.drop(columns=columns_to_drop)

### 以KNN補ANNUITY的空值

In [4]:
# Impute missing AMT_ANNUITY with a distance-weighted KNN fitted on the three
# amount columns. AMT_ANNUITY is listed first, so it is column 0 of the output.
annuity = train[['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'AMT_CREDIT']]
imputer = KNNImputer(n_neighbors=125, weights="distance")

# fit_transform returns a bare ndarray. Re-attach the source row index so the
# assignments below align by row label instead of silently depending on
# `train` having a default 0..n-1 index.
annuityImputed = pd.DataFrame(imputer.fit_transform(annuity), index=annuity.index)

# Original columns plus the imputed annuity under key '0', kept for inspection.
temp = annuity.copy(deep=True)
temp['0'] = annuityImputed[0]

train['AMT_ANNUITY'] = annuityImputed[0]  # imputation done

### 以KNN補GOODS_PRICE的空值

In [5]:
# Impute missing AMT_GOODS_PRICE using only AMT_CREDIT as the neighbour feature.
# `annuity` is the slice taken before AMT_ANNUITY was imputed; dropping that
# column leaves AMT_GOODS_PRICE as column 0 of the imputer output.
goods_price = annuity.drop('AMT_ANNUITY', axis=1)
imputer = KNNImputer(n_neighbors=37, weights="distance")

# fit_transform returns a bare ndarray. Re-attach the source row index so the
# assignments below align by row label instead of silently depending on
# `train` having a default 0..n-1 index.
goodsImputed = pd.DataFrame(imputer.fit_transform(goods_price), index=goods_price.index)

# Original columns plus the imputed goods price under key '0', kept for inspection.
temp = goods_price.copy(deep=True)
temp['0'] = goodsImputed[0]

train['AMT_GOODS_PRICE'] = goodsImputed[0]  # imputation done

In [6]:
# Count the missing values remaining in each of the three amount columns.
amount_cols = ['AMT_GOODS_PRICE', 'AMT_CREDIT', 'AMT_ANNUITY']
train[amount_cols].isna().sum()

AMT_GOODS_PRICE    0
AMT_CREDIT         0
AMT_ANNUITY        0
dtype: int64

### 補float類型欄位的空值

In [7]:
# Names of every float64 column in the frame.
float_dtype_mask = train.dtypes == 'float64'
data_float = train.columns[float_dtype_mask]

# Of those, keep only the columns that contain at least one missing value.
float_na_counts = train[data_float].isna().sum()
data_float_na = data_float[float_na_counts > 0]

In [8]:
# CNT_FAM_MEMBERS is handled separately, so take it out of the zero-fill list.
# errors='ignore' keeps this cell safe to re-run (the label is already gone the
# second time) and safe when CNT_FAM_MEMBERS has no missing values and was
# therefore never in the list; a plain drop raises KeyError in both cases.
data_float_na = data_float_na.drop(['CNT_FAM_MEMBERS'], errors='ignore')
data_float_na

Index(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_MON',
       'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')

### 'CNT_FAM_MEMBERS'以眾數補空值
### 其餘欄位以0補空值

In [9]:
# Fill CNT_FAM_MEMBERS with its most frequent value. The mode is taken from the
# data rather than hard-coded, so the fill stays correct if the input changes.
# mode() returns a Series (ties are possible); the first entry is used.
fam_members_mode = train['CNT_FAM_MEMBERS'].mode().iloc[0]
train['CNT_FAM_MEMBERS'] = train['CNT_FAM_MEMBERS'].fillna(value=fam_members_mode)  # fill with the mode

# Every other float column that has missing values is filled with 0.
for column in data_float_na:
    train[column] = train[column].fillna(value=0)

### 處理類別型欄位

In [10]:
# Seed NumPy's global RNG so the random fill below is repeatable.
seed = 42
np.random.seed(seed)

columns_to_fill_random = ['NAME_TYPE_SUITE', 'OCCUPATION_TYPE']

# For each column, replace every missing entry with a value drawn at random
# (with replacement) from that column's observed values.
for cat_col in columns_to_fill_random:
    missing_mask = train[cat_col].isna()
    observed_values = train[cat_col].dropna()
    train.loc[missing_mask, cat_col] = np.random.choice(
        observed_values, size=missing_mask.sum()
    )


### 處理極端值

In [11]:
# Handle extreme values.
# NOTE(review): this is an exact-match replace, so only a value of precisely
# 100000000 becomes NaN. It also runs after the float NaN fill, so any NaN it
# introduces stays in the frame. Confirm both are intended.
train['AMT_INCOME_TOTAL'] = train['AMT_INCOME_TOTAL'].replace(100000000, np.nan)

# DAYS_EMPLOYED == 365243 is treated as an anomaly. Record it in an indicator
# column (1 = anomalous row, 0 = normal row), then overwrite the value with 0.
# The earlier `.replace(True, 1)` assignment was dropped: it was overwritten
# immediately by this one and had no effect on the result.
is_employed_anomaly = train['DAYS_EMPLOYED'] == 365243
train['DAYS_EMPLOYED_365243'] = is_employed_anomaly.astype(int)
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace(365243, 0)  # anomalous value set to 0

### 轉換成年齡及年資

In [12]:
# Convert the two day-count columns to years, writing the results back into
# the same columns so their position in the frame is unchanged.
# DAYS_BIRTH: divide by 365, truncate to an integer, take the absolute value.
age_years = (train["DAYS_BIRTH"] / 365).astype(int).abs()
# DAYS_EMPLOYED: divide by 365, round to one decimal, take the absolute value.
tenure_years = (train["DAYS_EMPLOYED"] / 365).round(1).abs()

train["DAYS_BIRTH"] = age_years
train["DAYS_EMPLOYED"] = tenure_years

# Rename both columns to reflect their new meaning.
train = train.rename(columns={"DAYS_BIRTH": "AGE", "DAYS_EMPLOYED": "JOB_TENURE"})



In [13]:
# Frequency of each occupation category after the random fill of missing values.
train['OCCUPATION_TYPE'].value_counts()

Laborers                 80299
Sales staff              46810
Core staff               40133
Managers                 31192
Drivers                  27105
High skill tech staff    16594
Accountants              14383
Medicine staff           12443
Security staff            9748
Cooking staff             8627
Cleaning staff            6729
Private service staff     3853
Low-skill Laborers        3058
Secretaries               1961
Waiters/barmen staff      1928
Realty agents             1078
HR staff                   818
IT staff                   752
Name: OCCUPATION_TYPE, dtype: int64

In [14]:
# train.to_csv('D:/project/homecredit/final/v3/application_train_ETL_v3.csv',index = False)

In [20]:
# Show the column names of the processed frame.
train.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'AGE', 'JOB_TENURE',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'HOUR_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'DAYS_EMPLOYED_365243'],
      dtype='object')