# テストデータの欠損値を代表値で補完する

In [1]:
import os
os.chdir('../../')

In [6]:
import numpy as np
import pandas as pd
from scr.util import *

In [40]:
df_test = pd.read_csv('data/test.csv')

In [41]:
df_test.isna().sum()

id                          0
Age                        93
TypeofContact              12
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          24
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              42
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
dtype: int64

## 正規化 + 欠損値補完 (意味がある特徴量)

In [42]:
#df_train['Age'] = df_train['Age'].astype(str).apply(japanese_to_int)
df_test['Age'] = df_test['Age'].astype(str).apply(japanese_to_int)

#df_train['TypeofContact'] = df_train['TypeofContact'].fillna('No')
df_test['TypeofContact'] = df_test['TypeofContact'].fillna('No')

#df_train['DurationOfPitch'] = df_train['DurationOfPitch'].dropna().apply(convert_to_second)
df_test['DurationOfPitch'] = df_test['DurationOfPitch'].dropna().apply(convert_to_second)

#df_train['Gender'] = df_train['Gender'].apply(normalize_gender)
df_test['Gender'] = df_test['Gender'].apply(normalize_gender)

#df_train['NumberOfFollowups'] = df_train['NumberOfFollowups'].apply(lambda x: x if x < 100 else x / 100)
df_test['NumberOfFollowups'] = df_test['NumberOfFollowups'].apply(lambda x: x if x < 100 else x / 100)
#df_train['NumberOfFollowups'] = df_train['NumberOfFollowups'].fillna(0)
df_test['NumberOfFollowups'] = df_test['NumberOfFollowups'].fillna(0)

#df_train['ProductPitched'] = df_train['ProductPitched'].apply(normalize_product_pitched_1)
#df_train['ProductPitched'] = df_train['ProductPitched'].apply(normalize_product_pitched_2)
df_test['ProductPitched'] = df_test['ProductPitched'].apply(normalize_product_pitched_1)
df_test['ProductPitched'] = df_test['ProductPitched'].apply(normalize_product_pitched_2)

#df_train['NumberOfTrips'] = df_train['NumberOfTrips'].apply(normalize_trips)
df_test['NumberOfTrips'] = df_test['NumberOfTrips'].apply(normalize_trips)

#df_train['Designation'] = df_train['Designation'].apply(normalize_designation_1)
#df_train['Designation'] = df_train['Designation'].apply(normalize_designation_2)
df_test['Designation'] = df_test['Designation'].apply(normalize_designation_1)
df_test['Designation'] = df_test['Designation'].apply(normalize_designation_2)

#df_train['MonthlyIncome'] = df_train['MonthlyIncome'].apply(normalize_monthly_income)
df_test['MonthlyIncome'] = df_test['MonthlyIncome'].apply(normalize_monthly_income)

#df_train[['Marry', 'Car', 'Child']] = df_train['customer_info'].apply(divide_customer_info)
df_test[['Marry', 'Car', 'Child']] = df_test['customer_info'].apply(divide_customer_info)
#df_train['Marry'] = df_train['Marry'].apply(normalize_info_1)
df_test['Marry'] = df_test['Marry'].apply(normalize_info_1)
#df_train['Car'] = df_train['Car'].apply(normalize_info_2)
df_test['Car'] = df_test['Car'].apply(normalize_info_2)
#df_train['Child'] = df_train['Child'].apply(normalize_info_3)
df_test['Child'] = df_test['Child'].apply(normalize_info_3)

In [43]:
drop_list = ['customer_info']

#df_train = df_train.drop(columns=drop_list)
df_test = df_test.drop(columns=drop_list)

## 中央値で補完する

In [44]:
tmp = df_test.loc[df_test['Age'] != 0, 'Age']
df_test.loc[df_test['Age'] == 0, 'Age'] = tmp.median()

In [45]:
tmp = df_test.loc[~(df_test['DurationOfPitch'].isna()), 'DurationOfPitch']
df_test.loc[df_test['DurationOfPitch'].isna(), 'DurationOfPitch'] = tmp.median()

In [46]:
tmp = df_test.loc[~(df_test['NumberOfTrips'] == 'nan'), 'NumberOfTrips']
df_test.loc[df_test['NumberOfTrips'] == 'nan', 'NumberOfTrips'] = tmp.median()

In [47]:
tmp = df_test.loc[~(df_test['MonthlyIncome'].isna()), 'MonthlyIncome']
df_test.loc[df_test['MonthlyIncome'].isna(), 'MonthlyIncome'] = tmp.median()

# 数値変換 (DurationOfPitch, MonthlyIncome)

In [48]:
df_test['DurationOfPitch'] = np.log1p(df_test['DurationOfPitch'])
df_test['MonthlyIncome'] = np.log1p(df_test['MonthlyIncome'])

## 年代特徴量作成

In [49]:
def age_to_agegroup(age):
    if age == 0:
        return np.nan    
    elif age < 20:
        return "10s"
    elif age < 30:
        return "20s"
    elif age < 40:
        return "30s"
    elif age < 50:
        return "40s"
    elif age < 60:
        return "50s"
    else:
        return "60s"

# df_train['AgeGroup'] = df_train['Age'].apply(age_to_agegroup)
df_test['AgeGroup'] = df_test['Age'].apply(age_to_agegroup)

# CSV出力

In [50]:
df_test.to_csv('data/feature_engineered/null_del_xxx/test_null_median.csv', index=False)

# 訓練データ (欠損値削除)

In [66]:
df_train = pd.read_csv('data/train.csv')

In [67]:
df_train.isna().sum()

id                          0
Age                       100
TypeofContact               6
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          33
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              22
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
customer_info               0
ProdTaken                   0
dtype: int64

In [68]:
df_train = df_train.dropna(subset=['Age', 'DurationOfPitch', 'NumberOfFollowups', 'NumberOfTrips', 'MonthlyIncome'])

In [69]:
df_train['Age'] = df_train['Age'].astype(str).apply(japanese_to_int)
#df_test['Age'] = df_test['Age'].astype(str).apply(japanese_to_int)

df_train['TypeofContact'] = df_train['TypeofContact'].fillna('No')
#df_test['TypeofContact'] = df_test['TypeofContact'].fillna('No')

df_train['DurationOfPitch'] = df_train['DurationOfPitch'].dropna().apply(convert_to_second)
#df_test['DurationOfPitch'] = df_test['DurationOfPitch'].dropna().apply(convert_to_second)

df_train['Gender'] = df_train['Gender'].apply(normalize_gender)
#df_test['Gender'] = df_test['Gender'].apply(normalize_gender)

df_train['NumberOfFollowups'] = df_train['NumberOfFollowups'].apply(lambda x: x if x < 100 else x / 100)
#df_test['NumberOfFollowups'] = df_test['NumberOfFollowups'].apply(lambda x: x if x < 100 else x / 100)
df_train['NumberOfFollowups'] = df_train['NumberOfFollowups'].fillna(0)
#df_test['NumberOfFollowups'] = df_test['NumberOfFollowups'].fillna(0)

df_train['ProductPitched'] = df_train['ProductPitched'].apply(normalize_product_pitched_1)
df_train['ProductPitched'] = df_train['ProductPitched'].apply(normalize_product_pitched_2)
#df_test['ProductPitched'] = df_test['ProductPitched'].apply(normalize_product_pitched_1)
#df_test['ProductPitched'] = df_test['ProductPitched'].apply(normalize_product_pitched_2)

df_train['NumberOfTrips'] = df_train['NumberOfTrips'].apply(normalize_trips)
#df_test['NumberOfTrips'] = df_test['NumberOfTrips'].apply(normalize_trips)

df_train['Designation'] = df_train['Designation'].apply(normalize_designation_1)
df_train['Designation'] = df_train['Designation'].apply(normalize_designation_2)
#df_test['Designation'] = df_test['Designation'].apply(normalize_designation_1)
#df_test['Designation'] = df_test['Designation'].apply(normalize_designation_2)

df_train['MonthlyIncome'] = df_train['MonthlyIncome'].apply(normalize_monthly_income)
#df_test['MonthlyIncome'] = df_test['MonthlyIncome'].apply(normalize_monthly_income)

df_train[['Marry', 'Car', 'Child']] = df_train['customer_info'].apply(divide_customer_info)
#df_test[['Marry', 'Car', 'Child']] = df_test['customer_info'].apply(divide_customer_info)
df_train['Marry'] = df_train['Marry'].apply(normalize_info_1)
#df_test['Marry'] = df_test['Marry'].apply(normalize_info_1)
df_train['Car'] = df_train['Car'].apply(normalize_info_2)
#df_test['Car'] = df_test['Car'].apply(normalize_info_2)
df_train['Child'] = df_train['Child'].apply(normalize_info_3)
#df_test['Child'] = df_test['Child'].apply(normalize_info_3)

In [70]:
drop_list = ['customer_info']

df_train = df_train.drop(columns=drop_list)
#df_test = df_test.drop(columns=drop_list)

In [71]:
df_train['DurationOfPitch'] = np.log1p(df_test['DurationOfPitch'])
df_train['MonthlyIncome'] = np.log1p(df_test['MonthlyIncome'])

In [72]:
def age_to_agegroup(age):
    if age == 0:
        return np.nan    
    elif age < 20:
        return "10s"
    elif age < 30:
        return "20s"
    elif age < 40:
        return "30s"
    elif age < 50:
        return "40s"
    elif age < 60:
        return "50s"
    else:
        return "60s"

# df_train['AgeGroup'] = df_train['Age'].apply(age_to_agegroup)
df_train['AgeGroup'] = df_test['Age'].apply(age_to_agegroup)

In [73]:
df_train

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,...,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,ProdTaken,Marry,Car,Child,AgeGroup
0,0,50,Self Enquiry,2,1.110570,Large Business,male,1.0,4.0,Basic,...,5,1,4,Executive,1.293994,1,Single,No Car,0_child,40s
1,1,56,Company Invited,1,1.107109,Salaried,male,1.0,4.0,Standard,...,2,1,4,Senior Manager,1.283962,0,Divorced,Has Car,0_child,30s
3,3,37,Self Enquiry,2,1.082477,Small Business,female,1.0,3.0,Standard,...,1,0,5,Senior Manager,1.281021,0,Divorced,Has Car,0_child,20s
4,4,48,Company Invited,3,1.082477,Small Business,female,1.0,3.0,Basic,...,4,0,4,Executive,1.281718,1,Single,Has Car,0_child,40s
5,5,19,Self Enquiry,2,1.110570,Small Business,male,1.0,3.0,Basic,...,2,0,4,Executive,1.286531,0,Single,No Car,0_child,40s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3484,3484,40,Self Enquiry,2,1.113732,Salaried,male,2.0,3.0,Basic,...,3,0,1,Executive,1.281176,1,Divorced,Has Car,0_child,40s
3485,3485,40,Self Enquiry,1,1.149806,Large Business,male,3.0,3.0,Basic,...,5,0,3,Executive,1.287063,0,Married,Has Car,2_child,40s
3486,3486,31,Self Enquiry,1,1.130353,Small Business,female,3.0,2.0,Standard,...,5,0,4,Senior Manager,1.281808,0,Single,Has Car,1_child,20s
3487,3487,56,Company Invited,2,1.094260,Salaried,male,3.0,6.0,King,...,7,1,4,VP,1.281992,1,Married,No Car,2_child,20s


In [75]:
df_train.to_csv('data/feature_engineered/null_del_xxx/train_nonnull.csv', index=False)