In [1]:
# import os
# os.chdir('../../')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scr.util import *

In [3]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.pipeline import Pipeline

In [4]:
# Load the raw train / test tables (paths are relative to the working directory).
df_train, df_test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [5]:
# Per-column count of missing values in the raw training table.
df_train.isnull().sum()

id                          0
Age                       100
TypeofContact               6
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          33
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              22
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
customer_info               0
ProdTaken                   0
dtype: int64

In [6]:
# Per-column count of missing values in the raw test table.
df_test.isnull().sum()

id                          0
Age                        93
TypeofContact              12
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          24
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              42
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
dtype: int64

# 今回調査する欠損値：*Age*, DurationOfPitch, NumberOfTrips, MonthlyIncome

In [7]:
def _clean_raw_columns(df):
    """Normalise the raw columns of one table in place and return it.

    The same steps are applied to the train and the test table, so they live
    in a single function to keep the two tables from drifting apart.

    Args:
        df: Raw table as read from the CSV. Must contain the columns touched
            below (Age, TypeofContact, DurationOfPitch, Gender,
            NumberOfFollowups, ProductPitched, NumberOfTrips, Designation,
            MonthlyIncome, customer_info).

    Returns:
        The same DataFrame object, with the columns above cleaned and the new
        columns Marry, Car and Child added.
    """
    # Cast to str first so the parser always receives text (NaN arrives as 'nan').
    # NOTE(review): missing ages appear to come back as 0 -- later cells select
    # Age == 0 as the "missing" marker; confirm against japanese_to_int.
    df['Age'] = df['Age'].astype(str).apply(japanese_to_int)

    # Missing contact type becomes its own category.
    df['TypeofContact'] = df['TypeofContact'].fillna('No')

    # Only non-null durations are converted; the assignment aligns on the index,
    # so rows that were NaN stay NaN.
    df['DurationOfPitch'] = df['DurationOfPitch'].dropna().apply(convert_to_second)

    df['Gender'] = df['Gender'].apply(normalize_gender)

    # Values of 100 or more are divided by 100 (presumably mis-scaled entries --
    # TODO confirm). NaN fails the `< 100` test, stays NaN, and is then set to 0.
    df['NumberOfFollowups'] = df['NumberOfFollowups'].apply(lambda x: x if x < 100 else x / 100)
    df['NumberOfFollowups'] = df['NumberOfFollowups'].fillna(0)

    # Two-pass normalisers: the second pass runs on the output of the first.
    df['ProductPitched'] = (
        df['ProductPitched']
        .apply(normalize_product_pitched_1)
        .apply(normalize_product_pitched_2)
    )

    df['NumberOfTrips'] = df['NumberOfTrips'].apply(normalize_trips)

    df['Designation'] = (
        df['Designation']
        .apply(normalize_designation_1)
        .apply(normalize_designation_2)
    )

    df['MonthlyIncome'] = df['MonthlyIncome'].apply(normalize_monthly_income)

    # customer_info is split into three columns, each cleaned by its own normaliser.
    df[['Marry', 'Car', 'Child']] = df['customer_info'].apply(divide_customer_info)
    df['Marry'] = df['Marry'].apply(normalize_info_1)
    df['Car'] = df['Car'].apply(normalize_info_2)
    df['Child'] = df['Child'].apply(normalize_info_3)

    return df


df_train = _clean_raw_columns(df_train)
df_test = _clean_raw_columns(df_test)

In [8]:
# Author's note on the placeholders left by the cleaning step. They are not NaN,
# so the count below no longer reports them:
#   Age           -> 0
#   NumberOfTrips -> 'nan'
df_train.isnull().sum()

id                          0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
customer_info               0
ProdTaken                   0
Marry                       0
Car                         0
Child                       0
dtype: int64

In [9]:
# Author's note on the placeholders left by the cleaning step. They are not NaN,
# so the count below no longer reports them:
#   Age           -> 0
#   NumberOfTrips -> 'nan'
df_test.isnull().sum()

id                          0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
Marry                       0
Car                         0
Child                       0
dtype: int64

In [10]:
# The raw customer_info column is removed from both tables.
drop_list = ['customer_info']

df_train, df_test = (frame.drop(columns=drop_list) for frame in (df_train, df_test))

In [11]:
# Write the cleaned tables to disk without the row index.
df_train.to_csv(path_or_buf='data/null_survey/train_null.csv', index=False)
df_test.to_csv(path_or_buf='data/null_survey/test_null.csv', index=False)

# Age

In [12]:
# Start the Age imputation from the cleaned CSV files.
df_train = pd.read_csv(filepath_or_buffer='data/null_survey/train_null.csv')
df_test = pd.read_csv(filepath_or_buffer='data/null_survey/test_null.csv')

# Stack train on top of test. The original row labels are kept, so labels repeat
# between the two halves; the train-then-test row order is what the later
# positional split of the predictions relies on.
df_all = pd.concat([df_train, df_test])

In [13]:
# Features used to predict Age.
predict_age_features = [
    'TypeofContact',
    'CityTier',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'PitchSatisfactionScore',
    'Passport',
    'Designation',
    'Marry',
    'Car',
    'Child',
]

# Row labels of the records whose Age is the 0 placeholder; used later as the
# key when the predictions are written back.
id_train = df_train[df_train['Age'].eq(0)].index
id_test = df_test[df_test['Age'].eq(0)].index

# Rows with a known Age are the training set of the Age model; rows carrying the
# 0 placeholder are the ones to predict.
X_train_predict_age = df_all.loc[df_all['Age'].ne(0), predict_age_features]
X_test_predict_age = df_all.loc[df_all['Age'].eq(0), predict_age_features]
y_train_predict_age = df_all.loc[df_all['Age'].ne(0), 'Age']
y_test_predict_age = None  # no target exists for the rows to predict

In [15]:
# Pass the Age-model feature tables through two project helpers; each call takes
# the (train, predict) pair and returns the pair that replaces it.
# NOTE(review): the helpers are not defined in this notebook (presumably they come
# from `from scr.util import *`). Judging by their names they encode categorical
# features as integers -- confirm against scr.util before relying on that.
X_train_predict_age , X_test_predict_age = mapping_first_category(X_train_predict_age, X_test_predict_age)
X_train_predict_age, X_test_predict_age = feature_to_int(X_train_predict_age, X_test_predict_age)

In [20]:
# Shuffled 5-fold split with a fixed seed, materialised as a list of
# (train_idx, valid_idx) pairs.
n_splits = 5
kf = KFold(n_splits, shuffle=True, random_state=42)
cv_list = [fold for fold in kf.split(X_train_predict_age, y_train_predict_age)]

def train_catboost(X, y, cv, params: dict = None):
    """Fit one CatBoost regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Target values, positionally aligned with ``X``.
        cv: Iterable of ``(train_idx, valid_idx)`` positional index pairs.
        params: Keyword arguments forwarded to ``cb.CatBoostRegressor``.
            ``None`` means no arguments.

    Returns:
        A tuple ``(oof_pred, models)``. ``oof_pred`` is a 1-D array of length
        ``len(X)`` holding, for every row, the prediction of the model whose
        validation fold contained it (rows in no validation fold stay 0).
        ``models`` is the list of fitted regressors in fold order.
    """
    regressor_kwargs = {} if params is None else params

    oof_pred = np.zeros(len(X))
    models = []
    for train_pos, valid_pos in cv:
        valid_X = X.iloc[valid_pos]
        model = cb.CatBoostRegressor(**regressor_kwargs)
        # The validation fold doubles as the eval set for best-iteration selection.
        model.fit(
            X.iloc[train_pos],
            y.iloc[train_pos],
            eval_set=(valid_X, y.iloc[valid_pos]),
            use_best_model=True,
            verbose=100,
        )
        oof_pred[valid_pos] = model.predict(valid_X)
        models.append(model)
    return oof_pred, models

# CatBoost settings for the Age regressor.
params = dict(
    loss_function='RMSE',
    eval_metric='RMSE',
    iterations=5000,
    learning_rate=0.005,
    depth=5,
    l2_leaf_reg=4,    # author's note: 3 ~ 10
    verbose=200,
    random_seed=42,
)

# One model per fold, plus the out-of-fold predictions on the known-Age rows.
oof, models = train_catboost(
    X=X_train_predict_age,
    y=y_train_predict_age,
    cv=cv_list,
    params=params,
)

0:	learn: 10.0184299	test: 9.8407180	best: 9.8407180 (0)	total: 1.54ms	remaining: 7.69s
100:	learn: 9.2286071	test: 9.1613531	best: 9.1613531 (100)	total: 126ms	remaining: 6.1s
200:	learn: 8.8978797	test: 8.9003201	best: 8.9003201 (200)	total: 234ms	remaining: 5.58s
300:	learn: 8.7468465	test: 8.7943495	best: 8.7943495 (300)	total: 346ms	remaining: 5.4s
400:	learn: 8.6638190	test: 8.7426543	best: 8.7426543 (400)	total: 454ms	remaining: 5.21s
500:	learn: 8.6133952	test: 8.7161579	best: 8.7161579 (500)	total: 568ms	remaining: 5.1s
600:	learn: 8.5766715	test: 8.6969008	best: 8.6969008 (600)	total: 673ms	remaining: 4.93s
700:	learn: 8.5460660	test: 8.6833837	best: 8.6833837 (700)	total: 777ms	remaining: 4.76s
800:	learn: 8.5202163	test: 8.6734925	best: 8.6734925 (800)	total: 881ms	remaining: 4.62s
900:	learn: 8.4950034	test: 8.6671895	best: 8.6671895 (900)	total: 995ms	remaining: 4.53s
1000:	learn: 8.4743768	test: 8.6622049	best: 8.6622049 (1000)	total: 1.11s	remaining: 4.43s
1100:	learn: 

In [21]:
# Ensemble: average the fold models' predictions for the rows with missing Age.
# Each prediction is divided by the model count before being added to the sum.
y_test_predict_age = np.zeros(X_test_predict_age.shape[0])
for model in models:
    y_test_predict_age = y_test_predict_age + model.predict(X_test_predict_age) / len(models)

# Wrap the averaged vector in a one-column table.
y_test_predict_age = pd.DataFrame(y_test_predict_age, columns=['Age'])

In [22]:
# Split the predictions back into a train part and a test part: the first
# len(id_train) values go to train, the remainder to test.
predicted_train_age = y_test_predict_age['Age'].iloc[:len(id_train)]
predicted_test_age = y_test_predict_age['Age'].iloc[len(id_train):]

# Re-key each part with the row labels (primary key) of the records it belongs to.
predicted_train_age.index = id_train
predicted_test_age.index = id_test

In [23]:
# Fill the missing ages (0 placeholder) with the rounded predictions; the
# assignment aligns on the row labels set on the prediction Series.
df_train.loc[df_train['Age'].eq(0), 'Age'] = predicted_train_age.round()
df_test.loc[df_test['Age'].eq(0), 'Age'] = predicted_test_age.round()

In [24]:
# Preview the first five training rows after the Age fill.
df_train.head(n=5)

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,ProdTaken,Marry,Car,Child
0,0,50,Self Enquiry,2,900.0,Large Business,male,1.0,4.0,Basic,3.0,5.0,1,4,Executive,253905.0,1,Single,No Car,0_child
1,1,56,Company Invited,1,840.0,Salaried,male,1.0,4.0,Standard,3.0,2.0,1,4,Senior Manager,404475.0,0,Divorced,Has Car,0_child
2,2,35,Self Enquiry,1,600.0,Large Business,female,1.0,3.0,Basic,3.0,4.0,0,4,Executive,278145.0,1,Married,No Car,0_child
3,3,37,Self Enquiry,2,1080.0,Small Business,female,1.0,3.0,Standard,4.0,1.0,0,5,Senior Manager,326805.0,0,Divorced,Has Car,0_child
4,4,48,Company Invited,3,1020.0,Small Business,female,1.0,3.0,Basic,4.0,4.0,0,4,Executive,258435.0,1,Single,Has Car,0_child


In [25]:
# Preview test rows at positions 20-25 after the Age fill.
df_test.iloc[20:26]

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,Marry,Car,Child
20,3509,36,Company Invited,1,960.0,Large Business,male,1.0,3.0,Basic,4.0,4.0,0,3,Executive,300000.0,Single,No Car,0_child
21,3510,39,Company Invited,2,540.0,Large Business,female,1.0,3.0,Basic,3.0,1.0,0,3,Executive,256410.0,Single,No Car,0_child
22,3511,33,Company Invited,1,1080.0,Small Business,male,1.0,3.0,Basic,4.0,2.0,0,4,Executive,259410.0,Single,Has Car,0_child
23,3512,40,Self Enquiry,2,900.0,Small Business,male,1.0,3.0,Deluxe,4.0,4.0,1,4,AVP,400000.0,Single,Has Car,0_child
24,3513,37,Company Invited,1,900.0,Small Business,female,1.0,3.0,Deluxe,3.0,4.0,1,4,Manager,300000.0,Single,No Car,0_child
25,3514,38,Self Enquiry,3,540.0,Small Business,female,1.0,3.0,Standard,3.0,1.0,1,3,Manager,362850.0,Married,Has Car,0_child


In [26]:
def age_to_agegroup(age):
    """Map a numeric age to a decade label.

    Args:
        age: Age in years. ``0`` is treated as the "missing" placeholder, and
            NaN / None are treated as missing as well.

    Returns:
        ``np.nan`` for a missing age; otherwise one of ``"10s"``, ``"20s"``,
        ``"30s"``, ``"40s"``, ``"50s"``, ``"60s"``. Every age below 20 maps to
        ``"10s"`` and every age of 60 or more maps to ``"60s"``.
    """
    # NaN fails every `<` comparison below, so without this guard a missing
    # age would silently fall through to the last bucket ("60s").
    if pd.isna(age) or age == 0:
        return np.nan

    # Exclusive upper bound of each bucket, in ascending order.
    for upper_bound, label in ((20, "10s"), (30, "20s"), (40, "30s"), (50, "40s"), (60, "50s")):
        if age < upper_bound:
            return label
    return "60s"

# Derive the decade label from Age for both tables.
df_train['AgeGroup'] = df_train['Age'].map(age_to_agegroup)
df_test['AgeGroup'] = df_test['Age'].map(age_to_agegroup)

In [27]:
# Write the tables with Age filled in (and AgeGroup added) to disk, without the row index.
df_train.to_csv(path_or_buf='data/null_survey/train_age_ok.csv', index=False)
df_test.to_csv(path_or_buf='data/null_survey/test_age_ok.csv', index=False)