In [1]:
import os
# Move the working directory two levels up; the relative 'data/...' paths used
# below are resolved from there (presumably the project root - confirm).
# NOTE(review): a relative chdir is not idempotent - re-running this cell moves
# up two more levels each time.
os.chdir('../../')

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scr.util import *

In [28]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.pipeline import Pipeline

In [166]:
# Load the raw training and test tables from the data directory.
df_train, df_test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [167]:
# Per-column count of missing values in the raw training data.
df_train.isna().sum()

id                          0
Age                       100
TypeofContact               6
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          33
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              22
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
customer_info               0
ProdTaken                   0
dtype: int64

In [168]:
# Per-column count of missing values in the raw test data.
df_test.isna().sum()

id                          0
Age                        93
TypeofContact              12
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          24
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              42
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
dtype: int64

# 学習データのうち、4つの列（Age, DurationOfPitch, NumberOfTrips, MonthlyIncome：連続値で、欠損がランダムに発生したもの）のいずれかが欠損している行を削除する

In [169]:
# Keep only training rows where all four of these columns are present.
required_columns = ['Age', 'DurationOfPitch', 'NumberOfTrips', 'MonthlyIncome']
df_train = df_train.dropna(subset=required_columns)

In [170]:
# Re-check the per-column missing counts of the training data after the row drop.
df_train.isna().sum()

id                         0
Age                        0
TypeofContact              2
CityTier                   0
DurationOfPitch            0
Occupation                 0
Gender                     0
NumberOfPersonVisiting     0
NumberOfFollowups         29
ProductPitched             0
PreferredPropertyStar      0
NumberOfTrips              0
Passport                   0
PitchSatisfactionScore     0
Designation                0
MonthlyIncome              0
customer_info              0
ProdTaken                  0
dtype: int64

# 今回調査する欠損値：*Age*, DurationOfPitch, NumberOfTrips, MonthlyIncome

In [171]:
def _rescale_followups(count):
    """Return `count` unchanged when below 100, otherwise `count / 100`."""
    return count if count < 100 else count / 100


# Run the identical column clean-up on the training frame, then on the test
# frame. The normalisation helpers are the ones pulled in by the star import
# from scr.util.
for frame in (df_train, df_test):
    # Age is cast to str before being parsed by japanese_to_int.
    frame['Age'] = frame['Age'].astype(str).apply(japanese_to_int)

    frame['TypeofContact'] = frame['TypeofContact'].fillna('No')

    # Only non-missing durations are converted; the assignment aligns on the
    # index, so rows that were missing stay NaN.
    frame['DurationOfPitch'] = frame['DurationOfPitch'].dropna().apply(convert_to_second)

    frame['Gender'] = frame['Gender'].apply(normalize_gender)

    # Rescale values of 100 or more, then treat missing follow-ups as 0.
    frame['NumberOfFollowups'] = frame['NumberOfFollowups'].apply(_rescale_followups).fillna(0)

    # Two-pass normalisation, applied in the original order.
    frame['ProductPitched'] = (
        frame['ProductPitched']
        .apply(normalize_product_pitched_1)
        .apply(normalize_product_pitched_2)
    )

    frame['NumberOfTrips'] = frame['NumberOfTrips'].apply(normalize_trips)

    frame['Designation'] = (
        frame['Designation']
        .apply(normalize_designation_1)
        .apply(normalize_designation_2)
    )

    frame['MonthlyIncome'] = frame['MonthlyIncome'].apply(normalize_monthly_income)

    # Split customer_info into three columns, then normalise each one.
    frame[['Marry', 'Car', 'Child']] = frame['customer_info'].apply(divide_customer_info)
    frame['Marry'] = frame['Marry'].apply(normalize_info_1)
    frame['Car'] = frame['Car'].apply(normalize_info_2)
    frame['Child'] = frame['Child'].apply(normalize_info_3)

In [172]:
# Training data: rows with missing values have already been dropped.

df_train.isna().sum()

id                        0
Age                       0
TypeofContact             0
CityTier                  0
DurationOfPitch           0
Occupation                0
Gender                    0
NumberOfPersonVisiting    0
NumberOfFollowups         0
ProductPitched            0
PreferredPropertyStar     0
NumberOfTrips             0
Passport                  0
PitchSatisfactionScore    0
Designation               0
MonthlyIncome             0
customer_info             0
ProdTaken                 0
Marry                     0
Car                       0
Child                     0
dtype: int64

In [173]:
# Author's note on why these two columns no longer count as missing in the test data:
# Age : missing values now appear as 0
# NumberOfTrips : missing values now appear as 'nan' (per the original note - TODO confirm)

df_test.isna().sum()

id                          0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
Marry                       0
Car                         0
Child                       0
dtype: int64

In [174]:
# customer_info has been split into Marry / Car / Child, so the raw column is dropped.
drop_list = ['customer_info']

df_train, df_test = (frame.drop(columns=drop_list) for frame in (df_train, df_test))

In [175]:
# Persist the cleaned frames for the missing-value survey.
# to_csv does not create parent directories, so make sure the output folder
# exists first (a no-op when it is already there).
os.makedirs('data/null_survey', exist_ok=True)
df_train.to_csv('data/null_survey/train_null.csv', index=False)
df_test.to_csv('data/null_survey/test_null.csv', index=False)

# Age (学習用データは、欠損値を削除している)

In [176]:
# Reload the cleaned frames saved earlier and stack them (training rows first).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/null_survey/train_null.csv', 'data/null_survey/test_null.csv')
)

df_all = pd.concat([df_train, df_test])

In [177]:
# Features used to predict Age.
predict_age_features = ['TypeofContact','CityTier','Occupation','Gender','NumberOfPersonVisiting','NumberOfFollowups','ProductPitched','PreferredPropertyStar','PitchSatisfactionScore', 'Passport','Designation' ,'Marry', 'Car','Child']

# Age == 0 marks the rows whose age has to be predicted.
age_known_mask = df_all['Age'] != 0
age_missing_in_test = df_test['Age'] == 0

# ids (primary key) of the test rows to impute, in df_test row order.
id_test = df_test.loc[age_missing_in_test, 'id'].values

# Fit on every row (train + test) with a known Age. The rows to predict are
# taken from df_test with the same mask as id_test, so predictions and ids are
# guaranteed to line up one-to-one even if a training row has Age == 0.
X_train_predict_age = df_all.loc[age_known_mask, predict_age_features]
X_test_predict_age = df_test.loc[age_missing_in_test, predict_age_features]
y_train_predict_age = df_all.loc[age_known_mask, 'Age']
y_test_predict_age = None  # placeholder; filled with the ensemble prediction later

In [178]:
# Prepare the feature tables for the regressor with two helpers that are not
# defined in this notebook (presumably provided by `from scr.util import *`).
# NOTE(review): judging by the names, these map categorical columns to numeric
# codes and cast features to int - confirm against scr.util.
X_train_predict_age = mapping_columns_if_exist(X_train_predict_age)
X_test_predict_age = mapping_columns_if_exist(X_test_predict_age)
X_train_predict_age, X_test_predict_age = feature_to_int(X_train_predict_age, X_test_predict_age)

In [179]:
# Shuffled 5-fold split with a fixed seed; materialised as a list so the same
# folds can be reused.
n_splits = 5
kf = KFold(n_splits, shuffle=True, random_state=42)
cv_list = [
    (train_idx, valid_idx)
    for train_idx, valid_idx in kf.split(X_train_predict_age, y_train_predict_age)
]

def train_catboost(X, y, cv, params: dict = None):
    """Fit one CatBoostRegressor per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Target values aligned with ``X``; also selected through ``.iloc``.
        cv: Iterable of ``(train_idx, valid_idx)`` positional index pairs.
        params: Keyword arguments forwarded to ``cb.CatBoostRegressor``.
            ``None`` means no arguments.

    Returns:
        A tuple ``(oof_pred, models)``. ``oof_pred`` is a 1-D array of length
        ``len(X)`` holding, for each row, the prediction of the model for which
        that row was validation data (rows never used for validation stay 0).
        ``models`` is the list of fitted regressors in fold order.
    """
    regressor_kwargs = {} if params is None else params

    fold_models = []
    oof_pred = np.zeros(len(X))
    for train_rows, valid_rows in cv:
        x_fit, y_fit = X.iloc[train_rows], y.iloc[train_rows]
        x_val, y_val = X.iloc[valid_rows], y.iloc[valid_rows]

        regressor = cb.CatBoostRegressor(**regressor_kwargs)
        # The validation fold doubles as the eval set used for best-model selection.
        regressor.fit(x_fit, y_fit, eval_set=(x_val, y_val), use_best_model=True, verbose=100)

        oof_pred[valid_rows] = regressor.predict(x_val)
        fold_models.append(regressor)
    return oof_pred, fold_models

# CatBoost settings for the Age regressor: quantile loss at alpha=0.5 for both
# training and evaluation, with many small-step iterations.
# NOTE(review): train_catboost passes verbose=100 to fit(), and the log below
# advances in steps of 100, so the verbose=200 given here does not appear to
# take effect.
params = dict(
    loss_function='Quantile:alpha=0.5',
    eval_metric='Quantile:alpha=0.5',
    iterations=5000,
    learning_rate=0.005,
    depth=5,
    l2_leaf_reg=4,    # 3 ~ 10
    verbose=200,
    random_seed=42,
)

# One model per fold, plus out-of-fold predictions for the rows with a known Age.
oof, models = train_catboost(X_train_predict_age, y_train_predict_age, cv_list, params)

0:	learn: 4.1350895	test: 4.2680948	best: 4.2680948 (0)	total: 1.76ms	remaining: 8.82s
100:	learn: 3.7707813	test: 3.9297618	best: 3.9297618 (100)	total: 153ms	remaining: 7.42s
200:	learn: 3.6142832	test: 3.7869050	best: 3.7869050 (200)	total: 301ms	remaining: 7.19s
300:	learn: 3.5369587	test: 3.7238032	best: 3.7238032 (300)	total: 438ms	remaining: 6.84s
400:	learn: 3.4915857	test: 3.6918799	best: 3.6918799 (400)	total: 589ms	remaining: 6.75s
500:	learn: 3.4582784	test: 3.6674637	best: 3.6674637 (500)	total: 741ms	remaining: 6.66s
600:	learn: 3.4361520	test: 3.6541932	best: 3.6541932 (600)	total: 890ms	remaining: 6.51s
700:	learn: 3.4166347	test: 3.6439980	best: 3.6439980 (700)	total: 1.03s	remaining: 6.35s
800:	learn: 3.4000807	test: 3.6382455	best: 3.6382455 (800)	total: 1.18s	remaining: 6.2s
900:	learn: 3.3864671	test: 3.6340272	best: 3.6340272 (900)	total: 1.32s	remaining: 6.02s
1000:	learn: 3.3741632	test: 3.6311471	best: 3.6311471 (1000)	total: 1.48s	remaining: 5.92s
1100:	learn:

In [180]:
# Ensemble: every fold model contributes an equal share (1 / number of models)
# of its prediction for the rows whose Age is unknown.
n_models = len(models)
y_test_predict_age = np.zeros(len(X_test_predict_age))

for model in models:
    y_test_predict_age = y_test_predict_age + model.predict(X_test_predict_age) / n_models

# y_test_predict_age = pd.DataFrame({
#     'Age': y_test_predict_age
# })

In [22]:
# # Split the predictions into a training part and a test part
# predicted_train_age = y_test_predict_age.iloc[:len(id_train), 0]
# predicted_test_age = y_test_predict_age.iloc[len(id_train): , 0]

# # Attach the ids (primary key).
# predicted_train_age.index = id_train
# predicted_test_age.index = id_test

In [181]:
# Peek at test rows 200-204 (by position) before imputation; the rows with
# Age == 0 are the ones to fill.
df_test.iloc[200:205]

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,Marry,Car,Child
200,3689,0,Company Invited,1,540.0,Large Business,male,2.0,3.0,Basic,3.0,4.0,0,4,Executive,265770.0,Married,Has Car,0_child
201,3690,33,Company Invited,2,1080.0,Small Business,male,2.0,3.0,Basic,3.0,4.0,0,4,Executive,250000.0,Divorced,No Car,0_child
202,3691,0,Self Enquiry,1,480.0,Small Business,female,2.0,3.0,Deluxe,3.0,8.0,0,4,Manager,291360.0,Single,No Car,0_child
203,3692,49,Company Invited,2,780.0,Large Business,male,2.0,3.0,Standard,3.0,7.0,0,3,Senior Manager,379755.0,Married,No Car,0_child
204,3693,36,Company Invited,1,900.0,Large Business,female,2.0,3.0,Basic,3.0,5.0,0,4,Executive,251790.0,Divorced,No Car,0_child


In [182]:
# Write the rounded ensemble prediction into Age for each test row to impute,
# matching rows by id. zip() stops at the shorter input without complaint, so
# a length mismatch would pair ids with the wrong predictions; fail loudly
# instead.
if len(id_test) != len(y_test_predict_age):
    raise ValueError(
        f"id/prediction length mismatch: {len(id_test)} ids vs "
        f"{len(y_test_predict_age)} predictions"
    )

for row_id, predicted_age in zip(id_test, y_test_predict_age):
    df_test.loc[df_test['id'] == row_id, 'Age'] = round(predicted_age)

In [139]:
# Fill the missing values with the predicted values
# df_train.loc[df_train['Age'] == 0, 'Age'] = round(predicted_train_age)
# df_test.loc[df_test['Age'].isna(), 'Age'] = round(y_test_predict_age)

In [183]:
# Same five test rows (positions 200-204) after imputation, to confirm the
# Age == 0 entries were replaced.
df_test.iloc[200:205]

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,Marry,Car,Child
200,3689,33,Company Invited,1,540.0,Large Business,male,2.0,3.0,Basic,3.0,4.0,0,4,Executive,265770.0,Married,Has Car,0_child
201,3690,33,Company Invited,2,1080.0,Small Business,male,2.0,3.0,Basic,3.0,4.0,0,4,Executive,250000.0,Divorced,No Car,0_child
202,3691,39,Self Enquiry,1,480.0,Small Business,female,2.0,3.0,Deluxe,3.0,8.0,0,4,Manager,291360.0,Single,No Car,0_child
203,3692,49,Company Invited,2,780.0,Large Business,male,2.0,3.0,Standard,3.0,7.0,0,3,Senior Manager,379755.0,Married,No Car,0_child
204,3693,36,Company Invited,1,900.0,Large Business,female,2.0,3.0,Basic,3.0,5.0,0,4,Executive,251790.0,Divorced,No Car,0_child


In [184]:
def age_to_agegroup(age):
    """Map a numeric age to a decade label.

    Args:
        age: Age as a number. ``0`` is the placeholder for an unknown age;
            NaN / None are treated as unknown as well.

    Returns:
        ``np.nan`` for an unknown age, otherwise one of ``"10s"``, ``"20s"``,
        ``"30s"``, ``"40s"``, ``"50s"``, ``"60s"``. Every age below 20 maps to
        ``"10s"`` and every age of 60 or above maps to ``"60s"``.
    """
    # NaN fails every comparison below, so without this guard a missing age
    # would fall through to the final "60s" bucket.
    if pd.isna(age) or age == 0:
        return np.nan

    # (exclusive upper bound, label), checked in ascending order.
    for upper_bound, label in ((20, "10s"), (30, "20s"), (40, "30s"), (50, "40s"), (60, "50s")):
        if age < upper_bound:
            return label
    return "60s"

# Derive the decade label column from Age on both frames.
for frame in (df_train, df_test):
    frame['AgeGroup'] = frame['Age'].apply(age_to_agegroup)

In [185]:
# Save both frames, now carrying the AgeGroup column and the imputed test Age
# values, without the index column.
for frame, csv_path in (
    (df_train, 'data/null_survey/train_age_ok.csv'),
    (df_test, 'data/null_survey/test_age_ok.csv'),
):
    frame.to_csv(csv_path, index=False)