In [2]:
import os
os.chdir('../../')

In [3]:
import numpy as np
import pandas as pd

from scr.util import *
from scr.feature_engineering.engineering import *

In [4]:
df_train = pd.read_csv('data/feature_engineered/null_representative/train_null_del_beta.csv')
df_test = pd.read_csv('data/feature_engineered/null_cat/test_null_cat.csv')

In [4]:
display(df_train.iloc[:5, 21:])
display(df_test.iloc[:5, 20:])

Unnamed: 0,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,NumberOfTrips_log,TravelCost,...,Monetary,MonetarySeg,FreaqencySeg,ContractRate_FM,ContractRate_G1,ContractRate_G2,ContractRate_G3,ContractRate_G4,ContractRate_G5,ContractRate_G6
0,0,14,32.35627,0,48,2.351729,9.572861,16.42703,1.791759,2.791759,...,22.297944,3,1,0.141921,0.244444,0.5,0.454545,0.192308,0.454545,0.197368
1,0,14,50.350356,0,8,2.375794,8.197046,18.461797,1.098612,2.098612,...,14.183467,2,0,0.135781,0.108844,0.25,0.2,0.097046,0.195122,0.095238
2,0,3,37.607704,0,24,1.875412,7.597516,16.54739,1.609438,2.609438,...,20.175755,3,0,0.146226,0.212389,0.171569,0.223382,0.26186,0.228512,0.266667
3,0,3,25.394244,0,8,2.147262,9.301921,14.855633,0.693147,2.539721,...,8.800974,1,0,0.180328,0.032258,0.012048,0.058091,0.068441,0.055556,0.065693
4,0,3,12.462403,0,48,1.731967,11.868956,12.337779,1.609438,3.914157,...,20.057464,3,0,0.146226,0.084746,0.056604,0.185185,0.22905,0.186747,0.232044


Unnamed: 0,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,NumberOfTrips_log,TravelCost,...,Monetary,MonetarySeg,FreaqencySeg,ContractRate_FM,ContractRate_G1,ContractRate_G2,ContractRate_G3,ContractRate_G4,ContractRate_G5,ContractRate_G6
0,0,4,34.102241,0,0,1.801646,8.773409,16.526471,2.079442,3.079442,...,27.274468,3,1,0.141921,0.0,0.0,0.0,0.0,0.044944,0.05
1,0,14,25.223082,0,8,1.823531,8.81926,14.755503,1.609438,2.609438,...,20.297492,3,0,0.146226,0.032258,0.2,0.181818,0.068441,0.181818,0.065693
2,0,4,37.405322,0,24,1.430065,8.312294,15.086813,0.693147,1.693147,...,8.642465,1,0,0.180328,0.311594,0.285714,0.383871,0.416201,0.391447,0.422096
3,0,4,32.416696,0,24,1.489417,9.134037,17.829183,0.693147,2.539721,...,8.642131,1,0,0.180328,0.481818,0.451087,0.383871,0.416201,0.108108,0.154762
4,0,4,48.757169,0,48,1.985889,7.937675,15.127224,0.693147,1.693147,...,8.665614,1,0,0.180328,0.3,0.247706,0.185185,0.22905,0.186747,0.232044


# 特徴量作成

## TypeOfContactNuLL
TypeOfContact が欠損値の場合を、明らかに学習させるための特徴量

In [5]:
df_train['TypeofContactNULL'] = df_train['TypeofContact'].apply(make_TypeOfContactNULL)
df_test['TypeofContactNULL'] = df_test['TypeofContact'].apply(make_TypeOfContactNULL)

## Motivation
顧客の旅行に対する意欲を表す特徴量

In [6]:
df_train['Motivation'] = df_train['NumberOfPersonVisiting'] * df_train['NumberOfFollowups'] + (df_train['Passport'] * 10)
df_test['Motivation'] = df_test['NumberOfPersonVisiting'] * df_test['NumberOfFollowups'] + (df_test['Passport'] * 10)

## EconomicPower
顧客の経済力を表す特徴量

In [7]:
df_train['EconomicPower'] = df_train['MonthlyIncome'] * (4 - df_train['CityTier']) * df_train['Gender'].apply(make_motivation_gender)
df_test['EconomicPower'] = df_test['MonthlyIncome'] * (4 - df_test['CityTier']) * df_test['Gender'].apply(make_motivation_gender)

## Child01
顧客に子どもがいるかどうかを表す特徴量

In [8]:
df_train['Child01'] = df_train['Child'].apply(make_child01)
df_test['Child01'] = df_test['Child'].apply(make_child01)

## TripEasier
旅行の行きやすさを表す特徴量

In [9]:
df_train['TripEasier'] = (5 - df_train['NumberOfPersonVisiting']) * df_train['ProductPitched'].apply(make_TripEasier_ProductPitched) * df_train['Marry'].apply(make_TripEasier_Marry) * df_train['Child'].apply(make_TripEasier_Child)
df_test['TripEasier'] = (5 - df_test['NumberOfPersonVisiting']) * df_test['ProductPitched'].apply(make_TripEasier_ProductPitched) * df_test['Marry'].apply(make_TripEasier_Marry) * df_test['Child'].apply(make_TripEasier_Child)

## SalesPerformance
営業担当者の単位時間当たりのパフォーマンス

In [10]:
df_train['SalesPerformance'] = df_train['NumberOfFollowups'] * df_train['PitchSatisfactionScore'] / df_train['DurationOfPitch']
df_test['SalesPerformance'] = df_train['NumberOfFollowups'] * df_test['PitchSatisfactionScore'] / df_test['DurationOfPitch']

## LivingCost
生活コストを表す数値

In [11]:
df_train['LivingCost'] = df_train['MonthlyIncome'] / (df_train['CityTier'].apply(make_LivingCost_CityTier) * (1 + df_train['Child'].apply(make_LivingCost_Child) + df_train['Marry'].apply(make_LivingCost_Marry) + df_train['Car'].apply(make_LivingCost_Car)))
df_test['LivingCost'] = df_test['MonthlyIncome'] / (df_test['CityTier'].apply(make_LivingCost_CityTier) * (1 + df_test['Child'].apply(make_LivingCost_Child) + df_test['Marry'].apply(make_LivingCost_Marry) + df_test['Car'].apply(make_LivingCost_Car)))

## EconomicStability
経済安定性指標

In [12]:
df_train['EconomicStability'] = df_train['MonthlyIncome'] * df_train['Occupation'].apply(make_EconomicStability_Occupation) * df_train['Designation'].apply(make_EconomicStability_Disignation)
df_test['EconomicStability'] = df_test['MonthlyIncome'] * df_test['Occupation'].apply(make_EconomicStability_Occupation) * df_test['Designation'].apply(make_EconomicStability_Disignation)

## NumberOfTrips_log

In [13]:
df_train['NumberOfTrips_log'] = np.log1p(df_train['NumberOfTrips'])
df_test['NumberOfTrips_log'] = np.log1p(df_test['NumberOfTrips'])

## TravelCost
総旅行費の推定

In [14]:
df_train['TravelCost'] = df_train['PreferredPropertyStar'].apply(make_TravelCost_PreferredPropertyStar) * (1 + df_train['NumberOfTrips_log'])
df_test['TravelCost'] = df_test['PreferredPropertyStar'].apply(make_TravelCost_PreferredPropertyStar) * (1 + df_test['NumberOfTrips_log'])

## EconomicSegment
経済力のセグメント

In [15]:
df_train['EconomicSegment'] = df_train['Occupation'] + df_train['Designation'] + df_train['CityTier'].astype(str)
df_test['EconomicSegment'] = df_test['Occupation'] + df_test['Designation'] + df_test['CityTier'].astype(str)

df_train = mapping_EconomicSegment(df_train)
df_test = mapping_EconomicSegment(df_test)

## PacakgeMatch
営業担当者がセールスした商品と、顧客の希望するホテルのランクがマッチしているかを表す数値

In [16]:
product_to_star = {
    'Basic': 3,
    'Standard': 3,
    'Deluxe': 4,
    'Super Deluxe': 5,
    'King': 5
}

# 製品と希望ホテルの星評価の適合度を計算
def calculate_fit(product, preferred_star):
    ideal_star = product_to_star[product]
    max_star_difference = 5  # 星評価の最大差（1星から5星）
    return max(0, 1 - abs(ideal_star - preferred_star) / max_star_difference)

# 新しい特徴量をデータフレームに追加
df_train['PackageMatch'] = df_train.apply(lambda row: calculate_fit(row['ProductPitched'], row['PreferredPropertyStar']), axis=1)
df_test['PackageMatch'] = df_test.apply(lambda row: calculate_fit(row['ProductPitched'], row['PreferredPropertyStar']), axis=1)

In [60]:
df_train[['PackageMatch', 'ProductPitched', 'PreferredPropertyStar']]

Unnamed: 0,PackageMatch,ProductPitched,PreferredPropertyStar
0,1.0,Basic,3.0
1,1.0,Standard,3.0
2,1.0,Basic,3.0
3,0.8,Standard,4.0
4,0.8,Basic,4.0
...,...,...,...
3484,1.0,Basic,3.0
3485,0.6,Basic,5.0
3486,1.0,Standard,3.0
3487,0.6,King,3.0


## IsFamily
家族か、そうでないか、2人以上ならば家族とする。

In [17]:
df_train['IsFamily'] = df_train.apply(lambda x: int(x['Marry'] == 'Married' or x['Child'] != '0_child'), axis=1)
df_test['IsFamily'] = df_test.apply(lambda x: int(x['Marry'] == 'Married' or x['Child'] != '0_child'), axis=1)

## FreaqencySeg
旅行頻度をセグメント化。RFMのF

In [18]:
df_train['FreaqencySeg'] = df_train['NumberOfTrips'].apply(make_TripFreaqency)
df_test['FreaqencySeg'] = df_test['NumberOfTrips'].apply(make_TripFreaqency)

## Monetary
旅行に費やした総費用。RFM分析のM

In [19]:
df_train['Monetary'] = df_train['MonthlyIncome'] * df_train['NumberOfTrips_log']
df_test['Monetary'] = df_test['MonthlyIncome'] * df_test['NumberOfTrips_log']

## MonetarySeg
Monetaryを区分で分割

In [20]:
df_train['MonetarySeg'] = df_train['Monetary'].apply(make_MonetarySeg)
df_test['MonetarySeg'] = df_test['Monetary'].apply(make_MonetarySeg)

## ContractRate_FM
RFMのFMの部分。Freaqency、Monetary毎の契約成功率

In [21]:
tmp = df_train.groupby(by=['FreaqencySeg', 'MonetarySeg'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_FM'})

In [23]:
df_train = df_train.merge(tmp, on=['FreaqencySeg', 'MonetarySeg'], how='left')
df_test = df_test.merge(tmp, on=['FreaqencySeg', 'MonetarySeg'], how='left')

## ContractRate_G1 (G: Group)
顧客セグメント毎の契約成功率。訓練データの集計データをテストデータに結合しているため、少し問題がある？

In [24]:
tmp = df_train.groupby(by=['AgeGroup', 'Gender', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G1'})

In [10]:
tmp

Unnamed: 0,AgeGroup,Gender,ProductPitched,ContractRate_G1
0,10s,female,Basic,0.666667
1,10s,female,Deluxe,0.0
2,10s,female,Standard,0.0
3,10s,male,Basic,0.6
4,10s,male,Deluxe,0.5
5,10s,male,Standard,0.0
6,20s,female,Basic,0.311594
7,20s,female,Deluxe,0.057143
8,20s,female,Standard,0.129032
9,20s,male,Basic,0.481818


In [25]:
df_train = df_train.merge(tmp, on=['AgeGroup', 'Gender', 'ProductPitched'], how='left')
df_test = df_test.merge(tmp, on=['AgeGroup', 'Gender', 'ProductPitched'], how='left')

## ContractRate_G2
年代、性別、都市層、パスポートの有無ごとの契約成功率

In [26]:
tmp = df_train.groupby(by=['AgeGroup', 'Gender', 'ProductPitched', 'Passport'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G2'})

In [13]:
tmp[:30]

Unnamed: 0,AgeGroup,Gender,ProductPitched,Passport,ContractRate_G2
0,10s,female,Basic,0,0.5
1,10s,female,Basic,1,1.0
2,10s,female,Deluxe,0,0.0
3,10s,female,Standard,1,0.0
4,10s,male,Basic,0,0.625
5,10s,male,Basic,1,0.5
6,10s,male,Deluxe,0,0.5
7,10s,male,Standard,0,0.0
8,20s,female,Basic,0,0.285714
9,20s,female,Basic,1,0.583333


In [27]:
df_train = df_train.merge(tmp, on=['AgeGroup', 'Gender', 'ProductPitched', 'Passport'], how='left')
df_test = df_test.merge(tmp, on=['AgeGroup', 'Gender', 'ProductPitched', 'Passport'], how='left')

## ContractRate_G3
上の分割方法から、性別を除いたもの

In [28]:
tmp = df_train.groupby(by=['AgeGroup', 'ProductPitched', 'Passport'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G3'})

In [15]:
tmp

Unnamed: 0,AgeGroup,ProductPitched,Passport,ContractRate_G3
0,10s,Basic,0,0.6
1,10s,Basic,1,0.666667
2,10s,Deluxe,0,0.333333
3,10s,Standard,0,0.0
4,10s,Standard,1,0.0
5,20s,Basic,0,0.383871
6,20s,Basic,1,0.625
7,20s,Deluxe,0,0.083333
8,20s,Deluxe,1,0.090909
9,20s,King,0,0.0


In [29]:
df_train = df_train.merge(tmp, on=['AgeGroup', 'ProductPitched', 'Passport'], how='left')
df_test = df_test.merge(tmp, on=['AgeGroup', 'ProductPitched', 'Passport'], how='left')

## ContractRate_G4
上の分割方法から、パスポートを除いたもの

In [30]:
tmp = df_train.groupby(by=['AgeGroup', 'ProductPitched'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G4'})

In [17]:
tmp

Unnamed: 0,AgeGroup,ProductPitched,ContractRate_G4
0,10s,Basic,0.615385
1,10s,Deluxe,0.333333
2,10s,Standard,0.0
3,20s,Basic,0.416201
4,20s,Deluxe,0.084034
5,20s,King,0.0
6,20s,Standard,0.168675
7,30s,Basic,0.26186
8,30s,Deluxe,0.043564
9,30s,King,0.043478


In [31]:
df_train = df_train.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')
df_test = df_test.merge(tmp, on=['AgeGroup', 'ProductPitched'], how='left')

## ContractRate_G5
役職、年代、パスポート

In [32]:
tmp = df_train.groupby(by=['Designation', 'AgeGroup', 'Passport'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G5'})

In [8]:
tmp

Unnamed: 0,Designation,AgeGroup,Passport,ContractRate_G5
0,AVP,10s,0,0.0
1,AVP,20s,0,0.3
2,AVP,30s,0,0.096774
3,AVP,30s,1,0.0
4,AVP,40s,0,0.044944
5,AVP,40s,1,0.090909
6,AVP,50s,0,0.065217
7,AVP,50s,1,0.222222
8,AVP,60s,0,0.0
9,AVP,60s,1,0.0


In [33]:
df_train = df_train.merge(tmp, on=['Designation', 'AgeGroup', 'Passport'], how='left')
df_test = df_test.merge(tmp, on=['Designation', 'AgeGroup', 'Passport'], how='left')

## ContractRate_G6
上からパスポート抜き

In [34]:
tmp = df_train.groupby(by=['Designation', 'AgeGroup'], as_index=False)['ProdTaken'].mean()
tmp = tmp.rename(columns={'ProdTaken': 'ContractRate_G6'})

In [6]:
tmp

Unnamed: 0,Designation,AgeGroup,ContractRate_G6
0,AVP,10s,0.0
1,AVP,20s,0.3
2,AVP,30s,0.09375
3,AVP,40s,0.05
4,AVP,50s,0.083333
5,AVP,60s,0.0
6,Executive,10s,0.615385
7,Executive,20s,0.422096
8,Executive,30s,0.266667
9,Executive,40s,0.232044


In [35]:
df_train = df_train.merge(tmp, on=['Designation', 'AgeGroup'], how='left')
df_test = df_test.merge(tmp, on=['Designation', 'AgeGroup'], how='left')

# 特徴量のデータ型

In [36]:
feature = [
    'Age',
    'TypeofContact',
    'CityTier',
    'DurationOfPitch',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'MonthlyIncome',
    'Marry',
    'Car',
    'Child',
    # 以下、作成特徴量
    'AgeGroup',
    'TypeofContactNULL',
    'Motivation',
    'EconomicPower',
    'Child01',
    'TripEasier',
    'SalesPerformance',
    'LivingCost',
    'EconomicStability',
    'TravelCost',
    'NumberOfTrips_log',
    'EconomicSegment',
    'PackageMatch',
    'IsFamily',
    'FreaqencySeg',
    'Monetary',
    'MonetarySeg',
    'ContractRate_FM',
    'ContractRate_G1',
    'ContractRate_G2',
    'ContractRate_G3',
    'ContractRate_G4',
    'ContractRate_G5',
    'ContractRate_G6'
]

In [37]:
float_columns = ['DurationOfPitch', 'MonthlyIncome', 'EconomicPower', 'SalesPerformance',
                'LivingCost', 'EconomicStability', 'TravelCost', 'NumberOfTrips_log',
                'PackageMatch', 'Monetary', 'ContractRate_FM', 'ContractRate_G1',
                'ContractRate_G2', 'ContractRate_G3', 'ContractRate_G4', 'ContractRate_G5',
                'ContractRate_G6']
int_columns = ['Age', 'CityTier', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar',
                'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'TypeofContactNULL',
                'Motivation', 'Child01', 'TripEasier', 'EconomicSegment',
                'IsFamily', 'FreaqencySeg', 'MonetarySeg']

In [39]:
for col in float_columns:
    df_train[col] = df_train[col].astype(float)
    # df_test[col] = df_test[col].astype(float)

for col in int_columns:
    df_train[col] = df_train[col].astype(int)
    # df_test[col] = df_test[col].astype(int)

# 確認

In [42]:
display(df_train.iloc[:5, 20:])
display(df_test.iloc[:5, 19:])

Unnamed: 0,AgeGroup,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,NumberOfTrips_log,...,Monetary,MonetarySeg,FreaqencySeg,ContractRate_FM,ContractRate_G1,ContractRate_G2,ContractRate_G3,ContractRate_G4,ContractRate_G5,ContractRate_G6
0,50s,0,14,32.35627,0,48,2.351729,9.572861,16.42703,1.791759,...,22.297944,3,1,0.141921,0.244444,0.5,0.454545,0.192308,0.454545,0.197368
1,50s,0,14,50.350356,0,8,2.375794,8.197046,18.461797,1.098612,...,14.183467,2,0,0.135781,0.108844,0.25,0.2,0.097046,0.195122,0.095238
2,30s,0,3,37.607704,0,24,1.875412,7.597516,16.54739,1.609438,...,20.175755,3,0,0.146226,0.212389,0.171569,0.223382,0.26186,0.228512,0.266667
3,30s,0,3,25.394244,0,8,2.147262,9.301921,14.855633,0.693147,...,8.800974,1,0,0.180328,0.032258,0.012048,0.058091,0.068441,0.055556,0.065693
4,40s,0,3,12.462403,0,48,1.731967,11.868956,12.337779,1.609438,...,20.057464,3,0,0.146226,0.084746,0.056604,0.185185,0.22905,0.186747,0.232044


Unnamed: 0,AgeGroup,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,NumberOfTrips_log,...,Monetary,MonetarySeg,FreaqencySeg,ContractRate_FM,ContractRate_G1,ContractRate_G2,ContractRate_G3,ContractRate_G4,ContractRate_G5,ContractRate_G6
0,40s,0,4,34.102241,0,0,1.801646,8.773409,16.526471,2.079442,...,27.274468,3,1,0.141921,0.0,0.0,0.0,0.0,0.044944,0.05
1,30s,0,14,25.223082,0,8,1.823531,8.81926,14.755503,1.609438,...,20.297492,3,0,0.146226,0.032258,0.2,0.181818,0.068441,0.181818,0.065693
2,20s,0,4,37.405322,0,24,1.430065,8.312294,15.086813,0.693147,...,8.642465,1,0,0.180328,0.311594,0.285714,0.383871,0.416201,0.391447,0.422096
3,20s,0,4,32.416696,0,24,1.489417,9.134037,17.829183,0.693147,...,8.642131,1,0,0.180328,0.481818,0.451087,0.383871,0.416201,0.108108,0.154762
4,40s,0,4,48.757169,0,48,1.985889,7.937675,15.127224,0.693147,...,8.665614,1,0,0.180328,0.3,0.247706,0.185185,0.22905,0.186747,0.232044


# CSV出力

In [40]:
if 'test' in df_train.columns:
    df_train = df_train.drop(columns='test', axis=1)
    df_test = df_test.drop(columns='test', axis=1)

In [41]:
df_train.to_csv('data/feature_engineered/null_representative/train_null_del_beta.csv', index=False)
# df_test.to_csv('data/feature_engineered/null_cat/test_null_cat.csv', index=False)