In [2]:
import os
os.chdir('../../')

In [3]:
import numpy as np
import pandas as pd

from scr.util import *
from scr.feature_engineering.engineering import *

In [69]:
df_train = pd.read_csv('data/feature_engineered/null_cat/train_null_cat.csv')
df_test = pd.read_csv('data/feature_engineered/null_cat/test_null_cat.csv')

In [71]:
display(df_train.iloc[:5, 21:])
display(df_test.iloc[:5, 20:])

Unnamed: 0,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,TripFreaqency,NumberOfTrips_log,TravelCost,EconomicSegment,PackageMatch,IsFamily
0,0,14,32.35627,0,48,2.351729,9.572861,16.42703,1,1.791759,2.791759,0,1.0,0
1,0,14,50.350356,0,8,2.375794,8.197046,18.461797,0,1.098612,2.098612,1,1.0,0
2,0,3,37.607704,0,24,1.875412,7.597516,16.54739,0,1.609438,2.609438,2,1.0,1
3,0,3,25.394244,0,8,2.147262,9.301921,14.855633,0,0.693147,2.539721,3,0.8,0
4,0,3,12.462403,0,48,1.731967,11.868956,12.337779,0,1.609438,3.914157,4,0.8,0


Unnamed: 0,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,TripFreaqency,NumberOfTrips_log,TravelCost,EconomicSegment,PackageMatch,IsFamily
0,0,4,34.102241,0,0,1.801646,8.773409,16.526471,1,2.079442,3.079442,10,0.6,1
1,0,14,25.223082,0,8,1.823531,8.81926,14.755503,0,1.609438,2.609438,3,1.0,1
2,0,4,37.405322,0,24,1.430065,8.312294,15.086813,0,0.693147,1.693147,13,1.0,0
3,0,4,32.416696,0,24,1.489417,9.134037,17.829183,0,0.693147,2.539721,20,0.8,0
4,0,4,48.757169,0,48,1.985889,7.937675,15.127224,0,0.693147,1.693147,13,1.0,0


# 特徴量作成

## TypeOfContactNuLL
TypeOfContact が欠損値の場合を、明らかに学習させるための特徴量

In [47]:
df_train['TypeofContactNULL'] = df_train['TypeofContact'].apply(make_TypeOfContactNULL)
df_test['TypeofContactNULL'] = df_test['TypeofContact'].apply(make_TypeOfContactNULL)

## Motivation
顧客の旅行に対する意欲を表す特徴量

In [48]:
df_train['Motivation'] = df_train['NumberOfPersonVisiting'] * df_train['NumberOfFollowups'] + (df_train['Passport'] * 10)
df_test['Motivation'] = df_test['NumberOfPersonVisiting'] * df_test['NumberOfFollowups'] + (df_test['Passport'] * 10)

## EconomicPower
顧客の経済力を表す特徴量

In [49]:
df_train['EconomicPower'] = df_train['MonthlyIncome'] * (4 - df_train['CityTier']) * df_train['Gender'].apply(make_motivation_gender)
df_test['EconomicPower'] = df_test['MonthlyIncome'] * (4 - df_test['CityTier']) * df_test['Gender'].apply(make_motivation_gender)

## Child01
顧客に子どもがいるかどうかを表す特徴量

In [50]:
df_train['Child01'] = df_train['Child'].apply(make_child01)
df_test['Child01'] = df_test['Child'].apply(make_child01)

## TripEasier
旅行の行きやすさを表す特徴量

In [51]:
df_train['TripEasier'] = (5 - df_train['NumberOfPersonVisiting']) * df_train['ProductPitched'].apply(make_TripEasier_ProductPitched) * df_train['Marry'].apply(make_TripEasier_Marry) * df_train['Child'].apply(make_TripEasier_Child)
df_test['TripEasier'] = (5 - df_test['NumberOfPersonVisiting']) * df_test['ProductPitched'].apply(make_TripEasier_ProductPitched) * df_test['Marry'].apply(make_TripEasier_Marry) * df_test['Child'].apply(make_TripEasier_Child)

## SalesPerformance
営業担当者の単位時間当たりのパフォーマンス

In [52]:
df_train['SalesPerformance'] = df_train['NumberOfFollowups'] * df_train['PitchSatisfactionScore'] / df_train['DurationOfPitch']
df_test['SalesPerformance'] = df_train['NumberOfFollowups'] * df_test['PitchSatisfactionScore'] / df_test['DurationOfPitch']

## LivingCost
生活コストを表す数値

In [53]:
df_train['LivingCost'] = df_train['MonthlyIncome'] / (df_train['CityTier'].apply(make_LivingCost_CityTier) * (1 + df_train['Child'].apply(make_LivingCost_Child) + df_train['Marry'].apply(make_LivingCost_Marry) + df_train['Car'].apply(make_LivingCost_Car)))
df_test['LivingCost'] = df_test['MonthlyIncome'] / (df_test['CityTier'].apply(make_LivingCost_CityTier) * (1 + df_test['Child'].apply(make_LivingCost_Child) + df_test['Marry'].apply(make_LivingCost_Marry) + df_test['Car'].apply(make_LivingCost_Car)))

## EconomicStability
経済安定性指標

In [54]:
df_train['EconomicStability'] = df_train['MonthlyIncome'] * df_train['Occupation'].apply(make_EconomicStability_Occupation) * df_train['Designation'].apply(make_EconomicStability_Disignation)
df_test['EconomicStability'] = df_test['MonthlyIncome'] * df_test['Occupation'].apply(make_EconomicStability_Occupation) * df_test['Designation'].apply(make_EconomicStability_Disignation)

## TripFreaqency
旅行頻度

In [55]:
df_train['TripFreaqency'] = df_train['NumberOfTrips'].apply(make_TripFreaqency)
df_test['TripFreaqency'] = df_test['NumberOfTrips'].apply(make_TripFreaqency)

## NumberOfTrips_log

In [56]:
df_train['NumberOfTrips_log'] = np.log1p(df_train['NumberOfTrips'])
df_test['NumberOfTrips_log'] = np.log1p(df_test['NumberOfTrips'])

## TravelCost
総旅行費の推定

In [57]:
df_train['TravelCost'] = df_train['PreferredPropertyStar'].apply(make_TravelCost_PreferredPropertyStar) * (1 + df_train['NumberOfTrips_log'])
df_test['TravelCost'] = df_test['PreferredPropertyStar'].apply(make_TravelCost_PreferredPropertyStar) * (1 + df_test['NumberOfTrips_log'])

## EconomicSegment
経済力のセグメント

In [58]:
df_train['EconomicSegment'] = df_train['Occupation'] + df_train['Designation'] + df_train['CityTier'].astype(str)
df_test['EconomicSegment'] = df_test['Occupation'] + df_test['Designation'] + df_test['CityTier'].astype(str)

df_train = mapping_EconomicSegment(df_train)
df_test = mapping_EconomicSegment(df_test)

## PacakgeMatch
営業担当者がセールスした商品と、顧客の希望するホテルのランクがマッチしているかを表す数値

In [59]:
product_to_star = {
    'Basic': 3,
    'Standard': 3,
    'Deluxe': 4,
    'Super Deluxe': 5,
    'King': 5
}

# 製品と希望ホテルの星評価の適合度を計算
def calculate_fit(product, preferred_star):
    ideal_star = product_to_star[product]
    max_star_difference = 5  # 星評価の最大差（1星から5星）
    return max(0, 1 - abs(ideal_star - preferred_star) / max_star_difference)

# 新しい特徴量をデータフレームに追加
df_train['PackageMatch'] = df_train.apply(lambda row: calculate_fit(row['ProductPitched'], row['PreferredPropertyStar']), axis=1)
df_test['PackageMatch'] = df_test.apply(lambda row: calculate_fit(row['ProductPitched'], row['PreferredPropertyStar']), axis=1)

In [60]:
df_train[['PackageMatch', 'ProductPitched', 'PreferredPropertyStar']]

Unnamed: 0,PackageMatch,ProductPitched,PreferredPropertyStar
0,1.0,Basic,3.0
1,1.0,Standard,3.0
2,1.0,Basic,3.0
3,0.8,Standard,4.0
4,0.8,Basic,4.0
...,...,...,...
3484,1.0,Basic,3.0
3485,0.6,Basic,5.0
3486,1.0,Standard,3.0
3487,0.6,King,3.0


## Family
家族か、そうでないか、2人以上ならば家族とする。

In [61]:
df_train['IsFamily'] = df_train.apply(lambda x: int(x['Marry'] == 'Married' or x['Child'] != '0_child'), axis=1)
df_test['IsFamily'] = df_test.apply(lambda x: int(x['Marry'] == 'Married' or x['Child'] != '0_child'), axis=1)

## IncomeGrouped
顧客セグメント毎の1人当たりの収入（年齢、性別、都市層）

In [40]:
tmp_train = df_train.groupby(by=['AgeGroup', 'Gender', 'CityTier']).agg(
    count = ('id', 'count'),
    total = ('MonthlyIncome', 'sum')
)
tmp_train['IncomeGrouped'] = tmp_train['total'] / tmp_train['count']

tmp_test = df_test.groupby(by=['AgeGroup', 'Gender', 'CityTier']).agg(
    count = ('id', 'count'),
    total = ('MonthlyIncome', 'sum')
)
tmp_test['IncomeGrouped'] = tmp_test['total'] / tmp_test['count']

In [42]:
df_train['MonthlyIncome']

0       253905.0
1       404475.0
2       278145.0
3       326805.0
4       258435.0
          ...   
3484    258900.0
3485    260415.0
3486    317340.0
3487    527910.0
3488    278190.0
Name: MonthlyIncome, Length: 3489, dtype: float64

In [41]:
tmp_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,total,IncomeGrouped
AgeGroup,Gender,CityTier,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10s,female,1,2,567010.0,283505.0
10s,female,2,2,676845.0,338422.5
10s,female,3,1,263040.0,263040.0
10s,male,1,6,1814975.0,302495.833333
10s,male,2,7,2072660.0,296094.285714
20s,female,1,65,19208190.0,295510.655086
20s,female,2,96,29637660.0,308725.600416
20s,female,3,43,13158920.0,306021.452611
20s,male,1,111,34382020.0,309747.882883
20s,male,2,211,67523690.0,320017.488152


# 特徴量のデータ型

In [130]:
feature = [
    'Age',
    'TypeofContact',
    'CityTier',
    'DurationOfPitch',
    'Occupation',
    'Gender',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'ProductPitched',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'MonthlyIncome',
    'Marry',
    'Car',
    'Child',
    # 以下、作成特徴量
    'AgeGroup',
    'TypeofContactNULL',
    'Motivation',
    'EconomicPower',
    'Child01',
    'TripEasier',
    'SalesPerformance',
    'LivingCost',
    'EconomicStability',
    'TripFreaqency',
    'TravelCost',
    'NumberOfTrips_log',
    'EconomicSegment',
    'PackageMatch',
    'IsFamily'
]

In [62]:
float_columns = ['DurationOfPitch', 'MonthlyIncome', 'EconomicPower', 'SalesPerformance',
                'LivingCost', 'EconomicStability', 'TravelCost', 'NumberOfTrips_log',
                'PackageMatch']
int_columns = ['Age', 'CityTier', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar',
                'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'TypeofContactNULL',
                'Motivation', 'Child01', 'TripEasier', 'TripFreaqency', 'EconomicSegment',
                'IsFamily']

In [63]:
for col in float_columns:
    df_train[col] = df_train[col].astype(float)
    df_test[col] = df_test[col].astype(float)

for col in int_columns:
    df_train[col] = df_train[col].astype(int)
    df_test[col] = df_test[col].astype(int)

# 確認

In [64]:
display(df_train.iloc[:5, 20:])
display(df_test.iloc[:5, 19:])

Unnamed: 0,AgeGroup,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,TripFreaqency,NumberOfTrips_log,TravelCost,EconomicSegment,PackageMatch,IsFamily
0,50s,0,14,32.35627,0,48,2.351729,9.572861,16.42703,1,1.791759,2.791759,0,1.0,0
1,50s,0,14,50.350356,0,8,2.375794,8.197046,18.461797,0,1.098612,2.098612,1,1.0,0
2,30s,0,3,37.607704,0,24,1.875412,7.597516,16.54739,0,1.609438,2.609438,2,1.0,1
3,30s,0,3,25.394244,0,8,2.147262,9.301921,14.855633,0,0.693147,2.539721,3,0.8,0
4,40s,0,3,12.462403,0,48,1.731967,11.868956,12.337779,0,1.609438,3.914157,4,0.8,0


Unnamed: 0,AgeGroup,TypeofContactNULL,Motivation,EconomicPower,Child01,TripEasier,SalesPerformance,LivingCost,EconomicStability,TripFreaqency,NumberOfTrips_log,TravelCost,EconomicSegment,PackageMatch,IsFamily
0,40s,0,4,34.102241,0,0,1.801646,8.773409,16.526471,1,2.079442,3.079442,10,0.6,1
1,30s,0,14,25.223082,0,8,1.823531,8.81926,14.755503,0,1.609438,2.609438,3,1.0,1
2,20s,0,4,37.405322,0,24,1.430065,8.312294,15.086813,0,0.693147,1.693147,13,1.0,0
3,20s,0,4,32.416696,0,24,1.489417,9.134037,17.829183,0,0.693147,2.539721,20,0.8,0
4,40s,0,4,48.757169,0,48,1.985889,7.937675,15.127224,0,0.693147,1.693147,13,1.0,0


# CSV出力

In [65]:
if 'test' in df_train.columns:
    df_train = df_train.drop(columns='test', axis=1)
    df_test = df_test.drop(columns='test', axis=1)

In [68]:
df_train.to_csv('data/feature_engineered/null_cat/train_null_cat.csv', index=False)
df_test.to_csv('data/feature_engineered/null_cat/test_null_cat.csv', index=False)