# 予測対象のデータの前処理

In [259]:
# Load libraries
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load project utilities
from my_utils import *

In [260]:
# Load the prediction-target (test) data and preview its first rows.
test = pd.read_csv("../data/test.csv")
test.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,customer_info
0,3489,４８歳,Self Enquiry,2,13分,Small Business,Male,1.0,4.0,Super De|uxe,3.0,7,0,3,AVP,496950.0,結婚済み 車所持 子供なし
1,3490,30代,Self Enquiry,2,12分,Small Business,Ｆｅｍａｌｅ,1.0,4.0,Standard,3.0,4,1,3,Senior Manager,月収30.0万円,結婚済み、車未所持、子供なし
2,3491,25歳,Self Enquiry,1,540秒,Salaried,Female,1.0,4.0,Basic,3.0,1,0,3,Executive,月収26.0万円,離婚済み、自動車未所有、子供なし
3,3492,21歳,Company Invited,2,420秒,Salaried,Male,1.0,4.0,Basic,4.0,1,0,3,Senior Manager,259875.0,離婚済み、自動車所有、子供なし
4,3493,41歳,Company Invited,1,7分,Salaried,MALE,1.0,4.0,Basic,3.0,1,0,4,Executive,268830.0,独身/車所持／子供なし


In [261]:
# Count missing values per column in the raw test data.
test.isnull().sum()

id                          0
Age                        93
TypeofContact              12
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          24
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              42
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
dtype: int64

In [262]:
# Work on a copy so the raw `test` frame stays untouched; every engineered
# column is added to this copy. The final feature table (`test_preprocessed`)
# is built from it once all columns exist, so no empty placeholder is created here.
test_copied = test.copy()

In [263]:
# Inspect the raw Age strings. The recorded output shows several notations
# (e.g. "33歳", "30代", full-width "５２才", kanji "四十四歳"), so the column needs parsing.
test_copied["Age"].value_counts()

Age
30代     108
33歳      98
32歳      93
36歳      93
37歳      91
       ... 
５２才       1
19才       1
四十四歳      1
59才       1
５４才       1
Name: count, Length: 236, dtype: int64

In [264]:
# Age preprocessing: parse the free-form age strings with convert_age_to_int and
# store the result as a new AgeInt column
# (recorded output: "４８歳" -> 48.0, "30代" -> 35.0).
age_int = test_copied["Age"].apply(convert_age_to_int)
test_copied["AgeInt"] = age_int


In [265]:
# Inspect the TypeofContact labels before encoding them
# (two distinct labels in the recorded output).
test_copied["TypeofContact"].value_counts()

TypeofContact
Self Enquiry       2250
Company Invited    1227
Name: count, dtype: int64

In [266]:
# Encode TypeofContact as an integer code.
type_of_contact_mapping = {
    "Self Enquiry": 0,
    "Company Invited": 1,
}
type_of_contact_int = test_copied["TypeofContact"].map(type_of_contact_mapping)

# Series.map turns every label that is missing from the dict into NaN, and NaNs
# are mean-imputed later in this notebook, so a misspelled or new label would
# disappear silently. Genuinely missing inputs may stay NaN; unknown labels
# must fail loudly instead.
unmapped_contact_labels = test_copied.loc[
    type_of_contact_int.isna() & test_copied["TypeofContact"].notna(),
    "TypeofContact",
].unique()
assert len(unmapped_contact_labels) == 0, (
    f"Unmapped TypeofContact labels: {unmapped_contact_labels.tolist()}"
)

test_copied.loc[:, "TypeofContactInt"] = type_of_contact_int

In [267]:
# Inspect CityTier. The recorded output shows only the integers 1-3, and the
# column is carried into the final feature table unchanged.
test_copied["CityTier"].value_counts()

CityTier
2    1564
1    1516
3     409
Name: count, dtype: int64

In [268]:
# Inspect the raw DurationOfPitch strings. The recorded output mixes minutes
# ("9分") and seconds ("1800秒"), so the unit has to be unified.
test_copied["DurationOfPitch"].value_counts()

DurationOfPitch
9分       251
8分       236
15分      216
16分      206
14分      191
        ... 
1800秒      7
1620秒      5
1680秒      3
240秒       1
2160秒      1
Name: count, Length: 65, dtype: int64

In [269]:
# DurationOfPitch preprocessing: convert the duration strings to numbers with
# convert_duration_to_int and store them as DurationInt
# (recorded output: "13分" -> 780.0 and "540秒" -> 540.0, i.e. seconds).
duration_int = test_copied["DurationOfPitch"].apply(convert_duration_to_int)
test_copied["DurationInt"] = duration_int

In [270]:
# Inspect the Occupation labels before encoding them
# (three distinct labels in the recorded output).
test_copied["Occupation"].value_counts()

Occupation
Small Business    1729
Salaried          1400
Large Business     360
Name: count, dtype: int64

In [271]:
# Encode Occupation as an integer code.
occupation_mapping = {
    "Salaried": 0,
    "Small Business": 1,
    "Large Business": 2,
}
occupation_int = test_copied["Occupation"].map(occupation_mapping)

# Series.map turns every label that is missing from the dict into NaN, and NaNs
# are mean-imputed later in this notebook, so a misspelled or new label would
# disappear silently. Genuinely missing inputs may stay NaN; unknown labels
# must fail loudly instead.
unmapped_occupation_labels = test_copied.loc[
    occupation_int.isna() & test_copied["Occupation"].notna(),
    "Occupation",
].unique()
assert len(unmapped_occupation_labels) == 0, (
    f"Unmapped Occupation labels: {unmapped_occupation_labels.tolist()}"
)

test_copied.loc[:, "OccupationInt"] = occupation_int

In [272]:
# Inspect the raw Gender labels. The recorded output shows many spellings of
# the same labels (upper/lower case, full-width characters, embedded spaces).
test_copied["Gender"].value_counts()

Gender
Male       1299
Female      702
male        472
female      238
MALE        180
Ｍａｌｅ        134
Fe Male     104
FEMALE       98
Ｆｅｍａｌｅ       92
ｍａｌｅ         49
ｆｅｍａｌｅ       29
fe male      28
ＭＡＬＥ         20
FE MALE      20
Ｆｅ　Ｍａｌｅ      11
ＦＥＭＡＬＥ        8
ｆｅ　ｍａｌｅ       5
Name: count, dtype: int64

In [273]:
# Collapse the Gender spellings into an integer code with convert_gender_to_int
# (recorded output: "Male"/"MALE" -> 0, "Female"/"Ｆｅｍａｌｅ" -> 1).
gender_int = test_copied["Gender"].map(convert_gender_to_int)
test_copied["GenderInt"] = gender_int

In [274]:
# Inspect NumberOfPersonVisiting. The recorded output is already numeric (1.0-4.0),
# and the column is carried into the final feature table unchanged.
test_copied["NumberOfPersonVisiting"].value_counts()

NumberOfPersonVisiting
2.0    1555
3.0    1396
1.0     350
4.0     188
Name: count, dtype: int64

In [275]:
# Inspect NumberOfFollowups. Besides 1-6, the recorded output contains the
# values 100-500, which look like the same counts on a different scale.
test_copied["NumberOfFollowups"].value_counts()

NumberOfFollowups
4.0      1350
3.0      1257
5.0       594
2.0       104
1.0        99
6.0        31
400.0      16
300.0       8
500.0       4
100.0       1
200.0       1
Name: count, dtype: int64

In [276]:
# Bring NumberOfFollowups onto a single scale with normalize_number_of_followups,
# then re-check the distribution. In the recorded output the 100-500 values are
# merged into 1-5 (e.g. 4.0: 1350 + 16 = 1366).
number_of_followups_normalized = test_copied["NumberOfFollowups"].map(
    normalize_number_of_followups
)
test_copied["FollowupsNormalized"] = number_of_followups_normalized
test_copied["FollowupsNormalized"].value_counts()

FollowupsNormalized
4.0    1366
3.0    1265
5.0     598
2.0     105
1.0     100
6.0      31
Name: count, dtype: int64

In [277]:
# Inspect the raw ProductPitched labels. The recorded output has 71 distinct
# spellings, including look-alike characters from other scripts.
test_copied["ProductPitched"].value_counts()

ProductPitched
Basic           923
Deluxe          824
Standard        591
Super Deluxe    233
King            126
               ... 
BASIС             1
𐊡asic             1
Вasic             1
basiс             1
BAЅIC             1
Name: count, Length: 71, dtype: int64

In [278]:
# Map the noisy ProductPitched labels to integer codes with
# convert_product_pitched_to_int
# (recorded output: "Basic" -> 0, "Standard" -> 1, "Super De|uxe" -> 3).
product_pitched_int = test_copied["ProductPitched"].map(
    convert_product_pitched_to_int
)
test_copied["ProductPitchedInt"] = product_pitched_int

In [279]:
# Inspect PreferredPropertyStar. The recorded output is already numeric (3.0-5.0),
# and the column is carried into the final feature table unchanged.
test_copied["PreferredPropertyStar"].value_counts()

PreferredPropertyStar
3.0    2181
4.0    1109
5.0     199
Name: count, dtype: int64

In [280]:
# Inspect the raw NumberOfTrips values. The recorded output mixes plain numbers
# with Japanese frequency phrases such as "年に2回" (twice a year),
# "半年に1回" (once every half year) and "四半期に1回" (once a quarter).
test_copied["NumberOfTrips"].value_counts()

NumberOfTrips
2         950
3         762
5         400
1         320
7         226
年に2回      153
4         136
6         125
年に3回      124
年に1回       72
年に5回       62
年に4回       31
年に7回       31
年に6回       26
8          12
半年に1回      10
年に8回        4
四半期に1回      3
Name: count, dtype: int64

In [281]:
# Convert the mixed NumberOfTrips notations to numbers with convert_trips_to_int
# (recorded output: "7" -> 7.0).
number_of_trips_int = test_copied["NumberOfTrips"].map(convert_trips_to_int)
test_copied["NumberOfTripsInt"] = number_of_trips_int

In [282]:
# Inspect Passport. The recorded output is a 0/1 flag, and the column is
# carried into the final feature table unchanged.
test_copied["Passport"].value_counts()

Passport
0    3157
1     332
Name: count, dtype: int64

In [283]:
# Inspect PitchSatisfactionScore. The recorded output holds the integers 1-5,
# and the column is carried into the final feature table unchanged.
test_copied["PitchSatisfactionScore"].value_counts()

PitchSatisfactionScore
2    1231
1     753
4     713
3     652
5     140
Name: count, dtype: int64

In [284]:
# Inspect the raw Designation labels. The recorded output contains many
# spellings that use look-alike characters (Greek, Cyrillic, full-width, ...).
test_copied["Designation"].value_counts()

Designation
Executive         1129
Manager           1016
Senior Manager     740
AVP                305
VP                 142
Exеcutivе           10
Managеr             10
Executivе            9
Exеcutive            9
Mαnager              9
Manαger              8
ΑVP                  8
АVP                  7
Executiѵe            7
Senior Managеr       6
Senio𝙧 Manager       5
Еxecutive            5
Manage𝙧              5
Μanager              5
Sеnior Manager       4
VＰ                   3
Execｕtive            3
Senior Manage𝙧       3
Senior Μanager       2
Ѕenior Manager       2
Senior Manαger       2
AVＰ                  2
Տenior Manager       2
E×ecutive            2
Μαnager              2
Mαnαger              2
Execｕtivе            1
Sеnio𝙧 Manager       1
Exеcｕtivе            1
Mαnαgеr              1
Sеnior Managеr       1
АVＰ                  1
Senior Mαnαger       1
Senior Managе𝙧       1
Mαnagеr              1
Еxecｕtive            1
Senio𝙧 Manage𝙧       1
Senio𝙧 Mαnage𝙧       1

In [285]:
# Map the noisy Designation labels to integer codes with convert_designation_to_int
# (recorded output: "Executive" -> 0, "Senior Manager" -> 2, "AVP" -> 3),
# then check how many rows land on each code.
# NOTE(review): in the recorded outputs the listed AVP spellings add up to 323 rows
# and the listed VP spellings to 145, yet code 3 has 307 rows and code 4 has 162.
# The raw listing appears truncated, so this is not conclusive, but it suggests that
# AVP variants written with a look-alike first letter ("ΑVP", "АVP") may be coded
# as VP (presumably code 4) - confirm in my_utils.
designation_pitched_int = test_copied["Designation"].map(convert_designation_to_int)
test_copied["DesignationInt"] = designation_pitched_int
test_copied["DesignationInt"].value_counts()

DesignationInt
0    1180
1    1059
2     781
3     307
4     162
Name: count, dtype: int64

In [286]:
# Inspect the raw MonthlyIncome values. The recorded output mixes plain numbers
# ("247425.0") with strings such as "月収30.0万円" (monthly income in units of 10,000 yen).
test_copied["MonthlyIncome"].value_counts()

MonthlyIncome
月収30.0万円    203
月収40.0万円    127
月収50.0万円     45
月収26.0万円     38
月収32.0万円     35
           ... 
247425.0      1
312990.0      1
240735.0      1
255000.0      1
411795.0      1
Name: count, Length: 2528, dtype: int64

In [287]:
# Convert MonthlyIncome to a plain number with convert_income_to_int
# (recorded output: "月収30.0万円" -> 300000.0, "496950.0" -> 496950.0).
monthly_income_int = test_copied["MonthlyIncome"].map(convert_income_to_int)
test_copied["MonthlyIncomeInt"] = monthly_income_int


In [288]:
# Pull the marital status out of the free-text customer_info column
# (recorded output: "独身" -> 0, "結婚済み" -> 1, "離婚済み" -> 2) and check the counts.
customer_info_kekkon_int = test_copied["customer_info"].map(
    extract_marital_from_customer_info
)
test_copied["MaritalInt"] = customer_info_kekkon_int
test_copied["MaritalInt"].value_counts()

MaritalInt
1    1462
0    1217
2     810
Name: count, dtype: int64

In [289]:
# Pull the car-ownership flag out of the free-text customer_info column
# (recorded output: "車所持"/"自動車所有" -> 1, "車未所持"/"自動車未所有" -> 0)
# and check the counts.
customer_info_car_int = test_copied["customer_info"].map(
    extract_car_from_customer_info
)
test_copied["CarInt"] = customer_info_car_int
test_copied["CarInt"].value_counts()

CarInt
0    1834
1    1655
Name: count, dtype: int64

In [290]:
# Pull the number of children out of the free-text customer_info column
# (recorded output: "子供なし" -> 0) and check the counts.
# NOTE(review): the recorded counts contain 37 rows with -1, presumably a sentinel
# for "could not be determined". It is not NaN, so the mean imputation later in
# this notebook leaves it as -1 - confirm this is intended.
customer_info_child_int = test_copied["customer_info"].map(
    extract_child_from_customer_info
)
test_copied["ChildInt"] = customer_info_child_int
test_copied["ChildInt"].value_counts()

ChildInt
 1    1564
 0     924
 2     897
 3      67
-1      37
Name: count, dtype: int64

In [291]:
# Assemble the final table: the id plus one numeric column per feature.
# The raw text columns stay behind in test_copied.
test_preprocessed = test_copied[
    [
        "id",
        "AgeInt",
        "TypeofContactInt",
        "CityTier",
        "DurationInt",
        "OccupationInt",
        "GenderInt",
        "NumberOfPersonVisiting",
        "FollowupsNormalized",
        "ProductPitchedInt",
        "PreferredPropertyStar",
        "NumberOfTripsInt",
        "Passport",
        "PitchSatisfactionScore",
        "DesignationInt",
        "MonthlyIncomeInt",
        "MaritalInt",
        "CarInt",
        "ChildInt",
    ]
]
test_preprocessed.head()

Unnamed: 0,id,AgeInt,TypeofContactInt,CityTier,DurationInt,OccupationInt,GenderInt,NumberOfPersonVisiting,FollowupsNormalized,ProductPitchedInt,PreferredPropertyStar,NumberOfTripsInt,Passport,PitchSatisfactionScore,DesignationInt,MonthlyIncomeInt,MaritalInt,CarInt,ChildInt
0,3489,48.0,0.0,2,780.0,1,0,1.0,4.0,3,3.0,7.0,0,3,3,496950.0,1,1,0
1,3490,35.0,0.0,2,720.0,1,1,1.0,4.0,1,3.0,4.0,1,3,2,300000.0,1,0,0
2,3491,25.0,0.0,1,540.0,0,1,1.0,4.0,0,3.0,1.0,0,3,0,260000.0,2,0,0
3,3492,21.0,1.0,2,420.0,0,0,1.0,4.0,0,4.0,1.0,0,3,2,259875.0,2,1,0
4,3493,41.0,1.0,1,420.0,0,0,1.0,4.0,0,3.0,1.0,0,4,0,268830.0,0,1,0


In [292]:
# Check which feature columns still contain missing values before imputation.
test_preprocessed.isnull().sum()

id                          0
AgeInt                     93
TypeofContactInt           12
CityTier                    0
DurationInt               131
OccupationInt               0
GenderInt                   0
NumberOfPersonVisiting      0
FollowupsNormalized        24
ProductPitchedInt           0
PreferredPropertyStar       0
NumberOfTripsInt           42
Passport                    0
PitchSatisfactionScore      0
DesignationInt              0
MonthlyIncomeInt           49
MaritalInt                  0
CarInt                      0
ChildInt                    0
dtype: int64

In [293]:
# Impute missing values: every NaN is replaced by the mean of its own column.
# NOTE(review): this also fills the 12 missing TypeofContactInt codes with the
# column mean (a fractional value between the two category codes), and the means
# come from the test set itself - confirm both match how the training data was imputed.
test_preprocessed = test_preprocessed.fillna(value=test_preprocessed.mean(axis=0))

In [294]:
# Confirm that no missing values remain after imputation.
test_preprocessed.isnull().sum()

id                        0
AgeInt                    0
TypeofContactInt          0
CityTier                  0
DurationInt               0
OccupationInt             0
GenderInt                 0
NumberOfPersonVisiting    0
FollowupsNormalized       0
ProductPitchedInt         0
PreferredPropertyStar     0
NumberOfTripsInt          0
Passport                  0
PitchSatisfactionScore    0
DesignationInt            0
MonthlyIncomeInt          0
MaritalInt                0
CarInt                    0
ChildInt                  0
dtype: int64

In [295]:
# Write the preprocessed test features to disk (without the index column).
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this last cell fails on a fresh checkout.
output_path = Path("../mid_output/test_preprocessed.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
test_preprocessed.to_csv(output_path, index=False)