# Make ctgr

【内容】
- data_cleaningで精緻化したデータを使用
- 数値データをカテゴリ化

In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

## Setting

In [2]:
# Directory that holds the cleaned input CSVs; the two input file paths
# below are built from it so the location is defined in one place.
path_input = "../data/input/"
path_train = path_input + "cleaned_train.csv"
path_test = path_input + "cleaned_test.csv"

## Data read

In [3]:
# Load the cleaned training and test CSVs into DataFrames, in that order.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (path_train, path_test)
)

In [4]:
# Preview the first rows of the training data as a sanity check on the load.
train_df.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,ProdTaken,customer_marriage,customer_car,customer_child
0,0,50.0,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,basic,3.0,5.0,1,4,executive,253905.0,1,未婚,自動車未所有,子供なし
1,1,56.0,Company Invited,1,840.0,Salaried,Male,1.0,4.0,standard,3.0,2.0,1,4,senior manager,404475.0,0,離婚済み,自動車所有,子供なし
2,2,,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,basic,3.0,4.0,0,4,executive,278145.0,1,結婚済み,自動車未所有,子供なし
3,3,37.0,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,standard,4.0,1.0,0,5,senior manager,326805.0,0,離婚済み,自動車所有,子供なし
4,4,48.0,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,basic,4.0,4.0,0,4,executive,258435.0,1,独身,自動車所有,子供なし


## データ確認

In [5]:
# Print column dtypes and non-null counts for the training data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   Age                     3389 non-null   float64
 2   TypeofContact           3483 non-null   object 
 3   CityTier                3489 non-null   int64  
 4   DurationOfPitch         3368 non-null   float64
 5   Occupation              3489 non-null   object 
 6   Gender                  3489 non-null   object 
 7   NumberOfPersonVisiting  3489 non-null   float64
 8   NumberOfFollowups       3456 non-null   float64
 9   ProductPitched          3489 non-null   object 
 10  PreferredPropertyStar   3489 non-null   float64
 11  NumberOfTrips           3467 non-null   float64
 12  Passport                3489 non-null   int64  
 13  PitchSatisfactionScore  3489 non-null   int64  
 14  Designation             3489 non-null   

In [6]:
# Print column dtypes and non-null counts for the test data.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   Age                     3396 non-null   float64
 2   TypeofContact           3477 non-null   object 
 3   CityTier                3489 non-null   int64  
 4   DurationOfPitch         3358 non-null   float64
 5   Occupation              3489 non-null   object 
 6   Gender                  3489 non-null   object 
 7   NumberOfPersonVisiting  3489 non-null   float64
 8   NumberOfFollowups       3465 non-null   float64
 9   ProductPitched          3489 non-null   object 
 10  PreferredPropertyStar   3489 non-null   float64
 11  NumberOfTrips           3447 non-null   float64
 12  Passport                3489 non-null   int64  
 13  PitchSatisfactionScore  3489 non-null   int64  
 14  Designation             3489 non-null   

In [7]:
# For every training column, print how many rows fall on each distinct value,
# followed by a blank line. groupby drops missing values by default, so NaN
# rows are not counted here.
for column_name in train_df.columns:
    rows_per_value = train_df.groupby(column_name).size()
    print(rows_per_value, end="\n\n")

id
0       1
1       1
2       1
3       1
4       1
       ..
3484    1
3485    1
3486    1
3487    1
3488    1
Length: 3489, dtype: int64

Age
18.0      6
19.0     12
20.0     28
21.0     34
22.0     43
23.0     38
24.0     38
25.0     99
26.0     67
27.0     56
28.0     74
29.0     74
30.0     87
31.0    109
32.0    108
33.0    127
34.0    112
35.0    232
36.0    134
37.0    118
38.0    107
39.0    105
40.0     95
41.0     94
42.0     96
43.0     91
44.0     62
45.0    193
46.0     75
47.0     69
48.0     68
49.0     68
50.0     76
51.0     83
52.0     83
53.0     71
54.0     61
55.0    128
56.0     49
57.0     36
58.0     33
59.0     25
60.0     16
61.0      9
dtype: int64

TypeofContact
Company Invited    1289
Self Enquiry       2194
dtype: int64

CityTier
1    1475
2    1596
3     418
dtype: int64

DurationOfPitch
240.0       2
300.0      18
360.0      80
420.0     239
480.0     338
540.0     327
600.0     231
660.0     160
720.0     158
780.0     203
840.0     266
900.0     284


## 数値データのカテゴリ化

### Age

In [8]:
def age_to_age_cls(age):
    """Convert a numeric age into an age-band label.

    Parameters
    ----------
    age : number or missing value
        A single age value. Missing values (anything ``pd.isna`` reports
        as missing, e.g. NaN or None) are accepted.

    Returns
    -------
    str
        "不明" for a missing age, "20歳以下" for ages up to and including 20,
        a five-year band label ("21~25歳" ... "51~55歳") where each band
        includes its upper bound, and "56歳以上" for ages above 55.
    """
    if pd.isna(age):
        return "不明"

    # (inclusive upper bound, label) in ascending order: the first bound that
    # the age does not exceed determines the band.
    upper_bounds = (
        (20, "20歳以下"),
        (25, "21~25歳"),
        (30, "26~30歳"),
        (35, "31~35歳"),
        (40, "36~40歳"),
        (45, "41~45歳"),
        (50, "46~50歳"),
        (55, "51~55歳"),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label

    if age > 55:
        return "56歳以上"

In [9]:
# Add the age-band column to both datasets with the same mapping function.
for frame in (train_df, test_df):
    frame["age_cls"] = frame["Age"].apply(age_to_age_cls)

In [10]:
# Row count per age band in the training data.
train_df.groupby(["age_cls"]).size()

age_cls
20歳以下      46
21~25歳    252
26~30歳    358
31~35歳    688
36~40歳    559
41~45歳    536
46~50歳    356
51~55歳    426
56歳以上     168
不明        100
dtype: int64

### DurationOfPitch

In [11]:
def pitch_time_to_ctgr(time):
    """Convert a pitch duration into a 100-wide duration-band label.

    Parameters
    ----------
    time : number or missing value
        A single DurationOfPitch value. The labels carry an "s" suffix, so
        the value is presumably in seconds (not verifiable from this code).

    Returns
    -------
    str
        "不明" for a missing value, "400s以下" for values up to and including
        400, a band label ("401~500s" ... "1401~1500s") where each band
        includes its upper bound, and "1501s以上" for values above 1500.
    """
    if pd.isna(time):
        return "不明"
    if time > 1500:
        return "1501s以上"

    # (inclusive upper bound, label) in ascending order; the first bound the
    # value does not exceed wins.
    bands = (
        (400, "400s以下"),
        (500, "401~500s"),
        (600, "501~600s"),
        (700, "601~700s"),
        (800, "701~800s"),
        (900, "801~900s"),
        (1000, "901~1000s"),
        (1100, "1001~1100s"),
        (1200, "1101~1200s"),
        (1300, "1201~1300s"),
        (1400, "1301~1400s"),
        (1500, "1401~1500s"),
    )
    return next((label for limit, label in bands if time <= limit), None)

In [12]:
# Add the pitch-duration band column to both datasets with the same mapping.
for frame in (train_df, test_df):
    frame["DurationOfPitch_cls"] = frame["DurationOfPitch"].apply(pitch_time_to_ctgr)

In [13]:
# Row count per pitch-duration band in the training data.
train_df.groupby(["DurationOfPitch_cls"]).size()

DurationOfPitch_cls
1001~1100s    210
1101~1200s     55
1201~1300s     31
1301~1400s     96
1401~1500s     78
1501s以上       333
400s以下        100
401~500s      577
501~600s      558
601~700s      160
701~800s      361
801~900s      550
901~1000s     259
不明            121
dtype: int64

### NumberOfFollowups

100回以上は現実的ではないので、入力ミス(100倍の値が入力された)と仮定し、100で割った補正値(例:300 → 3回)を代入

In [14]:
def followups_num_to_ctgr(num):
    """Convert a follow-up count into a category label.

    Counts 1 through 6 map to "1回" ... "6回". The values 100, 200, ..., 600
    are treated as data-entry errors (the count entered 100 times too large)
    and map to the same labels as 1 ... 6.

    Parameters
    ----------
    num : number or missing value
        A single NumberOfFollowups value.

    Returns
    -------
    str
        "1回" ... "6回" for a recognised count, otherwise "不明". That covers
        missing values and any value outside the recognised set, so the
        result is always a label and never None.
    """
    if pd.isna(num):
        return "不明"

    for count in range(1, 7):
        # Accept both the plain count and its 100x mis-entered form.
        if num == count or num == count * 100:
            return f"{count}回"

    # Previously this fell through and returned None for unexpected values,
    # putting missing values back into the categorical column.
    return "不明"

In [15]:
# Add the follow-up count category column to both datasets.
for frame in (train_df, test_df):
    frame["NumberOfFollowups_cls"] = frame["NumberOfFollowups"].apply(followups_num_to_ctgr)

In [16]:
# Row count per follow-up category in the training data.
train_df.groupby(["NumberOfFollowups_cls"]).size()

NumberOfFollowups_cls
1回      78
2回     108
3回    1296
4回    1411
5回     524
6回      39
不明      33
dtype: int64

### MonthlyIncome

In [17]:
def income_num_to_ctgr(income):
    """Convert a monthly income into an income-band label.

    Parameters
    ----------
    income : number or missing value
        A single MonthlyIncome value.

    Returns
    -------
    str
        "不明" for a missing value; otherwise one of "25万円以下",
        "25~30万円", "30~35万円", "35~40万円", "40万円以上". Each band includes
        its upper bound, so exactly 400000 is "35~40万円" and only values
        strictly above 400000 are "40万円以上".
    """
    if pd.isna(income):
        return "不明"

    # Check from the highest band downwards; each test is a strict lower
    # bound, which leaves the upper bound inclusive for the band below it.
    if income > 400000:
        return "40万円以上"
    if income > 350000:
        return "35~40万円"
    if income > 300000:
        return "30~35万円"
    if income > 250000:
        return "25~30万円"
    if income <= 250000:
        return "25万円以下"

In [18]:
# Add the monthly-income band column to both datasets.
for frame in (train_df, test_df):
    frame["MonthlyIncome_cls"] = frame["MonthlyIncome"].apply(income_num_to_ctgr)

In [19]:
# Row count per monthly-income band in the training data.
train_df.groupby(["MonthlyIncome_cls"]).size()

MonthlyIncome_cls
25~30万円     832
25万円以下       37
30~35万円    1003
35~40万円     807
40万円以上      754
不明           56
dtype: int64

### TypeofContact

In [20]:
# NOTE: disabled helper, kept for reference only (nothing here is executed).
# If re-enabled, it would label a missing TypeofContact value as "不明" and
# return every other value unchanged.
# def contact_null_to_ctgr(contact):
#     if pd.isna(contact):
#         return "不明"
#     else:
#         return contact

In [21]:
# NOTE: disabled (nothing here is executed), so TypeofContact is left as
# loaded, including any missing values.
# train_df["TypeofContact"] = train_df["TypeofContact"].apply(contact_null_to_ctgr)
# test_df["TypeofContact"] = test_df["TypeofContact"].apply(contact_null_to_ctgr)

## 出力


In [22]:
# Write both frames, including the new category columns, next to the inputs.
# The row index is not written; the header row is.
outputs = (
    (train_df, "preprocess_train.csv"),
    (test_df, "preprocess_test.csv"),
)
for frame, file_name in outputs:
    frame.to_csv(path_input + file_name, index=False, header=True)