# 学習用データの前処理

In [292]:
# ライブラリの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# utilの読み込み
from my_utils import *

In [293]:
# データの読み込み
train = pd.read_csv("../data/train.csv")
train.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,customer_info,ProdTaken
0,0,50歳,Self Enquiry,2,900秒,Large Business,male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚 車未所持 子供なし,1
1,1,56歳,Company Invited,1,14分,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,"離婚済み,車あり,子供無し",0
2,2,,Self Enquiry,1,10分,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,"結婚済み、自動車未所有,子供なし",1
3,3,三十七歳,Self Enquiry,2,1080秒,Small Business,female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み、車所持、子供無し,0
4,4,48歳,Company Invited,3,1020秒,Small Business,female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身／車所持／無子,1


In [294]:
# 欠損値の確認
train.isnull().sum()

id                          0
Age                       100
TypeofContact               6
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          33
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              22
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
customer_info               0
ProdTaken                   0
dtype: int64

In [295]:
# 欠損値のある行を削除
train_dropped = train.dropna()
train_dropped.shape

(3206, 18)

In [296]:
# 前処理後のデータを格納するdataframe
train_preprocessed = pd.DataFrame()

In [297]:
train_dropped["Age"].value_counts()

Age
30代     109
40代     103
37歳      88
33歳      88
36歳      87
       ... 
２４才       1
57際       1
４１才       1
二十七歳      1
２１才       1
Name: count, Length: 236, dtype: int64

In [298]:
# Ageの前処理
age_int = train_dropped['Age'].apply(convert_age_to_int)
train_dropped.loc[:, 'AgeInt'] = age_int
train_dropped["AgeInt"].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, 'AgeInt'] = age_int


AgeInt
35    217
45    185
55    126
36    124
33    117
37    113
32    106
31    105
39     99
34     99
38     97
25     94
42     90
41     88
40     86
43     84
52     82
30     81
51     80
50     75
29     72
46     71
53     69
28     68
48     66
47     65
49     64
26     63
44     61
54     59
27     53
56     47
22     42
23     37
57     34
58     32
24     32
21     31
20     26
59     23
60     16
19     10
18      6
61      5
65      4
15      2
Name: count, dtype: int64

In [299]:
train_dropped["TypeofContact"].value_counts()

TypeofContact
Self Enquiry       2050
Company Invited    1156
Name: count, dtype: int64

In [300]:
type_of_contact_mapping = {
    "Self Enquiry": 0,
    "Company Invited": 1,
}
type_of_contact_int = train_dropped["TypeofContact"].map(type_of_contact_mapping)
train_dropped.loc[:, "TypeofContactInt"] = type_of_contact_int

train_dropped["TypeofContactInt"].value_counts()

# train_preprocessed = train_dropped[['AgeInt', 'TypeofContactInt']]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "TypeofContactInt"] = type_of_contact_int


TypeofContactInt
0    2050
1    1156
Name: count, dtype: int64

In [301]:
train_dropped["CityTier"].value_counts()

CityTier
2    1483
1    1339
3     384
Name: count, dtype: int64

In [302]:
train_dropped["DurationOfPitch"].value_counts()

DurationOfPitch
8分       243
9分       226
15分      207
14分      201
16分      197
        ... 
1560秒      6
2160秒      4
36分        3
4分         2
1140秒      2
Name: count, Length: 65, dtype: int64

In [303]:
# DurationOfPitchの前処理
duration_int = train_dropped['DurationOfPitch'].apply(convert_duration_to_int)
train_dropped.loc[:, 'DurationInt'] = duration_int
train_dropped["DurationInt"].value_counts()

# train_preprocessed = train_dropped[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int']]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, 'DurationInt'] = duration_int


DurationInt
480     317
540     314
900     272
840     258
960     249
420     233
600     209
780     196
660     154
720     152
1020    129
360      76
1080     65
1920     51
1380     48
1860     42
1320     41
1440     37
1500     36
1620     35
1980     34
1200     33
1260     30
2040     29
1800     28
1680     26
1560     25
2100     23
1740     19
300      18
1140     18
2160      7
240       2
Name: count, dtype: int64

In [304]:
train_dropped["Occupation"].value_counts()

Occupation
Small Business    1568
Salaried          1304
Large Business     334
Name: count, dtype: int64

In [305]:
occupation_mapping = {
    "Salaried": 0,
    "Small Business": 1,
    "Large Business": 2,
}
occupation_int = train_dropped["Occupation"].map(occupation_mapping)
train_dropped.loc[:, "OccupationInt"] = occupation_int

# train_preprocessed = train_dropped[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int']]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "OccupationInt"] = occupation_int


In [306]:
train_dropped["Gender"].value_counts()

Gender
Male       1132
Female      677
male        433
female      242
MALE        157
Ｍａｌｅ        116
Fe Male     108
FEMALE       88
Ｆｅｍａｌｅ       80
ｍａｌｅ         42
ｆｅｍａｌｅ       31
fe male      29
FE MALE      23
Ｆｅ　Ｍａｌｅ      15
ＭＡＬＥ         15
ＦＥＭＡＬＥ       11
ｆｅ　ｍａｌｅ       6
ＦＥ　ＭＡＬＥ       1
Name: count, dtype: int64

In [307]:
gender_int = train_dropped["Gender"].map(convert_gender_to_int)
train_dropped.loc[:, "GenderInt"] = gender_int
train_dropped["GenderInt"].value_counts()

# train_preprocessed = train_dropped[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int"]]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "GenderInt"] = gender_int


GenderInt
0    1895
1    1311
Name: count, dtype: int64

In [308]:
train_dropped["NumberOfPersonVisiting"].value_counts()

NumberOfPersonVisiting
2.0    1443
3.0    1295
1.0     307
4.0     161
Name: count, dtype: int64

In [309]:
train_dropped["NumberOfFollowups"].value_counts()

NumberOfFollowups
4.0      1339
3.0      1132
5.0       489
2.0       102
1.0        71
6.0        38
300.0      15
400.0      14
500.0       4
100.0       1
600.0       1
Name: count, dtype: int64

In [310]:
number_of_followups_normalized = train_dropped["NumberOfFollowups"].map(normalize_number_of_followups)
train_dropped.loc[:, "FollowupsNormalized"] = number_of_followups_normalized
train_dropped["FollowupsNormalized"].value_counts()

# train_preprocessed = train_dropped[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", "NumberOfFollowups_normalized"]]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "FollowupsNormalized"] = number_of_followups_normalized


FollowupsNormalized
4.0    1353
3.0    1147
5.0     493
2.0     102
1.0      72
6.0      39
Name: count, dtype: int64

In [311]:
train_dropped["ProductPitched"].value_counts()

ProductPitched
Basic           801
Deluxe          707
Standard        648
Super Deluxe    228
basic            99
               ... 
super de|uxe      1
de|uxe            1
ｓuper deluxe      1
STANDARᎠ          1
ЅTANDARD          1
Name: count, Length: 72, dtype: int64

In [312]:
product_pitched_int = train_dropped["ProductPitched"].map(convert_product_pitched_to_int)
train_dropped.loc[:, "ProductPitchedInt"] = product_pitched_int
train_dropped["ProductPitchedInt"].value_counts()

# train_preprocessed = train_dropped[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", "NumberOfFollowups_normalized", "ProductPitched_int"]]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "ProductPitchedInt"] = product_pitched_int


ProductPitchedInt
0    1052
1     954
2     895
3     305
Name: count, dtype: int64

In [313]:
train_dropped["PreferredPropertyStar"].value_counts()

PreferredPropertyStar
3.0    1944
4.0    1085
5.0     177
Name: count, dtype: int64

In [314]:
train_dropped["NumberOfTrips"].value_counts()

NumberOfTrips
2         952
3         627
5         416
1         297
7         213
年に2回      134
4         134
年に3回      106
6          97
年に5回       75
年に1回       49
年に7回       37
年に6回       20
年に4回       19
半年に1回      17
8           8
四半期に1回      3
年に8回        2
Name: count, dtype: int64

In [315]:
number_of_trips_int = train_dropped["NumberOfTrips"].map(convert_trips_to_int)
train_dropped.loc[:, "NumberOfTripsInt"] = number_of_trips_int

# train_preprocessed = train_dropped[[
#     'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
#     "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int"
# ]]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "NumberOfTripsInt"] = number_of_trips_int


In [316]:
train_dropped["Passport"].value_counts()

Passport
0    2867
1     339
Name: count, dtype: int64

In [317]:
train_dropped["PitchSatisfactionScore"].value_counts()

PitchSatisfactionScore
2    1174
3     658
1     651
4     627
5      96
Name: count, dtype: int64

In [318]:
train_dropped["Designation"].value_counts()

Designation
Executive         989
Manager           865
Senior Manager    822
AVP               282
VP                110
Μanager            12
Exеcutive           9
Exеcutivе           8
Managеr             7
Executivе           7
Manαger             7
АVP                 7
Senior Managеr      6
Executiѵe           5
Execuｔive           5
Mαnager             4
Senior Manαger      4
Exеcutiѵе           3
Senior Mαnαger      3
ΑVP                 3
Manage𝙧             3
Sеnior Manager      3
Еxecutive           3
Е×еcutive           2
AVＰ                 2
Տenior Manager      2
Exеcｕtive           2
VＰ                  2
Ѕenior Manager      2
Senior Manage𝙧      2
Еxecutivе           2
E×ecutive           2
Μanagеr             1
Senior Mαnager      1
Еxеcutivе           1
Mαnagеr             1
Exеcｕtivе           1
Senior Μanαger      1
Sеnior Managеr      1
Executiѵе           1
E×ecｕtive           1
Mαnαger             1
Manαgеr             1
Execｕtive           1
Еxecuｔive           

In [319]:
designation_pitched_int = train_dropped["Designation"].map(convert_designation_to_int)
train_dropped.loc[:, "DesignationInt"] = designation_pitched_int
train_dropped["DesignationInt"].value_counts()

# train_preprocessed = train_dropped[[
#     'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
#     "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", "PitchSatisfactionScore", "Designation_int"
# ]]
# train_preprocessed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "DesignationInt"] = designation_pitched_int


DesignationInt
0    1045
1     903
2     852
3     284
4     122
Name: count, dtype: int64

In [320]:
train_dropped["MonthlyIncome"].value_counts()

MonthlyIncome
月収30.0万円    165
月収40.0万円    125
月収50.0万円     35
月収35.0万円     23
月収31.0万円     21
           ... 
385650.0      1
446430.0      1
562470.0      1
449550.0      1
278190.0      1
Name: count, Length: 2378, dtype: int64

In [321]:
monthly_income_int = train_dropped["MonthlyIncome"].map(convert_income_to_int)
train_dropped.loc[:, "MonthlyIncomeInt"] = monthly_income_int
train_dropped["MonthlyIncomeInt"].value_counts()

# train_preprocessed = train_dropped[[
#     'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
#     "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", "PitchSatisfactionScore", "Designation_int", "MonthlyIncome_int"
# ]]
# train_preprocessed.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "MonthlyIncomeInt"] = monthly_income_int


MonthlyIncomeInt
300000.0    165
400000.0    125
500000.0     35
350000.0     23
310000.0     21
           ... 
385650.0      1
446430.0      1
562470.0      1
449550.0      1
278190.0      1
Name: count, Length: 2377, dtype: int64

In [322]:
train_dropped["customer_info"].value_counts()

customer_info
結婚済み、車所持、子供1人           29
結婚済み、車未所持、子供1人          28
離婚済み、車未所持、子供1人          20
未婚、車未所持、子供1人            18
離婚済み、車所持、子供なし           18
                        ..
離婚済み、自動車未所有、子供有り(1人)     1
未婚 自動車未所有　非育児家庭          1
離婚済み/自動車所有/1児            1
未婚 車未所持　子供2人             1
独身／車所持／こども1人             1
Name: count, Length: 1645, dtype: int64

In [323]:
customer_info_kekkon_int = train_dropped["customer_info"].map(extract_marital_from_customer_info)
train_dropped.loc[:, "MaritalInt"] = customer_info_kekkon_int
train_dropped["MaritalInt"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "MaritalInt"] = customer_info_kekkon_int


MaritalInt
1    1274
0    1123
2     809
Name: count, dtype: int64

In [324]:
customer_info_car_int = train_dropped["customer_info"].map(extract_car_from_customer_info)
train_dropped.loc[:, "CarInt"] = customer_info_car_int
train_dropped["CarInt"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "CarInt"] = customer_info_car_int


CarInt
0    1623
1    1583
Name: count, dtype: int64

In [325]:
customer_info_child_int = train_dropped["customer_info"].map(extract_child_from_customer_info)
train_dropped.loc[:, "ChildInt"] = customer_info_child_int
train_dropped["ChildInt"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dropped.loc[:, "ChildInt"] = customer_info_child_int


ChildInt
 1    1417
 0     897
 2     786
 3      70
-1      36
Name: count, dtype: int64

In [326]:
train_preprocessed = train_dropped[[
    'AgeInt', 'TypeofContactInt', 'CityTier', 'DurationInt', 'OccupationInt', "GenderInt", "NumberOfPersonVisiting", 
    "FollowupsNormalized", "ProductPitchedInt", "PreferredPropertyStar", "NumberOfTripsInt", "Passport", "PitchSatisfactionScore", 
    "DesignationInt", "MonthlyIncomeInt", "MaritalInt", "CarInt", "ChildInt"
]]
train_preprocessed.head()

Unnamed: 0,AgeInt,TypeofContactInt,CityTier,DurationInt,OccupationInt,GenderInt,NumberOfPersonVisiting,FollowupsNormalized,ProductPitchedInt,PreferredPropertyStar,NumberOfTripsInt,Passport,PitchSatisfactionScore,DesignationInt,MonthlyIncomeInt,MaritalInt,CarInt,ChildInt
0,50,0,2,900,2,0,1.0,4.0,0,3.0,5,1,4,0,253905.0,0,0,0
1,56,1,1,840,0,0,1.0,4.0,1,3.0,2,1,4,2,404475.0,2,1,0
3,37,0,2,1080,1,1,1.0,3.0,1,4.0,1,0,5,2,326805.0,2,1,0
4,48,1,3,1020,1,1,1.0,3.0,0,4.0,4,0,4,0,258435.0,0,1,0
5,19,0,2,960,1,0,1.0,3.0,0,3.0,2,0,4,0,260000.0,0,0,0


In [327]:
train_dropped["ProdTaken"].value_counts()

ProdTaken
0    2761
1     445
Name: count, dtype: int64

In [328]:
train_preprocessed = train_dropped[[
    "id", 'AgeInt', 'TypeofContactInt', 'CityTier', 'DurationInt', 'OccupationInt', "GenderInt", "NumberOfPersonVisiting", 
    "FollowupsNormalized", "ProductPitchedInt", "PreferredPropertyStar", "NumberOfTripsInt", "Passport", "PitchSatisfactionScore", 
    "DesignationInt", "MonthlyIncomeInt", "MaritalInt", "CarInt", "ChildInt", "ProdTaken"
]]
train_preprocessed.head()

Unnamed: 0,id,AgeInt,TypeofContactInt,CityTier,DurationInt,OccupationInt,GenderInt,NumberOfPersonVisiting,FollowupsNormalized,ProductPitchedInt,PreferredPropertyStar,NumberOfTripsInt,Passport,PitchSatisfactionScore,DesignationInt,MonthlyIncomeInt,MaritalInt,CarInt,ChildInt,ProdTaken
0,0,50,0,2,900,2,0,1.0,4.0,0,3.0,5,1,4,0,253905.0,0,0,0,1
1,1,56,1,1,840,0,0,1.0,4.0,1,3.0,2,1,4,2,404475.0,2,1,0,0
3,3,37,0,2,1080,1,1,1.0,3.0,1,4.0,1,0,5,2,326805.0,2,1,0,0
4,4,48,1,3,1020,1,1,1.0,3.0,0,4.0,4,0,4,0,258435.0,0,1,0,1
5,5,19,0,2,960,1,0,1.0,3.0,0,3.0,2,0,4,0,260000.0,0,0,0,0


In [329]:
train_preprocessed.to_csv("../mid_output/train_preprocessed.csv", index=False)