# testデータの前処理用ノートブック

In [559]:
# ライブラリの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [560]:
# Only the test split is loaded in this notebook; the train load is left commented out.
#train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [561]:
test.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,customer_info
0,3489,４８歳,Self Enquiry,2,13分,Small Business,Male,1.0,4.0,Super De|uxe,3.0,7,0,3,AVP,496950.0,結婚済み 車所持 子供なし
1,3490,30代,Self Enquiry,2,12分,Small Business,Ｆｅｍａｌｅ,1.0,4.0,Standard,3.0,4,1,3,Senior Manager,月収30.0万円,結婚済み、車未所持、子供なし
2,3491,25歳,Self Enquiry,1,540秒,Salaried,Female,1.0,4.0,Basic,3.0,1,0,3,Executive,月収26.0万円,離婚済み、自動車未所有、子供なし
3,3492,21歳,Company Invited,2,420秒,Salaried,Male,1.0,4.0,Basic,4.0,1,0,3,Senior Manager,259875.0,離婚済み、自動車所有、子供なし
4,3493,41歳,Company Invited,1,7分,Salaried,MALE,1.0,4.0,Basic,3.0,1,0,4,Executive,268830.0,独身/車所持／子供なし


In [562]:
test.isnull().sum()

id                          0
Age                        93
TypeofContact              12
CityTier                    0
DurationOfPitch           131
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          24
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              42
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              49
customer_info               0
dtype: int64

In [563]:
# DataFrames that hold the data during and after preprocessing.
# Derived columns are added to a copy so the raw `test` frame is not modified.
test_copied = test.copy()
# Empty placeholder; later cells rebind this name to column selections of `test_copied`.
test_preprocessed = pd.DataFrame()

In [564]:
# Age preprocessing
# One run of digits in any supported script: ASCII, kanji (〇-九) or full-width (０-９).
match_type = r'[0-9]+|[〇一二三四五六七八九]+|[０-９]+'
# Folds kanji and full-width digits onto ASCII digits. "十" (ten) is positional
# rather than a digit, so it is left in place and handled by _JUU_PATTERN.
kanji_to_arabic = str.maketrans(
    "〇一二三四五六七八九０１２３４５６７８９",
    "01234567890123456789",
)
# "<tens>十<ones>" after digit folding; either side may be absent ("十八", "四十").
_JUU_PATTERN = re.compile(r'([0-9]?)十([0-9]?)')


def extract_age(age: str) -> int:
    """Convert a free-form Japanese age string into an integer age.

    Supported spellings include ASCII digits ("25歳"), full-width digits
    ("４８歳", "５０代") and kanji numerals with or without "十"
    ("四十八歳", "二十歳", "十八歳").  A decade expression containing "代"
    (e.g. "30代") is mapped to the middle of the decade (35).

    Args:
        age: Raw value of the Age column; may be missing (None/NaN).

    Returns:
        The parsed age, or -1 when the value is missing or contains no number.
    """
    def kanji_to_int(kanji: str) -> int:
        # Fold every digit to ASCII and drop whitespace so one number is one run.
        folded = "".join(kanji.translate(kanji_to_arabic).split())
        juu = _JUU_PATTERN.search(folded)
        if juu:
            # A missing tens digit means "ten" itself (十八 -> 18);
            # a missing ones digit means a round number (四十 -> 40).
            tens = int(juu.group(1) or 1)
            ones = int(juu.group(2) or 0)
            return tens * 10 + ones
        run = re.search(match_type, folded)
        return int(run.group(0)) if run else -1

    if pd.isnull(age):
        return -1
    text = str(age)
    value = kanji_to_int(text)
    # Only shift to the decade midpoint when a number was actually found;
    # otherwise the -1 sentinel would turn into a bogus age of 4.
    if value != -1 and "代" in text:
        return value + 5
    return value

# Parse every Age entry and store the result as a new column of the working copy.
age_int = test_copied['Age'].apply(extract_age)
test_copied.loc[:, 'Age_int'] = age_int

# Preprocessed view so far: parsed age only. head() is the cell's displayed output.
test_preprocessed = test_copied[['Age_int']]
test_preprocessed.head()


Unnamed: 0,Age_int
0,48
1,35
2,25
3,21
4,41


In [565]:
test_copied["TypeofContact"].value_counts()

TypeofContact
Self Enquiry       2250
Company Invited    1227
Name: count, dtype: int64

In [566]:
# Integer codes for the two contact types seen in value_counts().
type_of_contact_mapping = {
    "Self Enquiry": 0,
    "Company Invited": 1,
}
# Series.map leaves values absent from the mapping (including missing ones) as NaN,
# which is why the resulting column is displayed as float (0.0 / 1.0).
# NOTE(review): other columns in this notebook use -1 for missing values; here
# missing stays NaN — confirm this difference is intended.
type_of_contact_int = test_copied["TypeofContact"].map(type_of_contact_mapping)
test_copied.loc[:, "TypeofContact_int"] = type_of_contact_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocessed = test_copied[['Age_int', 'TypeofContact_int']]
test_preprocessed.head()

Unnamed: 0,Age_int,TypeofContact_int
0,48,0.0
1,35,0.0
2,25,0.0
3,21,1.0
4,41,1.0


In [567]:
test_copied["CityTier"].value_counts()

CityTier
2    1564
1    1516
3     409
Name: count, dtype: int64

In [568]:
# CityTier is already numeric, so it is selected as-is.
# NOTE(review): the name here is `test_preprocesse` (no trailing "d"), unlike
# `test_preprocessed` in the earlier cells; the remaining cells and the final
# to_csv call use this spelling as well.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier']]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier
0,48,0.0,2
1,35,0.0,2
2,25,0.0,1
3,21,1.0,2
4,41,1.0,1


In [569]:
test_copied["DurationOfPitch"].value_counts()

DurationOfPitch
9分       251
8分       236
15分      216
16分      206
14分      191
        ... 
1800秒      7
1620秒      5
1680秒      3
240秒       1
2160秒      1
Name: count, Length: 65, dtype: int64

In [570]:
# DurationOfPitch preprocessing
def extract_duration(duration_str: str) -> int:
    """Convert a pitch duration such as "13分" or "540秒" into seconds.

    Returns -1 for missing values and for strings that carry neither the
    minute ("分") nor the second ("秒") unit.
    """
    if pd.isnull(duration_str):
        return -1
    # Minutes are checked before seconds, mirroring the unit priority of the column.
    for unit, seconds_per_unit in (("分", 60), ("秒", 1)):
        if unit in duration_str:
            return int(duration_str.replace(unit, "")) * seconds_per_unit
    return -1
    
# Convert every DurationOfPitch entry to seconds and store it as a new column.
duration_int = test_copied['DurationOfPitch'].apply(extract_duration)
test_copied.loc[:, 'Duration_int'] = duration_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int']]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int
0,48,0.0,2,780
1,35,0.0,2,720
2,25,0.0,1,540
3,21,1.0,2,420
4,41,1.0,1,420


In [571]:
test_copied["Occupation"].value_counts()

Occupation
Small Business    1729
Salaried          1400
Large Business     360
Name: count, dtype: int64

In [572]:
# Integer codes for the three occupations seen in value_counts().
occupation_mapping = {
    "Salaried": 0,
    "Small Business": 1,
    "Large Business": 2,
}
# Values absent from the mapping would become NaN (Series.map semantics).
occupation_int = test_copied["Occupation"].map(occupation_mapping)
test_copied.loc[:, "Occupation_int"] = occupation_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int']]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int
0,48,0.0,2,780,1
1,35,0.0,2,720,1
2,25,0.0,1,540,0
3,21,1.0,2,420,0
4,41,1.0,1,420,0


In [573]:
test_copied["Gender"].value_counts()

Gender
Male       1299
Female      702
male        472
female      238
MALE        180
Ｍａｌｅ        134
Fe Male     104
FEMALE       98
Ｆｅｍａｌｅ       92
ｍａｌｅ         49
ｆｅｍａｌｅ       29
fe male      28
ＭＡＬＥ         20
FE MALE      20
Ｆｅ　Ｍａｌｅ      11
ＦＥＭＡＬＥ        8
ｆｅ　ｍａｌｅ       5
Name: count, dtype: int64

In [574]:
z_ascii = '\u3000' + ''.join(chr(i) for i in range(0xFF01, 0xFF5E + 1))
# 　！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ｛｜｝～

h_ascii = ''.join(chr(i) for i in range(0x0020, 0x007E + 1))
#  !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~

z2h_ascii = str.maketrans(z_ascii, h_ascii)

def extract_gender(gender_str: str) -> int:
    # 空白を削除
    gender_str_stripped = gender_str.replace(" ", "").replace("　", "")
    gender_str_normalized = gender_str_stripped.translate(z2h_ascii).lower()
    if gender_str_normalized == "male":
        return 0
    elif gender_str_normalized == "female":
        return 1
    else:
        return -1
    
# Encode every Gender entry and store it as a new column.
gender_int = test_copied["Gender"].map(extract_gender)
test_copied.loc[:, "Gender_int"] = gender_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int"]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int
0,48,0.0,2,780,1,0
1,35,0.0,2,720,1,1
2,25,0.0,1,540,0,1
3,21,1.0,2,420,0,0
4,41,1.0,1,420,0,0


In [575]:
test_copied["NumberOfPersonVisiting"].value_counts()

NumberOfPersonVisiting
2.0    1555
3.0    1396
1.0     350
4.0     188
Name: count, dtype: int64

In [576]:
# NumberOfPersonVisiting is already numeric, so it is selected unchanged.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting"]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting
0,48,0.0,2,780,1,0,1.0
1,35,0.0,2,720,1,1,1.0
2,25,0.0,1,540,0,1,1.0
3,21,1.0,2,420,0,0,1.0
4,41,1.0,1,420,0,0,1.0


In [577]:
test_copied["NumberOfFollowups"].value_counts()

NumberOfFollowups
4.0      1350
3.0      1257
5.0       594
2.0       104
1.0        99
6.0        31
400.0      16
300.0       8
500.0       4
100.0       1
200.0       1
Name: count, dtype: int64

In [578]:
def normalize_number_of_followups(followups: float) -> float:
    """Undo the x100 scaling seen on some NumberOfFollowups entries.

    Values below 100 are returned unchanged, values in [100, 1000) are
    divided by 100, and everything else (including NaN, for which both
    comparisons are false) yields -1.
    """
    if followups < 100:
        return followups
    return followups / 100 if followups < 1000 else -1
    
# Normalize every NumberOfFollowups entry and store it as a new column.
number_of_followups_normalized = test_copied["NumberOfFollowups"].map(normalize_number_of_followups)
test_copied.loc[:, "NumberOfFollowups_normalized"] = number_of_followups_normalized
# Not the last expression of the cell, so this result is presumably not displayed.
test_copied["NumberOfFollowups_normalized"].value_counts()

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", "NumberOfFollowups_normalized"]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized
0,48,0.0,2,780,1,0,1.0,4.0
1,35,0.0,2,720,1,1,1.0,4.0
2,25,0.0,1,540,0,1,1.0,4.0
3,21,1.0,2,420,0,0,1.0,4.0
4,41,1.0,1,420,0,0,1.0,4.0


In [579]:
test_copied["ProductPitched"].value_counts()

ProductPitched
Basic           923
Deluxe          824
Standard        591
Super Deluxe    233
King            126
               ... 
BASIС             1
𐊡asic             1
Вasic             1
basiс             1
BAЅIC             1
Name: count, Length: 71, dtype: int64

In [580]:
from rapidfuzz import process

# Ordinal codes for the product tiers, keyed by the normalized
# (lower-case, space-free) product name.
# "king" is included because "King" is a regular value of the column; without
# it every King row would be fuzzy-matched onto one of the other tiers.
# NOTE(review): code 4 assumes King ranks above Super Deluxe — confirm it
# matches the encoding used for the training data.
product_pitched_mapping = {
    "basic": 0,
    "standard": 1,
    "deluxe": 2,
    "superdeluxe": 3,
    "king": 4,
}

def extract_product_pitched(product_str: str) -> int:
    """Encode a noisy ProductPitched label as its ordinal tier code.

    The label is stripped of spaces, folded from full-width to ASCII,
    lower-cased and reduced to alphabetic characters; the canonical tier
    name closest to the result (rapidfuzz ``process.extractOne``) decides
    the code.

    Args:
        product_str: Raw value of the ProductPitched column.

    Returns:
        The code from ``product_pitched_mapping``, or -1 when no candidate
        is returned by the fuzzy matcher.
    """
    # Remove half-width and full-width spaces.
    product_str_stripped = product_str.replace(" ", "").replace("　", "")
    product_str_normalized = product_str_stripped.translate(z2h_ascii).lower()
    # Drop punctuation typos such as "|"; look-alike letters from other
    # scripts are alphabetic and are left for the fuzzy match to absorb.
    product_str_filtered = "".join(filter(str.isalpha, product_str_normalized))

    closest_match = process.extractOne(product_str_filtered, product_pitched_mapping.keys())
    if closest_match is None:
        return -1
    # extractOne returns a tuple whose first element is the chosen candidate.
    return product_pitched_mapping.get(closest_match[0], -1)
    
# Encode every ProductPitched entry and store it as a new column.
product_pitched_int = test_copied["ProductPitched"].map(extract_product_pitched)
test_copied.loc[:, "ProductPitched_int"] = product_pitched_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[['Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", "NumberOfFollowups_normalized", "ProductPitched_int"]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int
0,48,0.0,2,780,1,0,1.0,4.0,3
1,35,0.0,2,720,1,1,1.0,4.0,1
2,25,0.0,1,540,0,1,1.0,4.0,0
3,21,1.0,2,420,0,0,1.0,4.0,0
4,41,1.0,1,420,0,0,1.0,4.0,0


In [581]:
test_copied["PreferredPropertyStar"].value_counts()

PreferredPropertyStar
3.0    2181
4.0    1109
5.0     199
Name: count, dtype: int64

In [582]:
# PreferredPropertyStar is already numeric, so it is selected unchanged.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar"
]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0


In [583]:
test_copied["NumberOfTrips"].value_counts()

NumberOfTrips
2         950
3         762
5         400
1         320
7         226
年に2回      153
4         136
6         125
年に3回      124
年に1回       72
年に5回       62
年に4回       31
年に7回       31
年に6回       26
8          12
半年に1回      10
年に8回        4
四半期に1回      3
Name: count, dtype: int64

In [584]:
def extract_number_of_trips(trips_str: str) -> int:
    """Convert a NumberOfTrips entry into trips per year.

    Plain numbers ("7") and "年にN回" (N times a year) are taken as-is,
    "半年にN回" (N times per half year) is doubled and "四半期にN回"
    (N times per quarter) is quadrupled.

    Args:
        trips_str: Raw value of the NumberOfTrips column; may be missing.

    Returns:
        Trips per year, or -1 when the value is missing or has no digits.
    """
    if pd.isnull(trips_str):
        return -1
    text = str(trips_str)
    # isdecimal() keeps exactly the characters int() can parse (isdigit()
    # would also let through e.g. superscripts, which int() rejects).
    digits = "".join(ch for ch in text if ch.isdecimal())
    if not digits:
        # No number at all: report "unknown" like the other extractors
        # instead of crashing on int("").
        return -1
    trips_int = int(digits)
    if "半年" in text:
        return trips_int * 2
    if "四半期" in text:
        return trips_int * 4
    return trips_int
    
# Convert every NumberOfTrips entry to trips per year and store it as a new column.
number_of_trips_int = test_copied["NumberOfTrips"].map(extract_number_of_trips)
test_copied.loc[:, "NumberOfTrips_int"] = number_of_trips_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int"
]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1


In [585]:
test_copied["Passport"].value_counts()

Passport
0    3157
1     332
Name: count, dtype: int64

In [586]:
# Passport is already a 0/1 column, so it is selected unchanged.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport"
]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int,Passport
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7,0
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4,1
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1,0
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1,0
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1,0


In [587]:
test_copied["PitchSatisfactionScore"].value_counts()

PitchSatisfactionScore
2    1231
1     753
4     713
3     652
5     140
Name: count, dtype: int64

In [588]:
# PitchSatisfactionScore is already an integer score, so it is selected unchanged.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", "PitchSatisfactionScore"
]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int,Passport,PitchSatisfactionScore
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7,0,3
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4,1,3
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1,0,3
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1,0,3
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1,0,4


In [589]:
test_copied["Designation"].value_counts()

Designation
Executive         1129
Manager           1016
Senior Manager     740
AVP                305
VP                 142
Exеcutivе           10
Managеr             10
Executivе            9
Exеcutive            9
Mαnager              9
Manαger              8
ΑVP                  8
АVP                  7
Executiѵe            7
Senior Managеr       6
Senio𝙧 Manager       5
Еxecutive            5
Manage𝙧              5
Μanager              5
Sеnior Manager       4
VＰ                   3
Execｕtive            3
Senior Manage𝙧       3
Senior Μanager       2
Ѕenior Manager       2
Senior Manαger       2
AVＰ                  2
Տenior Manager       2
E×ecutive            2
Μαnager              2
Mαnαger              2
Execｕtivе            1
Sеnio𝙧 Manager       1
Exеcｕtivе            1
Mαnαgеr              1
Sеnior Managеr       1
АVＰ                  1
Senior Mαnαger       1
Senior Managе𝙧       1
Mαnagеr              1
Еxecｕtive            1
Senio𝙧 Manage𝙧       1
Senio𝙧 Mαnage𝙧       1

In [590]:
# Integer codes for the job titles, keyed by the normalized
# (lower-case, space-free) designation.
designation_mapping = {
    "executive": 0,
    "manager": 1,
    "seniormanager": 2,
    "avp": 3,
    "vp": 4,
}

def extract_designation(designation_str: str):
    """Encode a noisy Designation label as its integer code.

    Spaces are removed, full-width characters are folded to ASCII, the text
    is lower-cased and reduced to alphabetic characters; the canonical title
    chosen by rapidfuzz ``process.extractOne`` decides the code.  Returns -1
    if the chosen title is not a key of ``designation_mapping``.

    NOTE(review): "avp" and "vp" differ by a single character, so look-alike
    spellings of these short titles depend entirely on the fuzzy scorer —
    check the value counts of the encoded column against the raw ones.
    """
    # Remove half-width and full-width spaces.
    compact = "".join(designation_str.split(" ")).replace("　", "")
    lowered = compact.translate(z2h_ascii).lower()
    letters_only = "".join(ch for ch in lowered if ch.isalpha())

    best_candidate = process.extractOne(letters_only, designation_mapping.keys())
    # The first element of the result is the selected canonical title.
    return designation_mapping.get(best_candidate[0], -1)
    
# Encode every Designation entry and store it as a new column.
designation_pitched_int = test_copied["Designation"].map(extract_designation)
test_copied.loc[:, "Designation_int"] = designation_pitched_int
# Not the last expression of the cell, so this result is presumably not displayed.
test_copied["Designation_int"].value_counts()

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", "PitchSatisfactionScore", "Designation_int"
]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int,Passport,PitchSatisfactionScore,Designation_int
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7,0,3,3
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4,1,3,2
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1,0,3,0
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1,0,3,2
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1,0,4,0


In [591]:
test_copied["MonthlyIncome"].value_counts()

MonthlyIncome
月収30.0万円    203
月収40.0万円    127
月収50.0万円     45
月収26.0万円     38
月収32.0万円     35
           ... 
247425.0      1
312990.0      1
240735.0      1
255000.0      1
411795.0      1
Name: count, Length: 2528, dtype: int64

In [592]:
# Decimal number such as "30.0" (the usual form inside "月収30.0万円").
shousu_pattern = r'\d+\.\d+'

def extract_monthly_income(monthly_income_str: str) -> float:
    """Convert a MonthlyIncome entry into a plain amount.

    Entries written in units of ten thousand ("月収30.0万円", "月収30万円")
    are multiplied by 10000; everything else is parsed directly as a float
    ("259875.0").

    Args:
        monthly_income_str: Raw value of the MonthlyIncome column; may be
            missing.

    Returns:
        The income as a float, or -1 when the value is missing or cannot
        be parsed.
    """
    if pd.isnull(monthly_income_str):
        return -1
    text = str(monthly_income_str)
    if "万" in text:
        # Prefer the decimal form; fall back to a bare integer so that
        # "30万円" is parsed instead of failing on a missing match.
        my_match = re.search(shousu_pattern, text) or re.search(r'\d+', text)
        if my_match is None:
            return -1
        return float(my_match.group(0)) * 10000
    try:
        return float(text)
    except ValueError:
        return -1
    
# Parse every MonthlyIncome entry and store it as a new column.
# Despite the "_int" suffix the values are floats (see the displayed output).
monthly_income_int = test_copied["MonthlyIncome"].map(extract_monthly_income)
test_copied.loc[:, "MonthlyIncome_int"] = monthly_income_int

# Preprocessed view so far; head() is the cell's displayed output.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", "PitchSatisfactionScore", "Designation_int", "MonthlyIncome_int"
]]
test_preprocesse.head()


Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int,Passport,PitchSatisfactionScore,Designation_int,MonthlyIncome_int
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7,0,3,3,496950.0
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4,1,3,2,300000.0
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1,0,3,0,260000.0
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1,0,3,2,259875.0
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1,0,4,0,268830.0


In [593]:
# Write the distinct customer_info texts and their counts to a CSV for manual inspection.
test_copied["customer_info"].value_counts().to_csv("../output/customer_info.csv")

In [594]:
def extract_customer_info_kekkon(customer_info_str: str) -> int:
    """Extract the marital status from a free-form customer_info text.

    Returns 0 for never married ("未婚" / "独身"), 1 for married ("結婚"),
    2 for divorced ("離婚") and -1 when none of the keywords is present.
    Keywords are tried in that order and the first hit wins.
    """
    keyword_codes = (("未婚", 0), ("独身", 0), ("結婚", 1), ("離婚", 2))
    for keyword, code in keyword_codes:
        if keyword in customer_info_str:
            return code
    return -1
    
# Extract the marital status from customer_info and store it as a new column.
customer_info_kekkon_int = test_copied["customer_info"].map(extract_customer_info_kekkon)
test_copied.loc[:, "customer_info_kekkon_int"] = customer_info_kekkon_int
# Displayed as the cell output to check the distribution of the codes.
test_copied["customer_info_kekkon_int"].value_counts()

customer_info_kekkon_int
1    1462
0    1217
2     810
Name: count, dtype: int64

In [595]:
def extract_customer_info_car(customer_info_str: str) -> int:
    """Extract car ownership from a free-form customer_info text.

    Args:
        customer_info_str: Raw value of the customer_info column.

    Returns:
        1 when the text says a car is owned ("車所持", "自動車所有",
        "車あり", "車保有"), 0 when it says none is owned ("車未所持",
        "車なし", "車保有なし") and -1 when no known phrase is present.
    """
    # Same precedence as before: explicit "owns" phrases first, then the
    # negative phrases.
    if any(marker in customer_info_str for marker in ("車所", "車あり")):
        return 1
    if any(marker in customer_info_str for marker in ("車未所", "車なし", "車保有なし")):
        return 0
    # "車保有" must come after the "車保有なし" check above.
    if "車保有" in customer_info_str:
        return 1
    # Unrecognized text is reported as unknown rather than silently counted
    # as "owns a car", so that filtering on -1 surfaces the missed phrasings.
    return -1
    
# Extract car ownership from customer_info and store it as a new column.
customer_info_car_int = test_copied["customer_info"].map(extract_customer_info_car)
test_copied.loc[:, "customer_info_car_int"] = customer_info_car_int
# Displayed as the cell output to check the distribution of the codes.
test_copied["customer_info_car_int"].value_counts()

customer_info_car_int
0    1834
1    1655
Name: count, dtype: int64

In [596]:
# Dump the customer_info texts whose car flag is -1 (unrecognized) for manual review.
# NOTE(review): this only yields rows if extract_customer_info_car can return -1 — confirm.
test_copied[test_copied["customer_info_car_int"] == -1]["customer_info"].value_counts().to_csv("../output/customer_info2.csv")

In [597]:
# Child count written as "<digits>人" (N persons) or "<digits>児" (N children).
child_pattern = r"[0-9]+人"
child_pattern2 = r"[0-9]+児"

def extract_customer_info_child(customer_info_str: str) -> int:
    """Extract the number of children from a free-form customer_info text.

    Returns 0 when the text states there are no children, the number in
    front of "人" or "児" when one is present (the "人" form is tried
    first), and -1 otherwise.
    """
    childless_markers = ("子供なし", "子供無し", "子供ゼロ", "無子", "非育児家庭")
    if any(marker in customer_info_str for marker in childless_markers):
        return 0
    for pattern in (child_pattern, child_pattern2):
        found = re.search(pattern, customer_info_str)
        if found:
            # The match is "<digits><counter>"; drop the one-character counter.
            return int(found.group(0)[:-1])
    return -1
    
# Extract the number of children from customer_info and store it as a new column.
customer_info_child_int = test_copied["customer_info"].map(extract_customer_info_child)
test_copied.loc[:, "customer_info_child_int"] = customer_info_child_int
# Displayed as the cell output to check the distribution (-1 = not recognized).
test_copied["customer_info_child_int"].value_counts()

customer_info_child_int
 1    1564
 0     924
 2     897
 3      67
-1      37
Name: count, dtype: int64

In [598]:
test_copied[test_copied["customer_info_child_int"] == -1]["customer_info"].value_counts()

customer_info
結婚済み/乗用車なし/子供の数不明     2
結婚済み、自動車所有、子供の数不明     2
結婚済み、車所持、子供の数不明       2
独身、自動車所有、子供の数不明       2
独身、自家用車なし、子供の数不明      1
結婚済み　車未所持　子育て状況不明     1
結婚済み/自動車所有/子供の数不明     1
結婚済み　自家用車あり　子供の数不明    1
結婚済み 車未所持 子の数不詳       1
結婚済み/車所持/不明           1
離婚済み、自動車未所有、子供の数不明    1
結婚済み、自動車所有、不明         1
結婚済み/自家用車なし/子供の数不明    1
結婚済み,自家用車あり,子供の数不明    1
離婚済み／車なし／子育て状況不明      1
結婚済み 自動車所有 子供の数不明     1
離婚済み、車保有、不明           1
独身 車所持 子供の数不明         1
結婚済み\t車所持\t子供の数不明     1
離婚済み、車未所持、不明          1
結婚済み/乗用車所持/子供の数不明     1
結婚済み 車所持 子供の数不明       1
結婚済み/車未所持／不明          1
結婚済み／自家用車あり／不明        1
独身、車所持、子供の数不明         1
結婚済み,車所持,子供の数不明       1
離婚済み 自動車所有 子供の数不明     1
結婚済み　自動車未所有　子供の数不明    1
結婚済み 自動車所有 不明         1
結婚済み、車あり、子供の数不明       1
結婚済み 車未所持　子供の数不明      1
結婚済み、自動車所有、子育て状況不明    1
独身、自動車未所有,子供の数不明      1
Name: count, dtype: int64

In [599]:
# All engineered feature columns, including the three derived from customer_info.
test_preprocesse = test_copied[[
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", 
    "PitchSatisfactionScore", "Designation_int", "MonthlyIncome_int", "customer_info_kekkon_int", "customer_info_car_int", "customer_info_child_int"
]]
test_preprocesse.head()

Unnamed: 0,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int,Passport,PitchSatisfactionScore,Designation_int,MonthlyIncome_int,customer_info_kekkon_int,customer_info_car_int,customer_info_child_int
0,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7,0,3,3,496950.0,1,1,0
1,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4,1,3,2,300000.0,1,0,0
2,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1,0,3,0,260000.0,2,0,0
3,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1,0,3,2,259875.0,2,1,0
4,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1,0,4,0,268830.0,0,1,0


In [600]:
# Final column selection: the row id followed by every engineered feature.
test_preprocesse = test_copied[[
    "id",
    'Age_int', 'TypeofContact_int', 'CityTier', 'Duration_int', 'Occupation_int', "Gender_int", "NumberOfPersonVisiting", 
    "NumberOfFollowups_normalized", "ProductPitched_int", "PreferredPropertyStar", "NumberOfTrips_int", "Passport", 
    "PitchSatisfactionScore", "Designation_int", "MonthlyIncome_int", "customer_info_kekkon_int", "customer_info_car_int", "customer_info_child_int",
]]
test_preprocesse.head()

Unnamed: 0,id,Age_int,TypeofContact_int,CityTier,Duration_int,Occupation_int,Gender_int,NumberOfPersonVisiting,NumberOfFollowups_normalized,ProductPitched_int,PreferredPropertyStar,NumberOfTrips_int,Passport,PitchSatisfactionScore,Designation_int,MonthlyIncome_int,customer_info_kekkon_int,customer_info_car_int,customer_info_child_int
0,3489,48,0.0,2,780,1,0,1.0,4.0,3,3.0,7,0,3,3,496950.0,1,1,0
1,3490,35,0.0,2,720,1,1,1.0,4.0,1,3.0,4,1,3,2,300000.0,1,0,0
2,3491,25,0.0,1,540,0,1,1.0,4.0,0,3.0,1,0,3,0,260000.0,2,0,0
3,3492,21,1.0,2,420,0,0,1.0,4.0,0,4.0,1,0,3,2,259875.0,2,1,0
4,3493,41,1.0,1,420,0,0,1.0,4.0,0,3.0,1,0,4,0,268830.0,0,1,0


In [601]:
# Persist the preprocessed test set; index=False keeps the DataFrame index out of the file.
test_preprocesse.to_csv("../output/test_preprocessed.csv", index=False)