# Data cleaning

元のデータに対し、以下の処理を行う
- 全角・半角の統一
- 同じ意味で別の書き方をしている処理の統一

In [53]:
import re
import unicodedata

import numpy as np
import pandas as pd

In [54]:
path_input = "../data/input/"
path_train = "../data/input/train.csv"
path_test = "../data/input/test.csv"

## Data read

In [55]:
train_df = pd.read_csv(path_train)
test_df = pd.read_csv(path_test)

In [56]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   Age                     3389 non-null   object 
 2   TypeofContact           3483 non-null   object 
 3   CityTier                3489 non-null   int64  
 4   DurationOfPitch         3368 non-null   object 
 5   Occupation              3489 non-null   object 
 6   Gender                  3489 non-null   object 
 7   NumberOfPersonVisiting  3489 non-null   float64
 8   NumberOfFollowups       3456 non-null   float64
 9   ProductPitched          3489 non-null   object 
 10  PreferredPropertyStar   3489 non-null   float64
 11  NumberOfTrips           3467 non-null   object 
 12  Passport                3489 non-null   int64  
 13  PitchSatisfactionScore  3489 non-null   int64  
 14  Designation             3489 non-null   

In [57]:
for feature in train_df.columns:
    print(feature)
    if feature == "id":
        continue
    print(train_df[feature].unique())

id
Age
['50歳' '56歳' nan '三十七歳' '48歳' '19歳' '47歳' '49歳' '33歳' '３１歳' '51歳' '46歳'
 '４９歳' '53才' '20歳' '25歳' '五十三歳' '20代' '44歳' '30歳' '34歳' '29歳' '30代' '41歳'
 '50代' '52歳' '22歳' '３６歳' '28歳' '25才' '五十一歳' '48才' '38才' '三十三歳' '42際' '５０代'
 '35才' '59歳' '四十三歳' '32歳' '４２才' '53歳' '58歳' '40代' '二十六歳' '52際' '54歳' '59才'
 '29才' '21才' '57歳' '37歳' '35歳' '36才' '38歳' '３７歳' '50才' '51才' '43歳' '四十歳'
 '26歳' '31歳' '27歳' '40歳' '３４歳' '40際' '21歳' '24才' '45歳' '36歳' '41才' '38際'
 '18歳' '39歳' '４６歳' '４０代' '５５歳' '58際' '24歳' '41際' '三十一歳' '52才' '３０代' '28才'
 '55際' '60代' '27際' '３２歳' '60歳' '五十四歳' '29際' '三十四歳' '４７歳' '37際' '二十四歳'
 '四十七歳' '58才' '39才' '23才' '３３歳' '４２歳' '４５歳' '３９歳' '５４歳' '55歳' '42歳' '３８歳'
 '47才' '５１歳' '３９才' '51際' '二十七歳' '30才' '２６歳' '４０歳' '36際' '三十歳' '23歳' '二十二歳'
 '37才' '３０歳' '２８歳' '34才' '五十歳' '四十一歳' '５９歳' '３５歳' '２２歳' '３７才' '四十九歳' '42才'
 '４４歳' '２５歳' '44際' '33才' '28際' '２４才' '55才' '４８歳' '四十四歳' '31才' '２０代' '二十歳'
 '22際' '31際' '二十三歳' '39際' '59際' '３３才' '三十六歳' '10代' '２７歳' '49才' '４３歳' '32際'
 '４１歳' '33際' '二十五歳' '27才' '46際' '二

## それぞれ修正

### Age

In [58]:
# 全角数字と漢数字を変換する関数
def normalize_age(age_str):
    if pd.isna(age_str):
        return age_str

    # 漢数字をアラビア数字に変換
    kanji_to_num = {
        "百": "100",
        "九十九": "99",
        "九十八": "98",
        "九十七": "97",
        "九十六": "96",
        "九十五": "95",
        "九十四": "94",
        "九十三": "93",
        "九十二": "92",
        "九十一": "91",
        "九十": "90",
        "八十九": "89",
        "八十八": "88",
        "八十七": "87",
        "八十六": "86",
        "八十五": "85",
        "八十四": "84",
        "八十三": "83",
        "八十二": "82",
        "八十一": "81",
        "八十": "80",
        "七十九": "79",
        "七十八": "78",
        "七十七": "77",
        "七十六": "76",
        "七十五": "75",
        "七十四": "74",
        "七十三": "73",
        "七十二": "72",
        "七十一": "71",
        "七十": "70",
        "六十九": "69",
        "六十八": "68",
        "六十七": "67",
        "六十六": "66",
        "六十五": "65",
        "六十四": "64",
        "六十三": "63",
        "六十二": "62",
        "六十一": "61",
        "六十": "60",
        "五十九": "59",
        "五十八": "58",
        "五十七": "57",
        "五十六": "56",
        "五十五": "55",
        "五十四": "54",
        "五十三": "53",
        "五十二": "52",
        "五十一": "51",
        "五十": "50",
        "四十九": "49",
        "四十八": "48",
        "四十七": "47",
        "四十六": "46",
        "四十五": "45",
        "四十四": "44",
        "四十三": "43",
        "四十二": "42",
        "四十一": "41",
        "四十": "40",
        "三十九": "39",
        "三十八": "38",
        "三十七": "37",
        "三十六": "36",
        "三十五": "35",
        "三十四": "34",
        "三十三": "33",
        "三十二": "32",
        "三十一": "31",
        "三十": "30",
        "二十九": "29",
        "二十八": "28",
        "二十七": "27",
        "二十六": "26",
        "二十五": "25",
        "二十四": "24",
        "二十三": "23",
        "二十二": "22",
        "二十一": "21",
        "二十": "20",
        "十九": "19",
        "十八": "18",
        "十七": "17",
        "十六": "16",
        "十五": "15",
        "十四": "14",
        "十三": "13",
        "十二": "12",
        "十一": "11",
        "十": "10",
        "九": "9",
        "八": "8",
        "七": "7",
        "六": "6",
        "五": "5",
        "四": "4",
        "三": "3",
        "二": "2",
        "一": "1",
    }

    for kanji, num in kanji_to_num.items():
        age_str = age_str.replace(kanji, num)

    # 全角を半角に変換
    age_str = age_str.translate(str.maketrans("０１２３４５６７８９", "0123456789"))

    # 「歳」と「才」を「歳」に統一
    age_str = age_str.replace("才", "歳")
    age_str = age_str.replace("際", "歳")
    # 「〜代」を中央値に変換、ただし10代と60代だけは例外として以下の変換
    if "代" in age_str:
        if age_str == "10代":
            return "19歳"
        elif age_str == "20代":
            return "25歳"
        elif age_str == "30代":
            return "35歳"
        elif age_str == "40代":
            return "45歳"
        elif age_str == "50代":
            return "55歳"
        elif age_str == "60代":
            return "61歳"
    return age_str

In [59]:
# 年齢の前の数字だけを取得する関数
def extract_number(age_str):
    if pd.isna(age_str):
        return age_str
    # 正規表現を使って数字を抽出
    match = re.search(r"\d+", age_str)
    return match.group(0) if match else age_str

In [60]:
# データを変換して抽出
normalized_data = [normalize_age(age) for age in train_df["Age"]]
extracted_numbers = [extract_number(age) for age in normalized_data]
train_df["Age"] = extracted_numbers

# Age 列を int に変換
train_df["Age"] = train_df["Age"].astype(float)
train_df["Age"] = train_df["Age"].fillna(np.nan)

print(train_df["Age"].unique())
print(train_df["Age"].value_counts())
print(train_df["Age"].isnull().sum())

[50. 56. nan 37. 48. 19. 47. 49. 33. 31. 51. 46. 53. 20. 25. 44. 30. 34.
 29. 35. 41. 55. 52. 22. 36. 28. 38. 42. 59. 43. 32. 58. 45. 26. 54. 21.
 57. 40. 27. 24. 18. 39. 61. 60. 23.]
35.0    232
45.0    193
36.0    134
55.0    128
33.0    127
37.0    118
34.0    112
31.0    109
32.0    108
38.0    107
39.0    105
25.0     99
42.0     96
40.0     95
41.0     94
43.0     91
30.0     87
51.0     83
52.0     83
50.0     76
46.0     75
29.0     74
28.0     74
53.0     71
47.0     69
48.0     68
49.0     68
26.0     67
44.0     62
54.0     61
27.0     56
56.0     49
22.0     43
24.0     38
23.0     38
57.0     36
21.0     34
58.0     33
20.0     28
59.0     25
60.0     16
19.0     12
61.0      9
18.0      6
Name: Age, dtype: int64
100


In [61]:
# データを変換して抽出
normalized_data = [normalize_age(age) for age in test_df["Age"]]
extracted_numbers = [extract_number(age) for age in normalized_data]
test_df["Age"] = extracted_numbers

# Age 列を int に変換
try:
    test_df["Age"] = test_df["Age"].astype(int)
except ValueError:
    test_df["Age"] = test_df["Age"].fillna(np.nan)

print(test_df["Age"].unique())
print(test_df["Age"].value_counts())
print(test_df["Age"].isnull().sum())

['48' '35' '25' '21' '41' '45' '44' '30' '31' '47' '43' '55' '32' '20'
 '28' '56' '39' '38' '36' nan '37' '34' '33' '46' '42' '27' '29' '59' '19'
 '24' '49' '51' '50' '23' '52' '58' '26' '57' '54' '60' '40' '22' '53'
 '18' '61']
35    237
45    161
33    136
32    126
55    124
36    123
34    122
37    120
31    108
30    107
42    102
39     99
38     99
25     94
40     93
41     92
50     89
28     86
29     84
43     79
47     75
51     73
52     71
46     71
54     70
49     68
44     67
27     66
53     60
24     58
48     57
26     56
57     50
56     48
22     38
23     36
58     30
59     28
21     24
20     19
19     16
60     14
18     11
61      9
Name: Age, dtype: int64
93


### DurationOfPitch

秒に統一

In [62]:
# 秒単位に変換する関数
def convert_to_seconds(time_str):
    if pd.isna(time_str):
        return time_str

    if "秒" in time_str:
        # '秒'を含む場合、そのまま数値に変換
        return int(re.sub(r"秒", "", time_str))

    elif "分" in time_str:
        # '分'を含む場合、分を秒に変換
        minutes = int(re.sub(r"分", "", time_str))
        return minutes * 60

In [63]:
convert_to_sconds_data = [
    convert_to_seconds(time) for time in train_df["DurationOfPitch"]
]
train_df["DurationOfPitch"] = convert_to_sconds_data

print(train_df["DurationOfPitch"].unique())

[ 900.  840.  600. 1080. 1020.  960. 1320. 1200.  660. 1920. 1440. 1380.
 2100.  420.  300.  480.   nan  540. 1560.  780.  720. 1260. 1500.  360.
 1860. 1680. 2160. 1620. 1980. 1740. 1800. 2040.  240. 1140.]


In [64]:
convert_to_sconds_data = [
    convert_to_seconds(time) for time in test_df["DurationOfPitch"]
]
test_df["DurationOfPitch"] = convert_to_sconds_data

print(test_df["DurationOfPitch"].unique())

[ 780.  720.  540.  420.   nan  480.  960. 1320. 1440. 1020.  660. 1380.
  600.  900. 1080.  840. 1140. 1860.  300. 1980.  360. 1680. 1620. 1200.
 1560. 1260. 1740. 2040. 1920. 1800. 2100. 1500.  240. 2160.]


### Gender

In [65]:
# 性別を変換する関数
def normalize_gender(gender):
    if pd.isna(gender):
        return gender

    # 全角文字を半角文字に変換
    gender = gender.translate(
        str.maketrans(
            "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ",
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
        )
    )

    # 空白を削除
    gender = re.sub(r"\s+", "", gender)

    # 小文字に変換してから、Male または Female に統一
    gender = gender.lower()
    if "female" in gender:
        return "Female"
    elif "male" in gender:
        return "Male"
    else:
        return gender  # 予期しない入力がある場合

In [66]:
train_df["Gender_replace"] = train_df["Gender"].apply(normalize_gender)
print(train_df["Gender_replace"].unique())
print(train_df.groupby(["Gender_replace", "Gender"]).size())
train_df["Gender"] = train_df["Gender_replace"]
train_df = train_df.drop(["Gender_replace"], axis=1)

['Male' 'Female']
Gender_replace  Gender 
Female          FE MALE      23
                FEMALE       99
                Fe Male     109
                Female      739
                fe male      29
                female      266
                ＦＥ　ＭＡＬＥ       1
                ＦＥＭＡＬＥ       13
                Ｆｅ　Ｍａｌｅ      15
                Ｆｅｍａｌｅ       89
                ｆｅ　ｍａｌｅ       6
                ｆｅｍａｌｅ       36
Male            MALE        183
                Male       1226
                male        468
                ＭＡＬＥ         18
                Ｍａｌｅ        126
                ｍａｌｅ         43
dtype: int64


In [67]:
test_df["Gender_replace"] = test_df["Gender"].apply(normalize_gender)
print(test_df["Gender_replace"].unique())
print(test_df.groupby(["Gender_replace", "Gender"]).size())
test_df["Gender"] = test_df["Gender_replace"]
test_df = test_df.drop(["Gender_replace"], axis=1)

['Male' 'Female']
Gender_replace  Gender 
Female          FE MALE      20
                FEMALE       98
                Fe Male     104
                Female      702
                fe male      28
                female      238
                ＦＥＭＡＬＥ        8
                Ｆｅ　Ｍａｌｅ      11
                Ｆｅｍａｌｅ       92
                ｆｅ　ｍａｌｅ       5
                ｆｅｍａｌｅ       29
Male            MALE        180
                Male       1299
                male        472
                ＭＡＬＥ         20
                Ｍａｌｅ        134
                ｍａｌｅ         49
dtype: int64


### ProductPitched

In [68]:
# 半角小文字に変換する関数
def normalize_to_lowercase(text):
    if pd.isna(text):
        return text

    # Unicode正規化 (NFKD)で分解してから再結合
    text = unicodedata.normalize("NFKD", text)

    # ASCII文字のみ残す
    text = "".join(c for c in text if unicodedata.category(c) != "Mn")

    # 小文字に変換
    return text.lower()

In [69]:
# 置換マッピング
replace_map = {
    "ᗞ": "d",
    "в": "b",
    "ꓢ": "s",
    "ѕ": "s",
    "տ": "s",
    "ꭰ": "d",
    "ς": "c",
    "α": "a",
    "×": "x",
    "β": "b",
    "ո": "n",
    "ı": "i",
    "c": "c",
    "𐊡": "b",
    "ε": "e",
    "ι": "i",
    "ϲ": "c",
    "ꓢ": "s",
    "ꭰ": "d",
    "і": "i",
    "ѕ": "s",
    "|": "l",
    "basiс": "basic",
}

In [70]:
# 文字を置換する関数
def replace_characters(text):
    for original, replacement in replace_map.items():
        text = text.replace(original, replacement)
    return text

In [71]:
train_df["ProductPitched"] = train_df["ProductPitched"].apply(normalize_to_lowercase)
train_df["ProductPitched"] = train_df["ProductPitched"].apply(replace_characters)
# 統一するためにさらに小文字化
train_df["ProductPitched"] = train_df["ProductPitched"].str.lower()
print(train_df["ProductPitched"].unique())

['basic' 'standard' 'super deluxe' 'deluxe' 'king']


In [72]:
test_df["ProductPitched"] = test_df["ProductPitched"].apply(normalize_to_lowercase)
test_df["ProductPitched"] = test_df["ProductPitched"].apply(replace_characters)
# 統一するためにさらに小文字化
test_df["ProductPitched"] = test_df["ProductPitched"].str.lower()
print(test_df["ProductPitched"].unique())

['super deluxe' 'standard' 'basic' 'deluxe' 'king']


## NumberOfFollowups

In [73]:
# 外れ値を補正
def replace_over_100(num):
    if num == 100.0:
        return 1.0
    elif num == 200.0:
        return 2.0
    elif num == 300.0:
        return 3.0
    elif num == 400.0:
        return 4.0
    elif num == 500:
        return 5.0
    elif num == 600:
        return 6.0
    else:
        return num

In [74]:
train_df["NumberOfFollowups"] = train_df["NumberOfFollowups"].apply(replace_over_100)
test_df["NumberOfFollowups"] = test_df["NumberOfFollowups"].apply(replace_over_100)

In [75]:
train_df.groupby("NumberOfFollowups").size()

NumberOfFollowups
1.0      78
2.0     108
3.0    1296
4.0    1411
5.0     524
6.0      39
dtype: int64

In [76]:
test_df.groupby("NumberOfFollowups").size()

NumberOfFollowups
1.0     100
2.0     105
3.0    1265
4.0    1366
5.0     598
6.0      31
dtype: int64

### NumberOfTrips

In [77]:
# テキストを数値に変換する関数
def convert_to_numeric(entry):
    if pd.isna(entry):
        return entry

    # 年にn回を数字に変換
    match = re.match(r"年に(\d+)回", entry)
    if match:
        return int(match.group(1))

    # 半年に1回は年に2回に相当
    if entry == "半年に1回":
        return 2

    # 四半期に1回は年に4回に相当
    if entry == "四半期に1回":
        return 4

    # それ以外の数字として扱える部分はそのまま返す
    if entry.isdigit():
        return int(entry)

    return entry

In [78]:
train_df["NumberOfTrips_num"] = train_df["NumberOfTrips"].apply(convert_to_numeric)
print(train_df.groupby(["NumberOfTrips", "NumberOfTrips_num"]).size())
train_df["NumberOfTrips"] = train_df["NumberOfTrips_num"]
train_df = train_df.drop(["NumberOfTrips_num"], axis=1)

NumberOfTrips  NumberOfTrips_num
1              1.0                   349
2              2.0                  1006
3              3.0                   699
4              4.0                   141
5              5.0                   436
6              6.0                   109
7              7.0                   219
8              8.0                     8
半年に1回          2.0                    17
四半期に1回         4.0                     3
年に1回           1.0                    60
年に2回           2.0                   141
年に3回           3.0                   121
年に4回           4.0                    19
年に5回           5.0                    79
年に6回           6.0                    21
年に7回           7.0                    37
年に8回           8.0                     2
dtype: int64


In [79]:
test_df["NumberOfTrips_num"] = test_df["NumberOfTrips"].apply(convert_to_numeric)
print(test_df.groupby(["NumberOfTrips", "NumberOfTrips_num"]).size())
test_df["NumberOfTrips"] = test_df["NumberOfTrips_num"]
test_df = test_df.drop(["NumberOfTrips_num"], axis=1)

NumberOfTrips  NumberOfTrips_num
1              1.0                  320
2              2.0                  950
3              3.0                  762
4              4.0                  136
5              5.0                  400
6              6.0                  125
7              7.0                  226
8              8.0                   12
半年に1回          2.0                   10
四半期に1回         4.0                    3
年に1回           1.0                   72
年に2回           2.0                  153
年に3回           3.0                  124
年に4回           4.0                   31
年に5回           5.0                   62
年に6回           6.0                   26
年に7回           7.0                   31
年に8回           8.0                    4
dtype: int64


### Designation

In [80]:
# 置換マッピングを作成
designation_replace_map = {
    "Μ": "M",
    "Α": "A",
    "А": "A",
    "ѵ": "v",
    "Ѕ": "S",
    "е": "e",
    "Е": "E",
    "ѕ": "s",
    "𝙧": "r",
    "α": "a",
    "×": "x",
    "Ⅴ": "V",
    "Ｐ": "P",
    "Տ": "S",
    "c": "c",
    "u": "u",
    "ｔ": "t",
    "v": "v",
    "μ": "m",
    "е": "e",
    "α": "a",
    "а": "a",
    "ѵ": "v",
    "ѕ": "s",
    "տ": "s",
}

In [81]:
# 文字を置換する関数
def designation_replace_characters(text):
    if pd.isna(text):
        return text
    text = unicodedata.normalize("NFKD", text)  # 正規化
    for original, replacement in designation_replace_map.items():
        text = text.replace(original, replacement)
    return text.lower()

In [82]:
train_df["Designation_replace"] = train_df["Designation"].apply(
    designation_replace_characters
)
print(train_df["Designation_replace"].unique())
print(train_df.groupby(["Designation_replace", "Designation"]).size())
train_df["Designation"] = train_df["Designation_replace"]
train_df = train_df.drop(["Designation_replace"], axis=1)

['executive' 'senior manager' 'avp' 'manager' 'vp']
Designation_replace  Designation   
avp                  AVP                299
                     AVＰ                  2
                     ΑVP                  3
                     АVP                  7
executive            Executive         1090
                     Executivе            7
                     Executiѵe            6
                     Executiѵе            2
                     Execuｔive            5
                     Execｕtive            1
                     Exеcutive           10
                     Exеcutivе            9
                     Exеcutiѵе            3
                     Exеcｕtive            2
                     Exеcｕtivе            1
                     E×ecutive            2
                     E×ecｕtive            1
                     E×еcutiѵe            1
                     Еxecutive            3
                     Еxecutivе            2
                     Еxecuｔive  

In [83]:
test_df["Designation_replace"] = test_df["Designation"].apply(
    designation_replace_characters
)
print(test_df["Designation_replace"].unique())
print(test_df.groupby(["Designation_replace", "Designation"]).size())
test_df["Designation"] = test_df["Designation_replace"]
test_df = test_df.drop(["Designation_replace"], axis=1)

['avp' 'senior manager' 'executive' 'manager' 'vp']
Designation_replace  Designation   
avp                  AVP                305
                     AVＰ                  2
                     ΑVP                  8
                     ΑVＰ                  1
                     АVP                  7
                     АVＰ                  1
executive            Executive         1129
                     Executivе            9
                     Executiѵe            7
                     Execuｔive            1
                     Execｕtive            3
                     Execｕtivе            1
                     Execｕｔive            1
                     Exеcutive            9
                     Exеcutivе           10
                     Exеcuｔive            1
                     Exеcｕtivе            1
                     E×ecutive            2
                     Еxecutive            5
                     Еxecｕtive            1
manager              Manager    

### MonthlyIncome

In [84]:
# 数値に変換する関数
def convert_to_num_income(entry):
    if pd.isna(entry):
        return entry

    # 「月収XX.XX万円」の形式を認識し、年収に変換
    match = re.match(r"月収(\d+\.?\d*)万円", str(entry))
    if match:
        monthly_income = float(match.group(1)) * 10000  # 万円を円に変換
        return float(monthly_income)

    # それ以外はそのまま返す（すでに数値のもの）
    return float(entry)

In [85]:
train_df["MonthlyIncome"] = train_df["MonthlyIncome"].apply(convert_to_num_income)

In [86]:
test_df["MonthlyIncome"] = test_df["MonthlyIncome"].apply(convert_to_num_income)

### customer_info

#### split

In [87]:
train_df["customer_info"].unique()

array(['未婚 車未所持 子供なし', '離婚済み,車あり,子供無し', '結婚済み、自動車未所有,子供なし', ...,
       '結婚済み,車保有なし,子供有り 1人', '結婚済み/自家用車なし/1児', '独身／車所持／こども1人'],
      dtype=object)

In [88]:
def split_customer_info(info):
    info_list = re.split("[ ,、/／\n\t\u3000]", info)
    return info_list[0], info_list[1], info_list[2]

In [89]:
train_df[["customer_marriage", "customer_car", "customer_child"]] = [
    split_customer_info(info) for info in train_df["customer_info"]
]
train_df = train_df.drop(["customer_info"], axis=1)

In [90]:
test_df[["customer_marriage", "customer_car", "customer_child"]] = [
    split_customer_info(info) for info in test_df["customer_info"]
]
test_df = test_df.drop(["customer_info"], axis=1)

In [91]:
train_df.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,ProdTaken,customer_marriage,customer_car,customer_child
0,0,50.0,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,basic,3.0,5.0,1,4,executive,253905.0,1,未婚,車未所持,子供なし
1,1,56.0,Company Invited,1,840.0,Salaried,Male,1.0,4.0,standard,3.0,2.0,1,4,senior manager,404475.0,0,離婚済み,車あり,子供無し
2,2,,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,basic,3.0,4.0,0,4,executive,278145.0,1,結婚済み,自動車未所有,子供なし
3,3,37.0,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,standard,4.0,1.0,0,5,senior manager,326805.0,0,離婚済み,車所持,子供無し
4,4,48.0,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,basic,4.0,4.0,0,4,executive,258435.0,1,独身,車所持,無子


#### customer_marriage

In [92]:
train_df["customer_marriage"].unique()

array(['未婚', '離婚済み', '結婚済み', '独身'], dtype=object)

In [93]:
test_df["customer_marriage"].unique()

array(['結婚済み', '離婚済み', '独身', '未婚'], dtype=object)

#### customer_car

In [94]:
train_df["customer_car"].unique()

array(['車未所持', '車あり', '自動車未所有', '車所持', '自家用車あり', '車保有', '車保有なし', '乗用車所持',
       '乗用車なし', '自動車所有', '自家用車なし', '車なし'], dtype=object)

In [95]:
# 置換マッピングを作成
car_replace_map = {
    "車未所持": "自動車未所有",
    "車保有なし": "自動車未所有",
    "乗用車なし": "自動車未所有",
    "自家用車なし": "自動車未所有",
    "車なし": "自動車未所有",
    "車あり": "自動車所有",
    "車所持": "自動車所有",
    "自家用車あり": "自動車所有",
    "車保有": "自動車所有",
    "乗用車所持": "自動車所有",
}

In [96]:
# 自動車保有有無を置換する関数
def customer_car_replace_characters(car_hold):
    return car_replace_map.get(car_hold, car_hold)

In [97]:
train_df["customer_car_replace"] = train_df["customer_car"].apply(
    customer_car_replace_characters
)
print(train_df["customer_car_replace"].unique())
print(train_df.groupby(["customer_car_replace", "customer_car"]).size())
train_df["customer_car"] = train_df["customer_car_replace"]
train_df = train_df.drop(["customer_car_replace"], axis=1)

['自動車未所有' '自動車所有']
customer_car_replace  customer_car
自動車所有                 乗用車所持           107
                      自動車所有           392
                      自家用車あり          188
                      車あり              94
                      車保有             104
                      車所持             819
自動車未所有                乗用車なし           109
                      自動車未所有          437
                      自家用車なし          207
                      車なし             104
                      車保有なし            98
                      車未所持            830
dtype: int64


In [98]:
test_df["customer_car_replace"] = test_df["customer_car"].apply(
    customer_car_replace_characters
)
print(test_df["customer_car_replace"].unique())
print(test_df.groupby(["customer_car_replace", "customer_car"]).size())
test_df["customer_car"] = test_df["customer_car_replace"]
test_df = test_df.drop(["customer_car_replace"], axis=1)

['自動車所有' '自動車未所有']
customer_car_replace  customer_car
自動車所有                 乗用車所持           107
                      自動車所有           397
                      自家用車あり          217
                      車あり              84
                      車保有              96
                      車所持             754
自動車未所有                乗用車なし           130
                      自動車未所有          450
                      自家用車なし          205
                      車なし             101
                      車保有なし           104
                      車未所持            844
dtype: int64


#### customer_child

In [99]:
train_df["customer_child"].unique()

array(['子供なし', '子供無し', '無子', '子供ゼロ', '非育児家庭', '子育て状況不明', '子の数不詳',
       '子供の数不明', 'こども1人', '1児', '子供1人', '子供有り(1人)', 'わからない', '子供有り',
       'こども2人', '子供2人', '子供有り(2人)', 'こども3人', '子供3人', '不明', '2児', '3児',
       '子供有り(3人)'], dtype=object)

In [100]:
child_replace_map = {
    "子供なし": "子供なし",
    "子供無し": "子供なし",
    "無子": "子供なし",
    "子供ゼロ": "子供なし",
    "非育児家庭": "子供なし",
    "子育て状況不明": "不明",
    "子の数不詳": "不明",
    "子供の数不明": "不明",
    "わからない": "不明",
    "不明": "不明",
    "こども1人": "子供1人",
    "1児": "子供1人",
    "子供1人": "子供1人",
    "子供有り(1人)": "子供1人",
    "子供有り": "不明",  # 子供有りだけでは人数が不明
    "こども2人": "子供2人",
    "2児": "子供2人",
    "子供2人": "子供2人",
    "子供有り(2人)": "子供2人",
    "こども3人": "子供3人",
    "3児": "子供3人",
    "子供3人": "子供3人",
    "子供有り(3人)": "子供3人",
}


In [101]:
# 子供の有無・数を置換する関数
def replace_child_status(child_status):
    return child_replace_map.get(child_status, child_status)

In [102]:
train_df["customer_child_replace"] = train_df["customer_child"].apply(
    replace_child_status
)
print(train_df["customer_child_replace"].unique())
print(train_df.groupby(["customer_child_replace", "customer_child"]).size())
train_df["customer_child"] = train_df["customer_child_replace"]
train_df = train_df.drop(["customer_child_replace"], axis=1)

['子供なし' '不明' '子供1人' '子供2人' '子供3人']
customer_child_replace  customer_child
不明                      わからない               5
                        不明                 10
                        子の数不詳               4
                        子供の数不明             13
                        子供有り              160
                        子育て状況不明             5
子供1人                    1児                113
                        こども1人             410
                        子供1人              859
                        子供有り(1人)           94
子供2人                    2児                 51
                        こども2人             229
                        子供2人              434
                        子供有り(2人)           49
子供3人                    3児                 10
                        こども3人              20
                        子供3人               35
                        子供有り(3人)            3
子供なし                    子供なし              539
                        子供ゼロ               75
      

In [103]:
test_df["customer_child_replace"] = test_df["customer_child"].apply(
    replace_child_status
)
print(test_df["customer_child_replace"].unique())
print(test_df.groupby(["customer_child_replace", "customer_child"]).size())
test_df["customer_child"] = test_df["customer_child_replace"]
test_df = test_df.drop(["customer_child_replace"], axis=1)

['子供なし' '不明' '子供1人' '子供2人' '子供3人']
customer_child_replace  customer_child
不明                      不明                  7
                        子の数不詳               1
                        子供の数不明             26
                        子供有り              161
                        子育て状況不明             3
子供1人                    1児                107
                        こども1人             422
                        子供1人              840
                        子供有り(1人)          100
子供2人                    2児                 57
                        こども2人             227
                        子供2人              487
                        子供有り(2人)           66
子供3人                    3児                  4
                        こども3人              18
                        子供3人               35
                        子供有り(3人)            4
子供なし                    子供なし              499
                        子供ゼロ               50
                        子供無し              255
      

## 出力

In [104]:
train_df.to_csv(path_input + "cleaned_train.csv", header=True, index=None)
test_df.to_csv(path_input + "cleaned_test.csv", header=True, index=None)