In [15]:
# ライブラリのインポート
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

# 表示される最大行数を拡大する
pd.set_option('display.max_rows', None)
# カラムを無制限に表示する設定
pd.set_option('display.max_columns', None)

In [16]:
# データのインポート
df_train = pd.read_csv('./data/processed_train.csv')
df_test = pd.read_csv('./data/processed_test.csv')

In [17]:
df_train.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0.0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0.0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0.0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0.0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0.0,1.0


In [18]:
df_train['ProdTaken'] = df_train['ProdTaken'].astype(int)

In [19]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      3489 non-null   int64  
 1   Age                     3489 non-null   int64  
 2   TypeofContact           3483 non-null   object 
 3   CityTier                3489 non-null   int64  
 4   DurationOfPitch         3368 non-null   float64
 5   Occupation              3489 non-null   object 
 6   Gender                  3489 non-null   object 
 7   NumberOfPersonVisiting  3489 non-null   float64
 8   NumberOfFollowups       3456 non-null   float64
 9   ProductPitched          3489 non-null   object 
 10  PreferredPropertyStar   3489 non-null   float64
 11  NumberOfTrips           3467 non-null   object 
 12  Passport                3489 non-null   int64  
 13  PitchSatisfactionScore  3489 non-null   int64  
 14  Designation             3489 non-null   

In [20]:
# 列数・行数の確認
df_train.shape

(3489, 20)

In [21]:
df_test.shape

(3489, 19)

In [22]:
# 欠損値の確認
df_train.isnull().sum()

id                          0
Age                         0
TypeofContact               6
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          33
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              22
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
marital_status              0
car_ownership               0
children                   37
ProdTaken                   0
dtype: int64

In [23]:
# 型のチェック
df_train.dtypes

id                          int64
Age                         int64
TypeofContact              object
CityTier                    int64
DurationOfPitch           float64
Occupation                 object
Gender                     object
NumberOfPersonVisiting    float64
NumberOfFollowups         float64
ProductPitched             object
PreferredPropertyStar     float64
NumberOfTrips              object
Passport                    int64
PitchSatisfactionScore      int64
Designation                object
MonthlyIncome             float64
marital_status             object
car_ownership              object
children                  float64
ProdTaken                   int32
dtype: object

In [24]:
# Ageの値を確認
df_train['Age'].value_counts()

Age
30    215
40    211
50    143
36    134
33    127
37    118
34    112
31    109
32    108
38    107
39    105
35    104
0     100
42     96
41     94
43     91
51     83
52     83
20     82
45     77
46     75
28     74
29     74
53     71
47     69
48     68
49     68
26     67
44     62
55     61
54     61
27     56
56     49
25     45
22     43
23     38
24     38
57     36
21     34
58     33
59     25
60     20
19     10
18      6
61      5
10      2
Name: count, dtype: int64

In [25]:
# データの結合
data = pd.concat([df_train, df_test], axis=0)
data.shape

(6978, 20)

In [26]:
# 漢数字とそれに対応する数字を定義
kanji_to_num = {
    '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, 
    '七': 7, '八': 8, '九': 9, '十': 10, '百': 100, '千': 1000
}

def kanji_to_number(text):
    num = 0
    unit = 1
    for char in reversed(text):
        if char in kanji_to_num:
            digit = kanji_to_num[char]
            if digit >= 10:
                unit = digit
            else:
                num += digit * unit
        else:
            unit = 1
    return num

def preprocess_age(age):
    if pd.isna(age):  # 欠損値（NaN）を処理
        return None
    
    if isinstance(age, (int, float)):  # 数値が入力された場合そのまま返す
        return int(age)

    # 全角数字を半角数字に変換
    age = re.sub(r'[０-９]', lambda x: str(ord(x.group()) - ord('０')), age)
    
    # 漢数字が含まれる場合は変換
    if any(char in kanji_to_num for char in age):
        age = re.sub(r'[零一二三四五六七八九十百千]+', lambda x: str(kanji_to_number(x.group())), age)

    # 数字のみを抽出
    age = re.sub(r'\D', '', age)
    
    return int(age) if age.isdigit() else None

def preprocess_dataframe(df, column_name):
    # 指定された列に対して前処理を適用
    df[column_name] = df[column_name].apply(preprocess_age)
    return df

In [27]:
# 前処理を適用
data = preprocess_dataframe(data, 'Age')
# 確認
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0.0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0.0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0.0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0.0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0.0,1.0


In [28]:
df_train['Age'].isnull().sum()

0

In [29]:
# NaNを0に置き換えてからint型に変換
data['Age'] = data['Age'].fillna(0).round().astype(int)

In [30]:
data['DurationOfPitch'].value_counts()

DurationOfPitch
480.0     671
540.0     654
900.0     569
960.0     527
840.0     518
600.0     467
420.0     446
780.0     397
1020.0    312
660.0     306
720.0     293
360.0     167
1080.0    133
1380.0    104
1920.0     93
1860.0     86
1440.0     86
1320.0     84
1500.0     79
2040.0     75
1980.0     69
1200.0     66
1260.0     65
1800.0     65
1560.0     64
2100.0     58
1620.0     57
1680.0     56
1140.0     50
1740.0     48
300.0      43
2160.0     15
240.0       3
Name: count, dtype: int64

In [31]:
# 秒に変換する関数
def time_to_seconds(time_str):
    # 欠損値（NaN）を処理
    if pd.isna(time_str):  
        return None
    
    # 数値が入力された場合、そのまま返すか適宜変換
    if isinstance(time_str, (int, float)):  
        return time_str
    
    # 数値部分を抽出
    number = re.findall(r'\d+', time_str)
    if not number:
        return None
    
    number = int(number[0])
    
    # 時間単位を確認して変換
    if '秒' in time_str:
        return number
    elif '分' in time_str:
        return number * 60
    else:
        return None

In [32]:
# 前処理を適用
data['DurationOfPitch'] = data['DurationOfPitch'].apply(time_to_seconds)
# 確認
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0.0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0.0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0.0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0.0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0.0,1.0


In [33]:
df_train['customer_info'].value_counts()

KeyError: 'customer_info'

In [None]:
# 削除したい文字のリスト
remove_chars = ['、', '／', '/', ',', '\t', '\r\n', '\n']

# 各要素に対して処理を行う
for i in range(len(data)):
    info = data.iloc[i]['customer_info']
    
    # 文字を削除する
    for char in remove_chars:
        if char in info:
            info = info.replace(char, ' ')
    
    # 更新
    data.iloc[i, data.columns.get_loc('customer_info')] = info

# 結果の確認
data['customer_info'].value_counts()

customer_info
結婚済み 車未所持 子供1人          185
結婚済み 車所持 子供1人           154
未婚 車未所持 子供1人             96
結婚済み 自動車未所有 子供1人         94
結婚済み 車所持 子供2人            91
結婚済み 車未所持 子供2人           86
結婚済み 車所持 子供なし            85
離婚済み 車所持 子供1人            85
結婚済み 車所持 こども1人           82
結婚済み 車未所持 こども1人          81
離婚済み 車未所持 子供1人           79
結婚済み 自動車所有 子供1人          71
離婚済み 車所持 子供なし            68
独身 車所持 子供なし              64
離婚済み 車未所持 子供なし           59
結婚済み 車未所持 子供なし           57
独身 車未所持 子供なし             56
離婚済み 車所持 子供2人            55
独身 車未所持 子供1人             54
未婚 自動車未所有 子供1人           51
独身 車所持 子供1人              50
未婚 車未所持 こども1人            49
結婚済み 車未所持 こども2人          48
離婚済み 自動車所有 子供なし          46
結婚済み 自動車未所有 子供2人         46
独身 自動車未所有 子供1人           46
独身 自動車未所有 子供なし           45
結婚済み 自動車未所有 こども1人        44
未婚 車所持 子供1人              44
結婚済み 自動車所有 子供2人          42
独身 車未所持 こども1人            42
未婚 車未所持 子供2人             41
離婚済み 自動車未所有 子供1人         41
結婚済み 自動車所有 こども1人         40
結婚済み 自家用車なし 子供1人         40
結婚済み 自

In [None]:
split_data = data['customer_info'].str.split(expand=True)
data[['marital_status', 'car_ownership', 'children']] = split_data[[0, 1, 2]]

In [None]:
data['marital_status'].value_counts()

marital_status
結婚済み    2846
離婚済み    1688
独身      1309
未婚      1135
Name: count, dtype: int64

In [None]:
data['car_ownership'].value_counts()

car_ownership
車未所持      1674
車所持       1573
自動車未所有     887
自動車所有      789
自家用車なし     412
自家用車あり     405
乗用車なし      239
乗用車所持      214
車なし        205
車保有なし      202
車保有        200
車あり        178
Name: count, dtype: int64

In [None]:
# マッピング辞書の定義
car_ownership_mapping = {
    '車所持': '車所持',
    '自動車未所有': '車未所持',
    '自動車所有': '車所持',
    '自家用車なし': '車未所持',
    '自家用車あり': '車所持',
    '乗用車なし': '車未所持',
    '乗用車所持': '車所持',
    '車なし': '車未所持',
    '車保有なし': '車未所持',
    '車保有': '車所持',
    '車あり': '車所持'
}

# データの変換
data['car_ownership'] = data['car_ownership'].replace(car_ownership_mapping)

# 結果の確認
print(data['car_ownership'].value_counts())

car_ownership
車未所持    3619
車所持     3359
Name: count, dtype: int64


In [None]:
data['children'].value_counts()

children
子供1人        1699
子供なし        1038
子供2人         921
こども1人        832
子供無し         507
こども2人        456
子供有り         321
1児           220
子供有り(1人)     194
子供ゼロ         125
無子           121
非育児家庭        118
子供有り(2人)     115
2児           108
子供3人          70
子供の数不明        39
こども3人         38
不明            17
3児            14
子育て状況不明        8
子供有り(3人)       7
子の数不詳          5
わからない          5
Name: count, dtype: int64

In [None]:
# マッピング辞書の定義
children_mapping = {
    '子供1人': '1',
    'こども1人': '1',
    '子供無し': '0',
    '子供なし': '0',
    '子供ゼロ': '0',
    '無子': '0',
    '非育児家庭': '0',
    '子供有り': '1',
    '子供有り(1人)': '1',
    '1児': '1',
    '子供有り(2人)': '2',
    '2児': '2',
    '子供2人': '2',
    'こども2人': '2',
    '子供3人': '3',
    'こども3人': '3',
    '3児': '3',
    '子供有り(3人)': '3',
    '子育て状況不明': np.nan,
    '子供の数不明': np.nan,
    '子の数不詳': np.nan,
    'わからない': np.nan,
    '不明': np.nan
}

# データの変換
data['children'] = data['children'].replace(children_mapping)

# 結果の確認
print(data['children'].value_counts(dropna=False))

children
1      3266
0      1909
2      1600
3       129
NaN      74
Name: count, dtype: int64


In [None]:
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,customer_info,ProdTaken,marital_status,car_ownership,children
0,0,50,Self Enquiry,2,900.0,Large Business,male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚 車未所持 子供なし,1.0,未婚,車未所持,0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み 車あり 子供無し,0.0,離婚済み,車所持,0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み 自動車未所有 子供なし,1.0,結婚済み,車未所持,0
3,3,37,Self Enquiry,2,1080.0,Small Business,female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み 車所持 子供無し,0.0,離婚済み,車所持,0
4,4,48,Company Invited,3,1020.0,Small Business,female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身 車所持 無子,1.0,独身,車所持,0


In [None]:
# `ProdTaken`カラムをポップして取得
prod_taken = data.pop('ProdTaken')

# `ProdTaken`カラムを最後に挿入
data.insert(len(data.columns), 'ProdTaken', prod_taken)

# 結果の確認
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,customer_info,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚 車未所持 子供なし,未婚,車未所持,0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み 車あり 子供無し,離婚済み,車所持,0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み 自動車未所有 子供なし,結婚済み,車未所持,0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み 車所持 子供無し,離婚済み,車所持,0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身 車所持 無子,独身,車所持,0,1.0


In [None]:
# 'Age'カラムを削除
data = data.drop('customer_info', axis=1)
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0,1.0


In [None]:
data['Gender'].value_counts()

Gender
Male       2525
Female     1441
male        940
female      504
MALE        363
Ｍａｌｅ        260
Fe Male     213
FEMALE      197
Ｆｅｍａｌｅ      181
ｍａｌｅ         92
ｆｅｍａｌｅ       65
fe male      57
FE MALE      43
ＭＡＬＥ         38
Ｆｅ　Ｍａｌｅ      26
ＦＥＭＡＬＥ       21
ｆｅ　ｍａｌｅ      11
ＦＥ　ＭＡＬＥ       1
Name: count, dtype: int64

In [None]:
# 性別のマッピング辞書
gender_map = {
    'Male': 'Male', 'Female': 'Female', 'male': 'Male', 'female': 'Female', 'MALE': 'Male', 'Ｍａｌｅ': 'Male',
    'Fe Male': 'Female', 'FEMALE': 'Female', 'Ｆｅｍａｌｅ': 'Female',
    'ｍａｌｅ': 'Male', 'ｆｅｍａｌｅ': 'Female', 'fe male': 'Female',
    'FE MALE': 'Female', 'ＭＡＬＥ': 'Male', 'Ｆｅ　Ｍａｌｅ': 'Female',
    'ＦＥＭＡＬＥ': 'Female', 'ｆｅ　ｍａｌｅ': 'Female', 'ＦＥ　ＭＡＬＥ': 'Female'
}

# 性別を統一
data['Gender'] = data['Gender'].map(gender_map)

# 結果の確認
print(data['Gender'].value_counts())

Gender
Male      4218
Female    2760
Name: count, dtype: int64


In [None]:
data['MonthlyIncome'].value_counts()

MonthlyIncome
月収30.0万円    388
月収40.0万円    253
月収50.0万円     81
月収26.0万円     56
月収32.0万円     56
月収31.0万円     49
月収35.0万円     44
月収27.0万円     41
月収34.0万円     36
月収37.0万円     31
月収33.0万円     31
月収36.0万円     28
月収25.0万円     22
月収39.0万円     20
月収38.0万円     19
月収45.0万円     16
月収42.0万円     14
月収46.0万円     12
月収28.0万円     11
月収60.0万円     10
月収44.0万円     10
月収47.0万円     10
月収41.0万円     10
月収52.0万円      8
月収48.0万円      7
月収51.0万円      7
月収20.0万円      5
316605.0      5
月収56.0万円      5
月収53.0万円      5
月収54.0万円      5
259500.0      5
月収49.0万円      5
323190.0      4
333285.0      4
312780.0      4
314820.0      4
月収43.0万円      4
260475.0      4
317055.0      4
272220.0      4
345825.0      4
360930.0      4
252270.0      4
375840.0      4
316005.0      4
257550.0      4
268770.0      4
312285.0      4
269355.0      4
316380.0      4
366045.0      4
337860.0      4
342900.0      4
311505.0      4
315465.0      3
313485.0      3
260445.0      3
327330.0      3
320460.0      3
357060.0      3
307920.0  

In [None]:
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0,1.0


In [None]:
# 文字列型に変換
data['MonthlyIncome'] = data['MonthlyIncome'].astype(str)

# 「月収」と「万円」を取り除き、数値に変換する関数
def process_income(income_str):
    # 元の文字列を保持する
    original_str = income_str

    # 「月収」を取り除く
    if '月収' in income_str:
        income_str = income_str.replace('月収', '')
    
    # 「万円」を取り除く
    if '万円' in income_str:
        income_str = income_str.replace('万円', '')
    
    # 数値に変換
    try:
        numeric_value = float(income_str)
        # 「月収」または「万円」を取り除いた場合、10000を掛ける
        if '月収' in original_str or '万円' in original_str:
            return numeric_value * 10000
        else:
            return numeric_value
    except ValueError:
        # 変換できない場合はNaNを返す
        return None

# 月収カラムを変換
data['MonthlyIncome'] = data['MonthlyIncome'].apply(process_income)

# 結果の確認
data.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0,1.0


In [None]:
data['MonthlyIncome'].value_counts()

MonthlyIncome
300000.0    388
400000.0    253
500000.0     81
320000.0     56
260000.0     56
310000.0     49
350000.0     44
270000.0     42
340000.0     36
330000.0     31
370000.0     31
360000.0     28
250000.0     22
390000.0     20
380000.0     19
450000.0     16
420000.0     15
460000.0     12
280000.0     11
410000.0     10
440000.0     10
600000.0     10
470000.0     10
520000.0      8
480000.0      7
510000.0      7
316605.0      5
560000.0      5
200000.0      5
540000.0      5
490000.0      5
530000.0      5
259500.0      5
430000.0      4
311505.0      4
323190.0      4
312780.0      4
257550.0      4
272220.0      4
360930.0      4
260475.0      4
316005.0      4
342900.0      4
269355.0      4
314820.0      4
366045.0      4
268770.0      4
345825.0      4
252270.0      4
317055.0      4
375840.0      4
312285.0      4
337860.0      4
316380.0      4
333285.0      4
313815.0      3
259260.0      3
363810.0      3
362850.0      3
314655.0      3
351375.0      3
318060.0  

In [None]:
# データを2つに分けるインデックス
split_index = len(data) // 2

# データフレームを2つに分割
df_train = data.iloc[:split_index]
df_test = data.iloc[split_index:]

df_train.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,0,50,Self Enquiry,2,900.0,Large Business,Male,1.0,4.0,Basic,3.0,5,1,4,Executive,253905.0,未婚,車未所持,0,1.0
1,1,56,Company Invited,1,840.0,Salaried,Male,1.0,4.0,Standard,3.0,2,1,4,Senior Manager,404475.0,離婚済み,車所持,0,0.0
2,2,0,Self Enquiry,1,600.0,Large Business,Female,1.0,3.0,Basic,3.0,4,0,4,Executive,278145.0,結婚済み,車未所持,0,1.0
3,3,37,Self Enquiry,2,1080.0,Small Business,Female,1.0,3.0,Standard,4.0,1,0,5,Senior Manager,326805.0,離婚済み,車所持,0,0.0
4,4,48,Company Invited,3,1020.0,Small Business,Female,1.0,3.0,Basic,4.0,4,0,4,Executive,258435.0,独身,車所持,0,1.0


In [None]:
df_train['ProdTaken'] = df_train['ProdTaken'].astype(int)
df_train['ProdTaken'].astype

NameError: name 'df_train' is not defined

In [None]:
df_test.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children,ProdTaken
0,3489,48,Self Enquiry,2,780.0,Small Business,Male,1.0,4.0,Super De|uxe,3.0,7,0,3,AVP,496950.0,結婚済み,車所持,0,
1,3490,30,Self Enquiry,2,720.0,Small Business,Female,1.0,4.0,Standard,3.0,4,1,3,Senior Manager,300000.0,結婚済み,車未所持,0,
2,3491,25,Self Enquiry,1,540.0,Salaried,Female,1.0,4.0,Basic,3.0,1,0,3,Executive,260000.0,離婚済み,車未所持,0,
3,3492,21,Company Invited,2,420.0,Salaried,Male,1.0,4.0,Basic,4.0,1,0,3,Senior Manager,259875.0,離婚済み,車所持,0,
4,3493,41,Company Invited,1,420.0,Salaried,Male,1.0,4.0,Basic,3.0,1,0,4,Executive,268830.0,独身,車所持,0,


In [None]:
df_test['MonthlyIncome'].value_counts()

MonthlyIncome
300000.0    203
400000.0    127
500000.0     45
260000.0     38
320000.0     35
310000.0     27
270000.0     22
350000.0     21
340000.0     17
370000.0     17
250000.0     12
390000.0     12
380000.0     11
330000.0     10
360000.0      8
600000.0      8
460000.0      8
450000.0      8
420000.0      8
280000.0      6
480000.0      5
410000.0      5
510000.0      5
200000.0      4
520000.0      4
470000.0      4
312780.0      4
440000.0      4
373380.0      3
337860.0      3
530000.0      3
317385.0      3
304515.0      3
560000.0      3
314820.0      3
360930.0      3
315030.0      3
317835.0      3
308205.0      3
346995.0      3
316380.0      3
260070.0      3
320460.0      3
327855.0      3
323190.0      3
316605.0      3
337170.0      3
324120.0      2
364830.0      2
348210.0      2
318060.0      2
422490.0      2
335415.0      2
264870.0      2
379605.0      2
332505.0      2
288390.0      2
361515.0      2
389565.0      2
341070.0      2
319575.0      2
262740.0  

In [None]:
# 'Age'カラムを削除
df_test = df_test.drop('ProdTaken', axis=1)
df_test.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,Designation,MonthlyIncome,marital_status,car_ownership,children
0,3489,48,Self Enquiry,2,780.0,Small Business,Male,1.0,4.0,Super De|uxe,3.0,7,0,3,AVP,496950.0,結婚済み,車所持,0
1,3490,30,Self Enquiry,2,720.0,Small Business,Female,1.0,4.0,Standard,3.0,4,1,3,Senior Manager,300000.0,結婚済み,車未所持,0
2,3491,25,Self Enquiry,1,540.0,Salaried,Female,1.0,4.0,Basic,3.0,1,0,3,Executive,260000.0,離婚済み,車未所持,0
3,3492,21,Company Invited,2,420.0,Salaried,Male,1.0,4.0,Basic,4.0,1,0,3,Senior Manager,259875.0,離婚済み,車所持,0
4,3493,41,Company Invited,1,420.0,Salaried,Male,1.0,4.0,Basic,3.0,1,0,4,Executive,268830.0,独身,車所持,0


In [None]:
# データフレームをCSVファイルに出力
df_train.to_csv('./data/processed_train1.csv', index=False, encoding='utf-8')
df_test.to_csv('./data/processed_test1.csv', index=False, encoding='utf-8')