In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# 前処理済みのデータの読み込み
df_train = pd.read_csv('./data/processed_train1.csv')
df_test = pd.read_csv('./data/processed_test1.csv')

In [3]:
df_train['NumberOfTrips'].value_counts()

NumberOfTrips
2         1006
3          699
5          436
1          349
7          219
年に2回       141
4          141
年に3回       121
6          109
年に5回        79
年に1回        60
年に7回        37
年に6回        21
年に4回        19
半年に1回       17
8            8
四半期に1回       3
年に8回         2
Name: count, dtype: int64

In [4]:
# Function to normalize the entries
def normalize_trips(trip):
    # Convert all values to string first
    trip = str(trip)
    # Convert phrases like "年にX回" to X
    if '年に' in trip and '回' in trip:
        return re.sub(r'年に(\d+)回', r'\1', trip)
    # Convert "半年に1回" to 0.5 (since it means 2 times a year)
    elif trip == '半年に1回':
        return '0.5'
    # Convert "四半期に1回" to 4 (since it means 4 times a year)
    elif trip == '四半期に1回':
        return '4'
    else:
        return trip

# Apply the normalization function
df_train['NumberOfTrips'] = df_train['NumberOfTrips'].apply(normalize_trips)

# Convert to float for consistency and handle errors
def safe_convert_to_float(value):
    try:
        return float(value)
    except ValueError:
        return None  # or handle the error as needed

df_train['NumberOfTrips'] = df_train['NumberOfTrips'].apply(safe_convert_to_float)

# Display the standardized data
df_train['NumberOfTrips'].value_counts()

NumberOfTrips
2.0    1147
3.0     820
5.0     515
1.0     409
7.0     256
4.0     163
6.0     130
8.0      10
Name: count, dtype: int64

In [9]:
df_test['NumberOfTrips'] = df_test['NumberOfTrips'].apply(normalize_trips)
df_test['NumberOfTrips'] = df_test['NumberOfTrips'].apply(safe_convert_to_float)

In [10]:
df_test['NumberOfTrips'].value_counts()

NumberOfTrips
2.0    1103
3.0     886
5.0     462
1.0     392
7.0     257
4.0     170
6.0     151
8.0      16
Name: count, dtype: int64

In [5]:
# Mapping of incorrect variations to correct job titles
mapping = {
    'Executive': 'Executive',
    'Exеcutive': 'Executive',
    'Exеcutivе': 'Executive',
    'Execuｔive': 'Executive',
    'Executivе': 'Executive',
    'Еxecutive': 'Executive',
    'Еxecutivе': 'Executive',
    'Е×еcutive': 'Executive',
    'E×ecutive': 'Executive',
    'Exеcuti?е': 'Executive',
    'Executi?e': 'Executive',
    'E×ecｕtive': 'Executive',
    'Execｕtive': 'Executive',
    'Exеcｕtivе': 'Executive',
    'Еxecuｔive': 'Executive',
    'Ε×ecutive': 'Executive',
    'Executi?е': 'Executive',            
    'Exеcｕtive': 'Executive',
    'E×еcuti?e': 'Executive',          
    'Еxеcutivе': 'Executive',              
    'Е×ecutive': 'Executive', 
    
    'Manager': 'Manager',
    'Μanager': 'Manager',
    'Manαger': 'Manager',
    'Managеr': 'Manager',
    'Mαnager': 'Manager',
    'Μanagеr': 'Manager',
    'Manαgеr': 'Manager',
    'Mαnagеr': 'Manager',
    'Manage??': 'Manager',
    'Mαnage??': 'Manager',
    'Μanage??': 'Manager',
    'Mαnαger': 'Manager',
    
    'Senior Manager': 'Senior Manager',
    'Senior Managеr': 'Senior Manager',
    'Senior Manαger': 'Senior Manager',
    'Senior Mαnager': 'Senior Manager',
    'Senior Manαgеr': 'Senior Manager',
    'Senior Managе??': 'Senior Manager',
    'Senio?? Manager': 'Senior Manager',
    '?enior Manager': 'Senior Manager',
    '?enior Manαger': 'Senior Manager',
    'Sеnior Manager': 'Senior Manager',
    '?enior Μanager': 'Senior Manager',
    'Senior Μanαger': 'Senior Manager',
    'Senior Mαnαger': 'Senior Manager',
    'Sеnior Managеr': 'Senior Manager',
    'Senior Manage??': 'Senior Manager',
    
    'AVP': 'AVP',
    'ΑVP': 'AVP',
    'АVP': 'AVP',
    'AVＰ': 'AVP',
    
    'VP': 'VP',
    'VＰ': 'VP',
}

# Replace incorrect variations with correct job titles
df_train['Designation'] = df_train['Designation'].replace(mapping)

In [14]:
df_test['Designation'] = df_test['Designation'].replace(mapping)

In [15]:
test_mapping = {
    'Executiѵe': 'Executive',
    'Еxecｕtive': 'Executive',
    'Execｕtivе': 'Executive',
    'Exеcuｔive': 'Executive',
     'Execｕｔive': 'Executive',
    
    'Manager': 'Manager',
    'Manage𝙧': 'Manager',
    'Μαnager': 'Manager',
    'Mαnαgеr': 'Manager',
    'Managе𝙧': 'Manager',
    'Μαnagеr': 'Manager',
    
    'Senior Manager': 'Senior Manager',
    'Senio𝙧 Manager': 'Senior Manager',
    'Senior Manage𝙧': 'Senior Manager',
    'Senior Μanager': 'Senior Manager',
    'Տenior Manager': 'Senior Manager',
    'Ѕenior Manager': 'Senior Manager',
    'Senior Managе𝙧': 'Senior Manager',
    'Senio𝙧 Mαnage𝙧': 'Senior Manager',
    'Sеnior Managе𝙧': 'Senior Manager',
    'Senio𝙧 Managеr': 'Senior Manager',
    'Տenior Μanager': 'Senior Manager',
    'Ѕenior Μanage𝙧': 'Senior Manager',
    'Senio𝙧 Manαger': 'Senior Manager',
    'Senio𝙧 Manage𝙧': 'Senior Manager',
    'Sеnio𝙧 Manager': 'Senior Manager',
    
    'AVP': 'AVP',
    'АVＰ': 'AVP',
    'ΑVＰ': 'AVP',
    
    'VP': 'VP'
}

# Replace incorrect variations with correct job titles
df_test['Designation'] = df_test['Designation'].replace(test_mapping)

In [16]:
df_test['Designation'].value_counts()

Designation
Executive         1180
Manager           1062
Senior Manager     778
AVP                324
VP                 145
Name: count, dtype: int64

In [13]:
df_train['Designation'].value_counts()

Designation
Executive         1150
Manager           1055
Senior Manager     854
AVP                311
VP                 119
Name: count, dtype: int64

In [18]:
df_train.isnull().sum()

id                          0
Age                         0
TypeofContact               6
CityTier                    0
DurationOfPitch           121
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups          33
ProductPitched              0
PreferredPropertyStar       0
NumberOfTrips              39
Passport                    0
PitchSatisfactionScore      0
Designation                 0
MonthlyIncome              56
marital_status              0
car_ownership               0
children                   37
ProdTaken                   0
dtype: int64

In [22]:
# Mapping of variations to standard forms
product_mapping = {
    'Basic': 'Basic', 'basic': 'Basic', 'BASIC': 'Basic', 'Bas?c': 'Basic', 'Вasic': 'Вasic', 'BASΙC': 'Вasic', 'Вasic': 'Basic', 'Вasic': 'Basic',
    'Βasic': 'Basic', 'Basi??': 'Basic', 'Βas?c': 'Basic', 'Basiс': 'Basic', '??asic': 'Вasic', '??asi??': 'Вasic', 
    'Basi?': 'Basic', 'BA?IC': 'Basic', 'BASIС': 'Basic', 'B??sic': 'Basic', 'В??sic': 'Вasic',
    'ΒASIС': 'Basic', 'Bas??': 'Basic', 'basi?': 'Basic', 'bas?c': 'Basic', 'Вasic': 'Вasic',
    'Baｓic': 'Basic', 'BΑSIC': 'Basic', 'Βas???': 'Basic', 'BΑSIC': 'Basic', 'B??si??': 'Basic', 'Вasic': 'Basic',
    'Deluxe': 'Deluxe', 'deluxe': 'Deluxe', 'DELUXE': 'Deluxe', 'De|uxe': 'Deluxe',
    'Delu×e': 'Deluxe', 'DELUXΕ': 'Deluxe', 'DΕLUXΕ': 'Deluxe', 'de|u×e': 'Deluxe', 'de|uxe': 'Deluxe', 
    '?eluxe': 'Deluxe', '??eluxe': 'Deluxe', '?ELUXE': 'Deluxe', 'super deluxe': 'Super Deluxe', 'SUPER DΕLUXE': 'Super Deluxe',
    'Super Deluxe': 'Super Deluxe', 'SUPER DELUXE': 'Super Deluxe', 'Super ?eluxe': 'Super Deluxe',
    '?uper Deluxe': 'Super Deluxe', 'Super De|uxe': 'Super Deluxe', 'Super ??eluxe': 'Super Deluxe',
    '?uper De|uxe': 'Super Deluxe', 'super de|uxe': 'Super Deluxe', 'ｓuper deluxe': 'Super Deluxe',
    'Standard': 'Standard', 'standard': 'Standard', 'STANDARD': 'Standard', '?tandard': 'Standard',
    'Stand??rd': 'Standard', 'Standa??d': 'Standard', 'Sta?dard': 'Standard', 'S??andard': 'Standard', 'sta?dard': 'Standard',
    'Standar??': 'Standard', 'St??ndard': 'Standard', 'STAN?ARD': 'Standard', 'STANDAR?': 'Standard',
    '?TANDARD': 'Standard', '?tanda??d': 'Standard', 'King': 'King', 'king': 'King',
    'KING': 'King', 'K??g': 'King', 'Ki?g': 'King', 'K?ng': 'King'
}

# Applying the mapping
df_train['ProductPitched'] = df_train['ProductPitched'].map(product_mapping)

In [20]:
df_train['ProductPitched'].isnull().sum()

0

In [23]:
df_train['ProductPitched'].value_counts()

ProductPitched
Basic           1157
Deluxe          1045
Standard         841
Super Deluxe     320
King             126
Name: count, dtype: int64

In [25]:
df_test['ProductPitched'] = df_test['ProductPitched'].map(product_mapping)
df_test['ProductPitched'].value_counts()

ProductPitched
Basic           1155
Deluxe          1034
Standard         742
Super Deluxe     309
King             158
Name: count, dtype: int64

In [26]:
# データフレームをCSVファイルに出力
df_train.to_csv('./data/processed_train.csv', index=False, encoding='utf-8')
df_test.to_csv('./data/processed_test.csv', index=False, encoding='utf-8')