In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw training split into the working frame used by every later cell.
raw_train_path = '../Data/raw/train.csv'
df = pd.read_csv(raw_train_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [4]:
# List the distinct values present in the `contact` column.
df.loc[:, 'contact'].unique()

array(['cellular', 'unknown', 'telephone'], dtype=object)

In [5]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [6]:
# Collapse the rare job labels (including the 'unknown' placeholder) into a
# single 'other' bucket; every other job label is left untouched.
rare_jobs = [
    'self-employed',
    'entrepreneur',
    'housemaid',
    'unemployed',
    'student',
    'unknown',
]
job_merge_map = {label: 'other' for label in rare_jobs}

df['job'] = df['job'].replace(job_merge_map)

In [9]:
# Log transforms of the numeric columns.

# `balance` contains negative values, so shift it until its minimum maps to 1
# before taking the natural log (log of a non-positive number is undefined).
# NOTE(review): the offset is this frame's own minimum; any other split
# (e.g. a test set) must reuse the same offset to stay comparable — confirm.
df['balance_shifted'] = df['balance'] - df['balance'].min() + 1
df['log_balance'] = np.log(df['balance_shifted'])

# `duration_log` appears in the saved outputs and in the exported schema but
# was not created by any remaining cell, so a fresh Restart & Run All dropped
# it from the export. The saved values equal log(1 + duration)
# (e.g. 117 -> 4.770685), so it is recreated here with log1p.
df['duration_log'] = np.log1p(df['duration'])

In [10]:
# Ordinal-encode `education`: unknown=0 < primary=1 < secondary=2 < tertiary=3.
education_map = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}

# Encode only while the column still holds labels. Mapping an already-encoded
# column would replace every value with NaN, so re-running this cell used to
# silently wipe the feature.
if not pd.api.types.is_numeric_dtype(df['education']):
    education_encoded = df['education'].map(education_map)

    # `.map` yields NaN for any label missing from the mapping; fail loudly
    # instead of letting NaN flow into the exported file.
    unmapped_labels = df.loc[education_encoded.isna(), 'education'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(f"Unmapped education labels: {list(unmapped_labels)}")

    df['education'] = education_encoded.astype('int64')

In [13]:
# SAFE FEATURES (no leakage)
# All three features are derived only from `previous`, `pdays` and `poutcome`.

# 1. Client contact sensitivity (using previous campaign data):
#    number of previous contacts per day elapsed since the last contact.
#    In this data "never contacted before" is encoded as pdays == -1 (see the
#    head() output), not 999. Dividing by the sentinel gave a denominator of
#    about -1, producing -0.0 and negative ratios whenever previous > 0.
#    Those rows are set to 0.0 explicitly instead.
never_contacted = df['pdays'] < 0
# The small epsilon keeps pdays == 0 from dividing by zero.
df['contact_sensitivity'] = df['previous'] / (df['pdays'] + 1e-6)
df.loc[never_contacted, 'contact_sensitivity'] = 0.0

# 2. Previous campaign engagement: 1 when the previous outcome was a success.
df['prev_campaign_engaged'] = (df['poutcome'] == 'success').astype(int)

# 3. Historical contact responsiveness:
#    'new'          - no previous contacts,
#    'responsive'   - previous contacts and the previous outcome was a success,
#    'unresponsive' - previous contacts with any other outcome,
#    'unknown'      - fallback when none of the conditions match.
has_history = df['previous'] > 0
prior_success = df['poutcome'] == 'success'
df['responsiveness'] = np.select(
    [
        df['previous'] == 0,
        has_history & prior_success,
        has_history & ~prior_success,
    ],
    ['new', 'responsive', 'unresponsive'],
    default='unknown'
)

In [14]:
# Preview the first five rows again, now including the engineered columns.
df.head(n=5)

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,...,poutcome,y,balance_bin,age_bin,balance_shifted,log_balance,duration_log,contact_sensitivity,prev_campaign_engaged,responsiveness
0,0,42,technician,married,2,no,7,no,no,cellular,...,unknown,0,Low,36-50,8027,8.990566,4.770685,-0.0,0,new
1,1,38,blue-collar,married,2,no,514,no,no,unknown,...,unknown,0,Medium,36-50,8534,9.051813,5.225747,-0.0,0,new
2,2,36,blue-collar,married,2,no,602,yes,no,unknown,...,unknown,0,Medium,36-50,8622,9.062072,4.718499,-0.0,0,new
3,3,27,other,single,2,no,34,yes,no,unknown,...,unknown,0,Medium,26-35,8054,8.993924,2.397895,-0.0,0,new
4,4,26,technician,married,2,no,889,yes,no,cellular,...,unknown,1,Medium,26-35,8909,9.094817,6.805723,-0.0,0,new


In [None]:
# Remove the columns that should not be written to the cleaned file.
columns_to_drop = [
    'id',
    'day',
    'month',
    'balance',
    'balance_shifted',
    'previous',
    'default',
]
df = df.drop(columns=columns_to_drop)

In [16]:
# Check the final schema (columns, dtypes, non-null counts) before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   age                    750000 non-null  int64  
 1   job                    750000 non-null  object 
 2   marital                750000 non-null  object 
 3   education              750000 non-null  int64  
 4   housing                750000 non-null  object 
 5   loan                   750000 non-null  object 
 6   contact                750000 non-null  object 
 7   duration               750000 non-null  int64  
 8   campaign               750000 non-null  int64  
 9   pdays                  750000 non-null  int64  
 10  poutcome               750000 non-null  object 
 11  y                      750000 non-null  int64  
 12  log_balance            750000 non-null  float64
 13  duration_log           750000 non-null  float64
 14  contact_sensitivity    750000 non-nu

In [20]:
# Write the preprocessed frame to disk without the RangeIndex column.
preprocessed_path = '../Data/cleaned/preprocessed.csv'
df.to_csv(preprocessed_path, index=False)