In [50]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

In [51]:
# Load the training split and discard the `id` column in one chained expression.
df = pd.read_csv("Dataset/kendaraan_train.csv").drop(columns=['id'])

# Report the row count, then preview eight randomly sampled rows.
print("Total Dataset :", df.shape[0])
df.sample(n=8)


Total Dataset : 285831


Unnamed: 0,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
31721,Pria,52.0,1.0,28.0,0.0,,Pernah,48365.0,124.0,277.0,0
227603,Wanita,20.0,1.0,26.0,0.0,< 1 Tahun,Pernah,29915.0,160.0,162.0,0
43508,Wanita,26.0,1.0,46.0,1.0,< 1 Tahun,Tidak,43409.0,152.0,119.0,0
9046,Wanita,21.0,1.0,50.0,1.0,< 1 Tahun,Tidak,36413.0,152.0,112.0,0
48535,Pria,,1.0,46.0,1.0,< 1 Tahun,Tidak,24746.0,160.0,67.0,0
150549,Wanita,57.0,1.0,28.0,0.0,> 2 Tahun,Pernah,117237.0,26.0,20.0,1
114409,Pria,46.0,1.0,28.0,1.0,1-2 Tahun,Tidak,37180.0,,154.0,0
135669,Wanita,20.0,1.0,41.0,1.0,< 1 Tahun,Tidak,31752.0,160.0,174.0,0


In [52]:
# Load the test split as-is (no column is dropped here).
df_test = pd.read_csv("Dataset/kendaraan_test.csv")

# Report the row count, then preview eight randomly sampled rows.
print("Total Dataset :", df_test.shape[0])
df_test.sample(n=8)

Total Dataset : 47639


Unnamed: 0,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
29105,Wanita,59,1,28,0,1-2 Tahun,Pernah,32346,52,80,0
41577,Wanita,22,1,46,1,< 1 Tahun,Tidak,17398,152,39,0
2723,Pria,51,1,28,1,1-2 Tahun,Tidak,2630,26,200,0
20243,Wanita,66,1,28,0,> 2 Tahun,Pernah,42873,124,75,0
34478,Pria,23,1,50,0,< 1 Tahun,Pernah,29037,151,135,0
31839,Wanita,25,1,41,0,< 1 Tahun,Pernah,2630,152,266,0
19946,Pria,24,1,36,0,< 1 Tahun,Pernah,31589,152,239,0
18062,Pria,21,1,8,1,< 1 Tahun,Tidak,29336,152,209,0


In [53]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285831 entries, 0 to 285830
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Jenis_Kelamin      271391 non-null  object 
 1   Umur               271617 non-null  float64
 2   SIM                271427 non-null  float64
 3   Kode_Daerah        271525 non-null  float64
 4   Sudah_Asuransi     271602 non-null  float64
 5   Umur_Kendaraan     271556 non-null  object 
 6   Kendaraan_Rusak    271643 non-null  object 
 7   Premi              271262 non-null  float64
 8   Kanal_Penjualan    271532 non-null  float64
 9   Lama_Berlangganan  271839 non-null  float64
 10  Tertarik           285831 non-null  int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 24.0+ MB


In [54]:
# Count the missing entries in every column (sum of the NaN mask down the rows).
df.isna().sum(axis=0)

Jenis_Kelamin        14440
Umur                 14214
SIM                  14404
Kode_Daerah          14306
Sudah_Asuransi       14229
Umur_Kendaraan       14275
Kendaraan_Rusak      14188
Premi                14569
Kanal_Penjualan      14299
Lama_Berlangganan    13992
Tertarik                 0
dtype: int64

In [55]:
# Mean imputer: every NaN in a column is replaced with that column's mean.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# Re-fit the imputer on one numeric column at a time, in this order, and write
# the filled values back into the frame.
for column in ("Umur", "Lama_Berlangganan", "Premi"):
    df[column] = imp_mean.fit_transform(df[[column]]).ravel()

In [56]:
# Drop every row that still contains at least one NaN, then confirm that no
# column has missing values left.
df = df.dropna(axis=0, how="any")
df.isna().sum(axis=0)

Jenis_Kelamin        0
Umur                 0
SIM                  0
Kode_Daerah          0
Sudah_Asuransi       0
Umur_Kendaraan       0
Kendaraan_Rusak      0
Premi                0
Kanal_Penjualan      0
Lama_Berlangganan    0
Tertarik             0
dtype: int64

## **One-Hot Encoding**

In [57]:
# Columns to expand into one indicator column per category.
categorical = ['Jenis_Kelamin', 'Kendaraan_Rusak', 'Umur_Kendaraan']

# Build the encoded training frame; the listed columns are replaced by their dummies.
df_train = pd.get_dummies(data=df, columns=categorical)
df_train.sample(n=5)

Unnamed: 0,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik,Jenis_Kelamin_Pria,Jenis_Kelamin_Wanita,Kendaraan_Rusak_Pernah,Kendaraan_Rusak_Tidak,Umur_Kendaraan_1-2 Tahun,Umur_Kendaraan_< 1 Tahun,Umur_Kendaraan_> 2 Tahun
83771,26.0,1.0,28.0,1.0,35925.0,152.0,220.0,0,1,0,0,1,0,1,0
44035,44.0,1.0,28.0,0.0,34600.0,26.0,142.0,1,0,1,1,0,1,0,0
180999,42.0,1.0,3.0,0.0,33243.0,26.0,116.0,0,1,0,1,0,1,0,0
135140,44.0,1.0,32.0,0.0,25823.0,26.0,287.0,1,1,0,1,0,1,0,0
229566,53.0,1.0,28.0,1.0,39012.0,26.0,88.0,0,1,0,0,1,1,0,0


In [58]:
# One-hot encode the test split with the same categorical columns as training.
# get_dummies derives its columns from the values present in the frame it is
# given, so encoding the two splits independently does not guarantee identical
# columns or column order. Reindexing to the training columns pins the test
# schema to the training one: an indicator column missing from the test split is
# added filled with 0, and a column unknown to training is dropped.
df_test = (
    pd.get_dummies(df_test, columns=categorical)
    .reindex(columns=df_train.columns, fill_value=0)
)
df_test.sample(5)

Unnamed: 0,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik,Jenis_Kelamin_Pria,Jenis_Kelamin_Wanita,Kendaraan_Rusak_Pernah,Kendaraan_Rusak_Tidak,Umur_Kendaraan_1-2 Tahun,Umur_Kendaraan_< 1 Tahun,Umur_Kendaraan_> 2 Tahun
15030,57,1,39,1,2630,26,91,0,1,0,0,1,1,0,0
14556,58,1,46,1,32248,124,194,0,0,1,0,1,1,0,0
4567,27,1,29,0,31143,124,245,0,1,0,1,0,0,1,0
27431,46,1,28,1,34381,45,57,0,1,0,0,1,1,0,0
46991,43,1,28,1,26692,13,133,0,1,0,0,1,1,0,0


## **Min-Max Scaling**

In [59]:
# Continuous features to rescale into the [0, 1] range.
numerical = ['Premi', 'Lama_Berlangganan']
scaler = MinMaxScaler()

# Learn the per-column min/max from the training rows, then apply that scaling
# to the same rows.
scaler.fit(df_train[numerical].values)
df_train[numerical] = scaler.transform(df_train[numerical].values)
df_train.sample(n=5)

Unnamed: 0,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik,Jenis_Kelamin_Pria,Jenis_Kelamin_Wanita,Kendaraan_Rusak_Pernah,Kendaraan_Rusak_Tidak,Umur_Kendaraan_1-2 Tahun,Umur_Kendaraan_< 1 Tahun,Umur_Kendaraan_> 2 Tahun
16309,33.0,1.0,29.0,0.0,0.04246,157.0,0.318339,0,0,1,1,0,1,0,0
40064,52.0,1.0,41.0,1.0,0.079455,26.0,0.231834,0,1,0,0,1,1,0,0
32865,59.0,1.0,30.0,1.0,0.035923,26.0,0.224913,0,1,0,0,1,1,0,0
204474,26.0,1.0,8.0,1.0,0.051916,152.0,0.048443,0,0,1,0,1,0,1,0
19154,35.0,1.0,46.0,0.0,0.051916,1.0,0.179931,0,1,0,0,1,0,1,0


In [60]:
# Scale the test split with the min/max already learned from the training split.
# Calling fit_transform here would re-fit the scaler on the test rows, so the two
# splits would be mapped with different min/max values and test-set statistics
# would leak into preprocessing. With transform(), test values outside the
# training range can fall slightly outside [0, 1]; that is expected.
df_test[numerical] = scaler.transform(df_test[numerical].values)
df_test.sample(5)

Unnamed: 0,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik,Jenis_Kelamin_Pria,Jenis_Kelamin_Wanita,Kendaraan_Rusak_Pernah,Kendaraan_Rusak_Tidak,Umur_Kendaraan_1-2 Tahun,Umur_Kendaraan_< 1 Tahun,Umur_Kendaraan_> 2 Tahun
42262,49,1,28,1,0.053863,61,0.519031,0,0,1,0,1,1,0,0
38784,38,1,28,0,0.067062,154,0.062284,0,1,0,1,0,1,0,0
39756,53,1,28,0,0.06693,13,0.044983,0,1,0,1,0,1,0,0
24910,25,1,45,1,0.031957,152,0.67474,0,0,1,0,1,0,1,0
23451,23,1,28,0,0.059884,152,0.944637,1,1,0,1,0,0,1,0


In [61]:
# Persist both preprocessed splits as CSV, leaving out the DataFrame index.
for frame, csv_path in (
    (df_train, 'Dataset/kendaraan_train_clean.csv'),
    (df_test, 'Dataset/kendaraan_test_clean.csv'),
):
    frame.to_csv(csv_path, index=False)
