Load Data

In [184]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score

In [103]:
# Read the train and test CSV files, then keep both frames in a list so the
# preprocessing cells below can apply the same step to each in one loop.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
all_data = [train, test]

Cleaning Data

In [104]:
# Report, per column, how many cells hold the literal '?' in each frame.
for frame in all_data:
  is_question_mark = frame.isin(['?'])
  print(is_question_mark.sum())

id                          0
Umur                        0
Kelas Pekerja            2063
Berat Akhir                 0
Pendidikan                  0
Jmlh Tahun Pendidikan       0
Status Perkawinan           0
Pekerjaan                2069
Jenis Kelamin               0
Keuntungan Kapital          0
Kerugian Capital            0
Jam per Minggu              0
Gaji                        0
dtype: int64
id                         0
Umur                       0
Kelas Pekerja            552
Berat Akhir                0
Pendidikan                 0
Jmlh Tahun Pendidikan      0
Status Perkawinan          0
Pekerjaan                554
Jenis Kelamin              0
Keuntungan Kapital         0
Kerugian Capital           0
Jam per Minggu             0
dtype: int64


In [105]:
# Turn the '?' entries of the two affected columns into an explicit
# 'Tidak Diketahui' category so they can be encoded like any other value.
for frame in all_data:
  for column in ('Kelas Pekerja', 'Pekerjaan'):
    frame[column] = frame[column].replace('?', 'Tidak Diketahui')

Feature Engineering

In [106]:
# Encode the target label as an integer: 0 = Gaji <= 7 jt, 1 = Gaji > 7 jt.
label_codes = {'<=7jt': 0, '>7jt': 1}
train['Gaji'] = train['Gaji'].map(label_codes).astype(int)

In [107]:
# Remove the 'id' column from both frames. The drop is done in place so that
# the names `train` and `test` keep pointing at the modified frames.
for frame in all_data:
  frame.drop(columns='id', inplace=True)

In [108]:
# Summarize the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35994 entries, 0 to 35993
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Umur                   35994 non-null  int64  
 1   Kelas Pekerja          35994 non-null  object 
 2   Berat Akhir            35994 non-null  int64  
 3   Pendidikan             35994 non-null  object 
 4   Jmlh Tahun Pendidikan  35994 non-null  int64  
 5   Status Perkawinan      35994 non-null  object 
 6   Pekerjaan              35994 non-null  object 
 7   Jenis Kelamin          35994 non-null  object 
 8   Keuntungan Kapital     35994 non-null  float64
 9   Kerugian Capital       35994 non-null  float64
 10  Jam per Minggu         35994 non-null  float64
 11  Gaji                   35994 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 3.3+ MB


In [109]:
# Integer codes for each categorical column. Every mapping assigns codes in
# list order, starting from 0; the labels must match the CSV values exactly.
class_mapping = {label: code for code, label in enumerate([
    'Wiraswasta', 'Pekerja Bebas Bukan Perusahan', 'Pemerintah Lokal', 'Tidak Diketahui',
    'Pemerintah Negara', 'Pekerja Bebas Perusahaan', 'Pemerintah Provinsi',
    'Tanpa di Bayar', 'Tidak Pernah Bekerja',
])}

educate_mapping = {label: code for code, label in enumerate([
    'SMA', 'Pendidikan Tinggi', 'Sarjana', 'Master', 'D4', '11th', 'D3',
    '10th', '7th-8th', 'Sekolah Professional', '9th', '12th', 'Doktor',
    '5th-6th', '1st-4th', 'SD',
])}

marital_mapping = {label: code for code, label in enumerate([
    'Menikah', 'Belum Pernah Menikah', 'Cerai', 'Janda', 'Berpisah', 'Menikah LDR',
])}

job_mapping = {label: code for code, label in enumerate([
    'Perbaikan Kerajinan', 'Ekesekutif Managerial', 'Spesialis', 'Pemuka Agama', 'Sales',
    'Servis Lainnya', 'Mesin Inspeksi', 'Supir', 'Pembersih', 'Petani', 'Tech-support',
    'Penjaga', 'Asisten Rumah Tangga', 'Tentara', 'Tidak Diketahui',
])}

gender_mapping = dict(zip(('Laki2', 'Perempuan'), (0, 1)))

In [110]:
# Replace each categorical column with its integer codes, in both frames.
for frame in all_data:
  for column, codes in (
      ('Kelas Pekerja', class_mapping),
      ('Pendidikan', educate_mapping),
      ('Status Perkawinan', marital_mapping),
      ('Pekerjaan', job_mapping),
      ('Jenis Kelamin', gender_mapping),
  ):
    frame[column] = frame[column].map(codes).astype(int)

In [111]:
# Drop the listed numeric columns from both frames, in place, so only the
# encoded categorical features (plus the label in `train`) remain.
for frame in all_data:
  frame.drop(
      columns=['Umur', 'Berat Akhir', 'Jmlh Tahun Pendidikan', 'Keuntungan Kapital',
               'Kerugian Capital', 'Jam per Minggu'],
      inplace=True,
  )

In [112]:
# Separate the label column 'Gaji' (y) from the feature columns (X).
y = train['Gaji']
X = train.drop(columns='Gaji')

In [113]:
# Hold out 20% of the labelled rows. random_state is fixed so that the split
# (and every score computed from it) is identical after Restart & Run All;
# without it train_test_split shuffles differently on each execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [185]:
# Random forest with large split/leaf minimums to limit tree depth.
# random_state pins the bootstrap sampling and per-split feature selection, so
# the cross-validation score and the submitted predictions are reproducible.
model = RandomForestClassifier(min_samples_split=60, min_samples_leaf=30, random_state=42)

In [186]:
# 10-fold cross-validated ROC AUC on the training split; the mean of the
# per-fold scores is the cell's displayed value.
cv_result = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=10)
np.mean(cv_result)

0.8737501788539918

In [177]:
# Fit on all labelled rows, then predict class labels for the test frame.
# fit() returns the estimator itself, so the two calls can be chained.
submitted = model.fit(X, y).predict(test)

In [178]:
# 'id' was dropped from `test` earlier, so re-read the raw file to recover it
# and pair each id with its predicted label.
submit = pd.read_csv('test.csv')
submission = submit[['id']].assign(Gaji=submitted)
submission.head()

Unnamed: 0,id,Gaji
0,35994,0
1,35995,0
2,35996,1
3,35997,1
4,35998,0


In [179]:
# Write the submission without the index column and confirm the target path.
filename = 'Submission3.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

Saved file: Submission3.csv
