In [7]:
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

In [3]:
# Membuat DataFrame dengan data acak
np.random.seed(0)
data_pegawai = pd.DataFrame({
    'umur_masuk': np.random.randint(20, 50, 200),
    'umur_pensiun': np.random.randint(55, 70, 200),
    'status_pensiun': np.random.randint(0, 2, 200) # 0 untuk masih bekerja, 1 untuk sudah pensiun
})

# Menghitung durasi kerja sampai pensiun
data_pegawai['durasi_kerja'] = data_pegawai['umur_pensiun'] - data_pegawai['umur_masuk']
data_pegawai

Unnamed: 0,umur_masuk,umur_pensiun,status_pensiun,durasi_kerja
0,32,63,0,31
1,35,59,0,24
2,41,58,0,17
3,20,67,0,47
4,23,68,0,45
...,...,...,...,...
195,28,64,1,36
196,39,65,0,26
197,28,55,0,27
198,46,61,0,15


In [4]:
# Analisis survival menggunakan Kaplan-Meier estimator
kmf = KaplanMeierFitter()
kmf.fit(durations=data_pegawai['durasi_kerja'], event_observed=data_pegawai['status_pensiun'])

<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 200 total observations, 103 right-censored observations>

In [None]:
# Plotting survival function
kmf.plot_survival_function()
plt.title('Fungsi Survival untuk Durasi Kerja Pegawai')
plt.xlabel('Tahun')
plt.ylabel('Probabilitas Belum Pensiun')
plt.show()

In [9]:
# Interpretasi hasil
median_survival = kmf.median_survival_time_
print(f'Median waktu kerja sampai pensiun adalah {median_survival} tahun.')

Median waktu kerja sampai pensiun adalah 37.0 tahun.


# Regresi Cox

In [17]:
from lifelines import CoxPHFitter

In [10]:
data_pegawai['pendidikan']=np.random.choice(['SD', 'SMP', 'SMA', 'S1', 'S2', 'S3'], 200)
data_pegawai['jenis_kelamin']=np.random.choice(['Laki-laki', 'Perempuan'], 200)
data_pegawai

Unnamed: 0,umur_masuk,umur_pensiun,status_pensiun,durasi_kerja,pendidikan,jenis_kelamin
0,32,63,0,31,SMP,Perempuan
1,35,59,0,24,SD,Perempuan
2,41,58,0,17,SD,Laki-laki
3,20,67,0,47,SMA,Laki-laki
4,23,68,0,45,S2,Laki-laki
...,...,...,...,...,...,...
195,28,64,1,36,SMA,Laki-laki
196,39,65,0,26,S2,Laki-laki
197,28,55,0,27,SMP,Laki-laki
198,46,61,0,15,S3,Laki-laki


In [14]:
# Mengubah data kategorikal menjadi numerik
data_pegawai = pd.get_dummies(data_pegawai, columns=['pendidikan', 'jenis_kelamin'], drop_first=True)

KeyError: "None of [Index(['pendidikan', 'jenis_kelamin'], dtype='object')] are in the [columns]"

In [15]:
data_pegawai

Unnamed: 0,umur_masuk,umur_pensiun,status_pensiun,durasi_kerja,pendidikan_S2,pendidikan_S3,pendidikan_SD,pendidikan_SMA,pendidikan_SMP,jenis_kelamin_Perempuan
0,32,63,0,31,False,False,False,False,True,True
1,35,59,0,24,False,False,True,False,False,True
2,41,58,0,17,False,False,True,False,False,False
3,20,67,0,47,False,False,False,True,False,False
4,23,68,0,45,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
195,28,64,1,36,False,False,False,True,False,False
196,39,65,0,26,True,False,False,False,False,False
197,28,55,0,27,False,False,False,False,True,False
198,46,61,0,15,False,True,False,False,False,False


In [18]:
# Membuat model regresi Cox
cph = CoxPHFitter()
cph.fit(data_pegawai, duration_col='durasi_kerja', event_col='status_pensiun')

# Menampilkan hasil model
cph.print_summary()



0,1
model,lifelines.CoxPHFitter
duration col,'durasi_kerja'
event col,'status_pensiun'
baseline estimation,breslow
number of observations,200
number of events observed,97
partial log-likelihood,-185.53
time fit was run,2024-05-10 23:44:05 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
umur_masuk,1.18,3.26,0.13,0.93,1.43,2.54,4.19,0.0,9.31,<0.005,66.03
umur_pensiun,-1.16,0.31,0.13,-1.42,-0.91,0.24,0.4,0.0,-9.07,<0.005,62.83
pendidikan_S2,0.17,1.18,0.37,-0.55,0.88,0.58,2.42,0.0,0.46,0.64,0.64
pendidikan_S3,-0.03,0.97,0.34,-0.7,0.65,0.5,1.91,0.0,-0.08,0.93,0.1
pendidikan_SD,0.27,1.31,0.36,-0.45,0.98,0.64,2.68,0.0,0.74,0.46,1.12
pendidikan_SMA,0.19,1.21,0.35,-0.49,0.86,0.61,2.37,0.0,0.54,0.59,0.77
pendidikan_SMP,-0.86,0.42,0.46,-1.75,0.04,0.17,1.04,0.0,-1.87,0.06,4.03
jenis_kelamin_Perempuan,0.02,1.02,0.22,-0.41,0.45,0.67,1.58,0.0,0.11,0.91,0.13

0,1
Concordance,0.99
Partial AIC,387.05
log-likelihood ratio test,460.54 on 8 df
-log2(p) of ll-ratio test,311.23


In [20]:
# Interpretasi hasil
# Koefisien yang signifikan (p < 0.05) menunjukkan variabel yang berpengaruh signifikan terhadap durasi kerja.