In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def read_excel(path):
    """Load the Excel workbook at ``path`` into a DataFrame using ``pd.read_excel``."""
    frame = pd.read_excel(path)
    return frame

def imputing_data_row(df, missing):
    """Estimate the counts of one missing hour from its two neighbouring hours.

    Looks up the rows of ``df`` whose ``WAKTU`` is exactly 60 minutes before
    and 60 minutes after ``missing['WAKTU']`` and averages their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Observed data with columns ``WAKTU``, ``MOTOR``, ``MOBIL`` and
        ``TRUK/BUS``.
    missing : mapping or pandas.Series
        Record to fill; only its ``WAKTU`` entry is read.

    Returns
    -------
    tuple
        ``(MOTOR, MOBIL, TRUK)`` as rounded means of the two neighbours, or
        ``(-1, -1, -1)`` when there is not exactly one row on each side.
    """
    one_hour = pd.Timedelta(minutes=60)
    before = df[df['WAKTU'] == missing['WAKTU'] - one_hour]
    after = df[df['WAKTU'] == missing['WAKTU'] + one_hour]

    # Require one row on EACH side. Testing only the combined row count would
    # also accept two duplicate rows for the same hour with nothing on the
    # other side, which is not an average of the surrounding hours.
    if len(before) != 1 or len(after) != 1:
        return -1, -1, -1

    neighbours = pd.concat([before, after])
    MOTOR = round(neighbours['MOTOR'].mean())
    MOBIL = round(neighbours['MOBIL'].mean())
    TRUK = round(neighbours['TRUK/BUS'].mean())
    return MOTOR, MOBIL, TRUK

def imputing_data(df, HARI, JAM):
    """Average the vehicle counts of one weekday/hour slot, ignoring SMP outliers.

    Rows of ``df`` whose ``HARI`` and ``JAM`` equal the given values are
    selected, rows whose ``SMP`` falls outside the 1.5 * IQR fences are
    dropped, and the remaining ``MOTOR``, ``MOBIL`` and ``TRUK/BUS`` values
    are averaged per (HARI, JAM) group and rounded.

    Returns a DataFrame with columns HARI, JAM, MOTOR, MOBIL, TRUK/BUS.
    """
    same_slot = df[(df['HARI'] == HARI) & (df['JAM'] == JAM)]

    # First and third quartile of SMP within the slot.
    smp = same_slot['SMP']
    q1 = smp.quantile(0.25)
    q3 = smp.quantile(0.75)
    spread = q3 - q1

    # Anything further than 1.5 * IQR beyond a quartile counts as an outlier.
    lowest_ok = q1 - 1.5 * spread
    highest_ok = q3 + 1.5 * spread
    inliers = same_slot[(smp >= lowest_ok) & (smp <= highest_ok)]

    # Mean of the count columns for the slot, rounded, with the group keys
    # turned back into ordinary columns.
    counts = inliers[['HARI', 'JAM', 'MOTOR', 'MOBIL', 'TRUK/BUS']]
    slot_mean = counts.groupby(['HARI', 'JAM']).mean()
    return round(slot_mean).reset_index()

def perhitungan_jumlah_smp(df, faktor_motor=0.2, faktor_truk=2.5):
    """Add the weighted SMP columns and the raw vehicle total to ``df``.

    SMP presumably stands for "satuan mobil penumpang" (passenger-car
    units) -- confirm with the data owner.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``MOTOR``, ``MOBIL`` and ``TRUK/BUS``. It is
        modified in place.
    faktor_motor : float, default 0.2
        Weight applied to ``MOTOR`` to obtain ``SMP_MTR``.
    faktor_truk : float, default 2.5
        Weight applied to ``TRUK/BUS`` to obtain ``SMP_TRK``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with the columns ``SMP_MTR``, ``SMP_MBL``,
        ``SMP_TRK``, ``SMP`` (rounded sum of the three weighted columns) and
        ``JUMLAH`` (unweighted sum of the three counts) added or overwritten.
    """
    df['SMP_MTR'] = df['MOTOR'] * faktor_motor
    # MOBIL is taken over unchanged (weight of one).
    df['SMP_MBL'] = df['MOBIL']
    df['SMP_TRK'] = df['TRUK/BUS'] * faktor_truk
    df['SMP'] = (df['SMP_MTR'] + df['SMP_MBL'] + df['SMP_TRK']).round()
    df['JUMLAH'] = df['MOTOR'] + df['MOBIL'] + df['TRUK/BUS']
    return df

def menggabungkan_data(df, missing_times):
    """Append ``missing_times`` to ``df`` and return the rows in sorted order.

    The combined frame is sorted by ``TAHUN``, ``BULAN``, ``TANGGAL``,
    ``HARI`` and ``JAM`` (not by ``WAKTU``) and gets a fresh 0..n-1 index.
    Neither input frame is modified.
    """
    sort_keys = ['TAHUN', 'BULAN', 'TANGGAL', 'HARI', 'JAM']

    # Stack the imputed rows underneath the observed ones.
    combined = pd.concat([df, missing_times], ignore_index=True)

    # Order by the date-part columns and renumber the rows.
    ordered = combined.sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)


In [3]:
# Workbook holding the input data that the imputation is based on.
# NOTE(review): the folder is named 2023, but the WAKTU values shown in the
# previews below are from 2024 -- confirm this is the intended location.
path_1 = '../Data/Preprocessing/Data Balonggandu/2023/1_input_09 september.xlsx'
# Workbook holding the rows whose counts still have to be filled in.
path_2 = '../Data/Preprocessing/Data Balonggandu/2023/1_missing_times_09 september.xlsx'

# Load both workbooks; `df` and `df_missing` are used by all later cells.
df = read_excel(path_1)
df_missing = read_excel(path_2)

In [4]:
# Preview the first rows of the loaded input data.
df.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH,SMP_MTR,SMP_MBL,SMP_TRK,SMP
0,2024-09-01 00:00:00,2024,9,1,Sunday,00:00,13,9,6,28,2.6,9,15.0,27
1,2024-09-01 01:00:00,2024,9,1,Sunday,01:00,7,5,7,19,1.4,5,17.5,24
2,2024-09-01 02:00:00,2024,9,1,Sunday,02:00,6,15,11,32,1.2,15,27.5,44
3,2024-09-01 03:00:00,2024,9,1,Sunday,03:00,3,8,9,20,0.6,8,22.5,31
4,2024-09-01 04:00:00,2024,9,1,Sunday,04:00,15,19,8,42,3.0,19,20.0,42


In [5]:
# Preview the first rows of the frame that still needs its counts filled in.
df_missing.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH
0,2024-09-09 00:00:00,2024,9,9,Monday,00:00,,,,
1,2024-09-09 01:00:00,2024,9,9,Monday,01:00,,,,
2,2024-09-09 02:00:00,2024,9,9,Monday,02:00,,,,
3,2024-09-09 03:00:00,2024,9,9,Monday,03:00,,,,
4,2024-09-09 04:00:00,2024,9,9,Monday,04:00,,,,


In [6]:
# Sanity check: compute the weekday/hour average for the LAST missing row only.
# Looping over every row here just overwrote `imputasi` on each pass, so only
# the final result was ever kept; the full imputation is done in a later cell.
# Note: this raises IndexError if df_missing has no rows.
row = df_missing.iloc[-1]
imputasi = imputing_data(df, row['HARI'], row['JAM'])

In [7]:
# Display the result currently held in `imputasi` (the one for the last missing row).
imputasi

Unnamed: 0,HARI,JAM,MOTOR,MOBIL,TRUK/BUS
0,Monday,23:00,12.0,7.0,1.0


In [8]:
# (Disabled) Alternative imputation: fill each missing row with the mean of the hours directly before and after it
# for i, missing in df_missing.iterrows():
#     MOTOR, MOBIL, TRUK = imputing_data_row(df, missing)
#     if (MOTOR != -1) & (np.isnan(missing['MOTOR'])):
#         df_missing.at[i, 'MOTOR'] = MOTOR
#         df_missing.at[i, 'MOBIL'] = MOBIL
#         df_missing.at[i, 'TRUK/BUS'] = TRUK

In [9]:
# Iterasi setiap baris dan mengisi nilai yang hilang
for i, missing in df_missing.iterrows():
    # Outlier-filtered average of all observations sharing this row's
    # weekday (HARI) and hour (JAM).
    imputasi = imputing_data(df, missing['HARI'], missing['JAM'])
    if imputasi.empty:
        # Nothing to average for this slot: leave the row as it is.
        continue
    # Copy the averaged counts into the missing row, one column at a time.
    for kolom in ('MOTOR', 'MOBIL', 'TRUK/BUS'):
        df_missing.at[i, kolom] = imputasi[kolom].values[0]

In [10]:
# Preview df_missing again now that MOTOR, MOBIL and TRUK/BUS have been filled.
df_missing.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH
0,2024-09-09 00:00:00,2024,9,9,Monday,00:00,10.0,13.0,8.0,
1,2024-09-09 01:00:00,2024,9,9,Monday,01:00,6.0,11.0,6.0,
2,2024-09-09 02:00:00,2024,9,9,Monday,02:00,6.0,11.0,6.0,
3,2024-09-09 03:00:00,2024,9,9,Monday,03:00,7.0,6.0,6.0,
4,2024-09-09 04:00:00,2024,9,9,Monday,04:00,20.0,11.0,9.0,


In [11]:
# Add the SMP_MTR / SMP_MBL / SMP_TRK / SMP columns and recompute JUMLAH for the filled rows.
df_missing = perhitungan_jumlah_smp(df_missing)

In [12]:
# Write the filled rows to a new workbook, without the index column.
df_missing.to_excel('../Data/Preprocessing/Data Balonggandu/2023/2_missing_times_09 september.xlsx', index=False)

In [13]:
# Preview df_missing including the derived SMP and JUMLAH columns.
df_missing.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH,SMP_MTR,SMP_MBL,SMP_TRK,SMP
0,2024-09-09 00:00:00,2024,9,9,Monday,00:00,10.0,13.0,8.0,31.0,2.0,13.0,20.0,35.0
1,2024-09-09 01:00:00,2024,9,9,Monday,01:00,6.0,11.0,6.0,23.0,1.2,11.0,15.0,27.0
2,2024-09-09 02:00:00,2024,9,9,Monday,02:00,6.0,11.0,6.0,23.0,1.2,11.0,15.0,27.0
3,2024-09-09 03:00:00,2024,9,9,Monday,03:00,7.0,6.0,6.0,19.0,1.4,6.0,15.0,22.0
4,2024-09-09 04:00:00,2024,9,9,Monday,04:00,20.0,11.0,9.0,40.0,4.0,11.0,22.5,38.0


In [14]:
# Check column dtypes and non-null counts of df_missing.
df_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 514 entries, 0 to 513
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   WAKTU     514 non-null    datetime64[ns]
 1   TAHUN     514 non-null    int64         
 2   BULAN     514 non-null    int64         
 3   TANGGAL   514 non-null    int64         
 4   HARI      514 non-null    object        
 5   JAM       514 non-null    object        
 6   MOTOR     514 non-null    float64       
 7   MOBIL     514 non-null    float64       
 8   TRUK/BUS  514 non-null    float64       
 9   JUMLAH    514 non-null    float64       
 10  SMP_MTR   514 non-null    float64       
 11  SMP_MBL   514 non-null    float64       
 12  SMP_TRK   514 non-null    float64       
 13  SMP       514 non-null    float64       
dtypes: datetime64[ns](1), float64(8), int64(3), object(2)
memory usage: 56.3+ KB


In [15]:
# Combine the input rows with the filled rows into one sorted frame and preview it.
df_compiled = menggabungkan_data(df, df_missing)
df_compiled.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH,SMP_MTR,SMP_MBL,SMP_TRK,SMP
0,2024-09-01 00:00:00,2024,9,1,Sunday,00:00,13.0,9.0,6.0,28.0,2.6,9.0,15.0,27.0
1,2024-09-01 01:00:00,2024,9,1,Sunday,01:00,7.0,5.0,7.0,19.0,1.4,5.0,17.5,24.0
2,2024-09-01 02:00:00,2024,9,1,Sunday,02:00,6.0,15.0,11.0,32.0,1.2,15.0,27.5,44.0
3,2024-09-01 03:00:00,2024,9,1,Sunday,03:00,3.0,8.0,9.0,20.0,0.6,8.0,22.5,31.0
4,2024-09-01 04:00:00,2024,9,1,Sunday,04:00,15.0,19.0,8.0,42.0,3.0,19.0,20.0,42.0


In [16]:
# Write the combined dataset to a new workbook, without the index column.
df_compiled.to_excel('../Data/Preprocessing/Data Balonggandu/2023/2_input_09 september.xlsx', index=False)