In [1]:
import pandas as pd
import numpy as np

In [2]:
def contains_total(data_list):
    """Report whether any string in ``data_list`` contains "TOTAL".

    The match is case-insensitive: every element is upper-cased before the
    substring check, so elements must provide ``.upper()`` (i.e. be strings).
    An empty iterable yields ``False``.
    """
    return any("TOTAL" in entry.upper() for entry in data_list)

def is_multiple_of_15(time_str):
    """Return True when the minute field of an ``"H:M"`` string is a multiple of 15.

    ``time_str`` must split on ':' into exactly two integer fields; any other
    shape raises ``ValueError``.
    """
    fields = [int(part) for part in time_str.split(':')]
    _hour, minute = fields  # the hour is parsed for validation only
    return minute % 15 == 0
    
def read_excel_until_total(file_path, start_row=0):
    """Load every sheet of a workbook, keeping the rows above the first "TOTAL" row.

    Each sheet is read with its first row as the header. Rows are kept from
    the top until the first row in which any cell contains "TOTAL"
    (case-insensitive); that row and everything below it are discarded. The
    kept rows of all sheets are stacked into a single frame.

    Parameters
    ----------
    file_path : str or path-like
        Excel workbook to read.
    start_row : int, default 0
        Accepted for backward compatibility but currently unused: no leading
        rows are skipped.

    Returns
    -------
    pandas.DataFrame
        Stacked rows with upper-cased column names, an added 'SHEET NAME'
        column holding the source sheet name, 'WAKTU' cut to its first five
        characters, a 'TOTAL' column renamed to 'JUMLAH', and any
        'Unnamed: 6' column removed.

    Raises
    ------
    ValueError
        If no sheet contributes any row.
    """
    sheet_frames = []

    # The context manager closes the workbook handle even if a sheet fails to parse.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name, header=0)

            # Keep rows until the first one that mentions "TOTAL" in any cell.
            kept_rows = []
            for _, row in df.iterrows():
                if contains_total(row.astype(str).values):
                    break
                kept_rows.append(row)

            # Sheets with no rows above the total line contribute nothing.
            if kept_rows:
                sheet_data = pd.DataFrame(kept_rows)
                sheet_data['SHEET NAME'] = sheet_name  # remember the source sheet
                sheet_frames.append(sheet_data)

    if not sheet_frames:
        raise ValueError(
            f"No data rows found above a 'TOTAL' row in any sheet of {file_path!r}"
        )

    # Stack the rows of all sheets.
    result_df = pd.concat(sheet_frames, ignore_index=True)

    if 'Unnamed: 6' in result_df.columns:
        result_df = result_df.drop(columns=['Unnamed: 6'])

    # Upper-case the column names.
    result_df.columns = result_df.columns.str.upper()

    # Keep the first five characters of the time text
    # (presumably 'HH:MM' out of 'HH:MM:SS' -- confirm against the workbook).
    result_df['WAKTU'] = result_df['WAKTU'].str[:5]

    result_df = result_df.rename(columns={'TOTAL': 'JUMLAH'})
    return result_df

def penyesuaian_format(df, date_prefix=None):
    """Turn the per-sheet time text into full timestamps and keep quarter-hour rows.

    ``df`` is modified in place: 'SHEET NAME' is zero-padded to two characters
    and 'WAKTU' is replaced by a datetime built from
    ``date_prefix + SHEET NAME + ' ' + WAKTU`` parsed as '%Y-%m-%d %H:%M'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'SHEET NAME' (day of month) and 'WAKTU'
        (time of day).
    date_prefix : str, optional
        Year-month prefix such as '2024-01-'. When omitted, the notebook-level
        global ``thn_bln`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose minute is a multiple of 15.
    """
    if date_prefix is None:
        # Fall back to the global set by the per-file loop of the notebook.
        date_prefix = thn_bln

    # Format the sheet name as a two-digit day ('01', '02', ..., '31').
    df['SHEET NAME'] = df['SHEET NAME'].str.zfill(2)

    # Prepend year, month and day to the time text.
    df['WAKTU'] = date_prefix + df['SHEET NAME'] + ' ' + df['WAKTU']

    # Parse the assembled text into datetimes.
    df['WAKTU'] = pd.to_datetime(df['WAKTU'], format='%Y-%m-%d %H:%M')

    # Keep only rows that fall on a 15-minute boundary. Returning a copy keeps
    # later column assignments from raising SettingWithCopyWarning.
    on_quarter_hour = df['WAKTU'].dt.minute % 15 == 0
    return df[on_quarter_hour].copy()

def mencari_data_hilang(df, nama_file, date_prefix=None):
    """List the 15-minute slots of one month that have no row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations with a datetime 'WAKTU' column.
    nama_file : str
        Source file name; its first two characters are printed as the month
        label in the status message.
    date_prefix : str, optional
        Year-month prefix such as '2024-01-' identifying the month to check.
        When omitted, the notebook-level global ``thn_bln`` is used, as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(missing_times, df)``. ``missing_times`` has one row per absent slot
        with columns 'WAKTU', 'MOTOR', 'MOBIL', 'TRUK/BUS', 'JUMLAH',
        'SHEET NAME', 'INTERVAL', 'UPPKB'; all value columns are NaN and
        'SHEET NAME' is the two-digit day. The same columns are present when
        nothing is missing, so downstream steps can run on the empty frame.
        ``df`` is returned unchanged.
    """
    if date_prefix is None:
        # Fall back to the global set by the per-file loop of the notebook.
        date_prefix = thn_bln

    # First and last calendar day of the month being checked.
    month_start = pd.Timestamp(f'{date_prefix}01')
    month_end = month_start + pd.offsets.MonthEnd(0)

    # Every slot from 00:00 on the first day to 23:45 on the last day.
    # '15min' replaces the deprecated '15T' alias.
    expected_slots = pd.date_range(
        start=month_start,
        end=month_end + pd.Timedelta(hours=23, minutes=45),
        freq='15min',
    )

    # Slots that never occur in the data, in chronological order.
    missing_slots = expected_slots[~expected_slots.isin(df['WAKTU'])]

    missing_times = pd.DataFrame({
        'WAKTU': missing_slots,
        'MOTOR': np.nan,
        'MOBIL': np.nan,
        'TRUK/BUS': np.nan,
        'JUMLAH': np.nan,
        'SHEET NAME': missing_slots.strftime('%d'),
        'INTERVAL': np.nan,
        'UPPKB': np.nan,
    })

    # Report whether anything is missing.
    if missing_times.empty:
        print(f"Bulan {nama_file[:2]} Tidak ada data yang hilang.")
    else:
        print(f"Bulan {nama_file[:2]} Ada data yang hilang")

    return missing_times, df

def pisahkan_kolom_waktu(df):
    """Derive calendar columns from 'WAKTU' and return the frame in a fixed layout.

    ``df`` is modified in place: 'WAKTU' is converted to datetime and the
    columns 'TAHUN', 'BULAN', 'TANGGAL', 'HARI' (day name) and 'JAM' ('HH:MM')
    are added. The returned frame is a reindexed copy holding exactly the
    columns of the layout below; 'PEKAN' is not computed here, so it comes out
    as NaN unless ``df`` already had it, and columns outside the layout are
    dropped.
    """
    # Make sure WAKTU is a datetime column before using the .dt accessor.
    df['WAKTU'] = pd.to_datetime(df['WAKTU'])
    stamps = df['WAKTU'].dt

    derived = {
        'TAHUN': stamps.year,
        'BULAN': stamps.month,
        'TANGGAL': stamps.day,
        'HARI': stamps.day_name(),      # weekday name as a string
        'JAM': stamps.strftime('%H:%M'),
    }
    for column, values in derived.items():
        df[column] = values

    # Target column order of the returned frame.
    layout = [
        'WAKTU', 'TAHUN', 'BULAN', 'PEKAN', 'TANGGAL', 'HARI', 'JAM',
        'MOTOR', 'MOBIL', 'TRUK/BUS', 'JUMLAH',
        'SHEET NAME', 'INTERVAL', 'UPPKB',
    ]
    return df.reindex(columns=layout)
    
def hapus_kolom(df):
    """Return ``df`` without the 'SHEET NAME', 'INTERVAL' and 'UPPKB' columns.

    The input frame is left untouched; a ``KeyError`` is raised if any of the
    three columns is absent.
    """
    unwanted = ['SHEET NAME', 'INTERVAL', 'UPPKB']
    return df.drop(columns=unwanted)

def perhitungan_smp(df):
    """Add the weighted vehicle columns and their rounded sum to ``df`` in place.

    'SMP_MTR' is MOTOR x 0.8, 'SMP_MBL' is MOBIL unchanged, 'SMP_TRK' is
    TRUK/BUS x 2.5, and 'SMP' is the sum of the three rounded to zero
    decimals. A NaN in any input propagates to 'SMP'. The same frame object is
    returned.
    """
    df['SMP_MTR'] = df['MOTOR'].mul(0.8)
    df['SMP_MBL'] = df['MOBIL']
    df['SMP_TRK'] = df['TRUK/BUS'].mul(2.5)

    weighted_sum = df['SMP_MTR'].add(df['SMP_MBL']).add(df['SMP_TRK'])
    df['SMP'] = weighted_sum.round()
    return df

# Helper that maps a date to its week number within the month
def week_of_month(dt):
    """Return the week of the month (1-5) for ``dt`` based on ``dt.day``.

    Days 1-7 are week 1, 8-14 week 2, 15-21 week 3, 22-28 week 4, and
    anything later is week 5.
    """
    # Last day belonging to weeks 1..4; the first bound that fits wins.
    for week, last_day in enumerate((7, 14, 21, 28), start=1):
        if dt.day <= last_day:
            return week
    return 5

In [3]:
# Monthly workbooks to process; each name is '<month number> <month name> <year>'.
nama_files = ['01 Januari 2024', '02 Februari 2024', '03 Maret 2024', '04 April 2024']

# Collect the per-month frames: observed rows and missing-slot rows.
all_dfs = []
all_miss = []

for nama_file in nama_files:
    bulan = nama_file.split()[0]
    tahun = nama_file.split()[2]
    # Global year-month prefix (e.g. '2024-01-'); penyesuaian_format and
    # mencari_data_hilang read it as a module-level name, so it must be set
    # before they are called.
    thn_bln = f'{tahun}-{bulan}-'
    
    # From here on nama_file carries the '.xlsx' extension.
    nama_file = nama_file + '.xlsx'
    file_path = '../Data/DataLHR/Data Losarang/2024/' + nama_file 
    
    # Read every sheet up to its 'Total' row.
    # NOTE(review): start_row is accepted by read_excel_until_total but not used there.
    result = read_excel_until_total(file_path, start_row=11)
    result = penyesuaian_format(result)
    missing_times, result = mencari_data_hilang(result, nama_file)
    # Apply the same column pipeline to the missing-slot frame and the observed frame.
    missing_times = pisahkan_kolom_waktu(missing_times)
    missing_times = hapus_kolom(missing_times)
    missing_times = perhitungan_smp(missing_times)
    result = pisahkan_kolom_waktu(result)
    result = hapus_kolom(result)
    result = perhitungan_smp(result)

    # Fill the PEKAN column (week of the month) from the timestamp.
    missing_times['PEKAN'] = missing_times['WAKTU'].apply(week_of_month)
    result['PEKAN'] = result['WAKTU'].apply(week_of_month)
    
    df = result.copy()

    # Add this month's frames to the lists.
    all_dfs.append(result)
    all_miss.append(missing_times)
    
    # Save this month's observed rows and missing slots to separate Excel files.
    df.to_excel('../Data/Preprocessing/Data Losarang/2024/1_input_'+nama_file, index=False)
    missing_times.to_excel('../Data/Preprocessing/Data Losarang/2024/1_missing_times_'+nama_file, index=False)
    

# Combine all months into one frame each.
df_combined = pd.concat(all_dfs,ignore_index=True)
df_combined_miss = pd.concat(all_miss,ignore_index=True)

Bulan 01 Ada data yang hilang
Bulan 02 Ada data yang hilang
Bulan 03 Ada data yang hilang
Bulan 04 Ada data yang hilang


In [4]:
# Preview the first rows of the combined observed data.
df_combined.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,PEKAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH,SMP_MTR,SMP_MBL,SMP_TRK,SMP
0,2024-01-01 00:00:00,2024,1,1,1,Monday,00:00,78,30,12,120.0,62.4,30,30.0,122.0
1,2024-01-01 00:15:00,2024,1,1,1,Monday,00:15,97,32,9,138.0,77.6,32,22.5,132.0
2,2024-01-01 00:30:00,2024,1,1,1,Monday,00:30,98,41,7,146.0,78.4,41,17.5,137.0
3,2024-01-01 00:45:00,2024,1,1,1,Monday,00:45,88,31,12,131.0,70.4,31,30.0,131.0
4,2024-01-01 01:00:00,2024,1,1,1,Monday,01:00,56,28,12,96.0,44.8,28,30.0,103.0


In [5]:
# Preview the first rows of the combined missing-slot table.
df_combined_miss.head()

Unnamed: 0,WAKTU,TAHUN,BULAN,PEKAN,TANGGAL,HARI,JAM,MOTOR,MOBIL,TRUK/BUS,JUMLAH,SMP_MTR,SMP_MBL,SMP_TRK,SMP
0,2024-01-02 01:15:00,2024,1,1,2,Tuesday,01:15,,,,,,,,
1,2024-01-02 02:00:00,2024,1,1,2,Tuesday,02:00,,,,,,,,
2,2024-01-02 05:30:00,2024,1,1,2,Tuesday,05:30,,,,,,,,
3,2024-01-02 06:30:00,2024,1,1,2,Tuesday,06:30,,,,,,,,
4,2024-01-02 07:30:00,2024,1,1,2,Tuesday,07:30,,,,,,,,


In [6]:
# Number of observed rows across all loaded months.
len(df_combined)

11434

In [7]:
# Row count before the manual removal below.
print(len(df_combined))
# Select the row stamped 2024-03-04 20:00:00 whose MOTOR count is 101.0.
# NOTE(review): presumably a duplicated reading for that slot -- confirm against the source sheet.
index_to_remove = df_combined[(df_combined['WAKTU'] == '2024-03-04 20:00:00') & (df_combined['MOTOR'] == 101.0)].index
print(index_to_remove)
# Drop the selected row and rebind df_combined to an independent copy.
data_cleaned = df_combined.drop(index_to_remove)
df_combined = data_cleaned.copy()

11434
Index([6037], dtype='int64')


In [8]:
def imputing_data_row(df, missing):
    """Estimate the counts of a missing slot from its two adjacent slots.

    Parameters
    ----------
    df : pandas.DataFrame
        Observed rows with a datetime 'WAKTU' column and numeric 'MOTOR',
        'MOBIL' and 'TRUK/BUS' columns.
    missing : mapping or pandas.Series
        Row describing the missing slot; only ``missing['WAKTU']`` is read.

    Returns
    -------
    tuple
        ``(MOTOR, MOBIL, TRUK)``: the rounded mean of the rows 15 minutes
        before and 15 minutes after the slot. ``(-1, -1, -1)`` is returned
        when the slot cannot be imputed, i.e. unless exactly one row exists on
        each side and every column has a usable mean.
    """
    slot = missing['WAKTU']
    quarter_hour = pd.Timedelta(minutes=15)
    before = df[df['WAKTU'] == slot - quarter_hour]
    after = df[df['WAKTU'] == slot + quarter_hour]

    not_imputable = (-1, -1, -1)

    # Require one row on EACH side. Checking only the combined length would
    # also accept two duplicate rows on one side and none on the other.
    if len(before) != 1 or len(after) != 1:
        return not_imputable

    neighbours = pd.concat([before, after])
    means = [neighbours[column].mean() for column in ('MOTOR', 'MOBIL', 'TRUK/BUS')]

    # A column that is NaN in both neighbours has no mean; round() would raise on it.
    if any(pd.isna(value) for value in means):
        return not_imputable

    MOTOR, MOBIL, TRUK = (round(value) for value in means)
    return MOTOR, MOBIL, TRUK

In [9]:
# Walk over every missing slot and fill it from its neighbouring slots where possible.
for i, missing in df_combined_miss.iterrows():
    MOTOR, MOBIL, TRUK = imputing_data_row(df_combined, missing)
    # imputing_data_row signals "cannot impute" with -1; only still-empty rows are filled.
    if (MOTOR != -1) & (np.isnan(missing['MOTOR'])):
        # NOTE(review): JUMLAH is not written here, so imputed rows keep their
        # existing JUMLAH value -- confirm this is intended.
        df_combined_miss.at[i, 'MOTOR'] = MOTOR
        df_combined_miss.at[i, 'MOBIL'] = MOBIL
        df_combined_miss.at[i, 'TRUK/BUS'] = TRUK

In [10]:
# Rows of the missing-slot table that now carry an imputed MOTOR value.
non_missing_motor = df_combined_miss[df_combined_miss['MOTOR'].notna()]
# Work on a copy so the SMP columns are not written into a slice of df_combined_miss.
non_missing_motor_smp = perhitungan_smp(non_missing_motor.copy())

# Append the imputed rows to the observed data and order everything by time.
# ignore_index=True plus reset_index gives df_combined a unique 0..n-1 index;
# keeping the original labels would duplicate labels shared by both frames.
df_combined = (
    pd.concat([df_combined, non_missing_motor_smp], ignore_index=True)
    .sort_values('WAKTU')
    .reset_index(drop=True)
)

# Keep only the slots that are still missing, as an independent frame.
df_combined_miss = df_combined_miss[df_combined_miss['MOTOR'].isna()].copy()

In [11]:
# Save the combined observed data and the remaining missing slots to Excel (row index omitted).
df_combined.to_excel('../Data/Preprocessing/Data Losarang/2024/1_input_losarang_2024.xlsx', index=False)
df_combined_miss.to_excel('../Data/Preprocessing/Data Losarang/2024/1_missing_times_losarang_2024.xlsx', index=False)

In [12]:
# Sanity check: filled rows + still-missing rows should equal the number of
# 15-minute slots in January-April 2024:
# 4 slots/hour * 24 hours * (31+29+31+30) days = 11616.
len(df_combined),len(df_combined_miss), len(df_combined) + len(df_combined_miss)

(11612, 4, 11616)

In [13]:
# List the distinct week-of-month values present in PEKAN.
unique_values = df_combined['PEKAN'].unique()
print("Nilai Unik di PEKAN:", unique_values)

Nilai Unik di PEKAN: [1 2 3 4 5]
