In [23]:
# ============================================================================
# TAHAP 1: DATA LOADING & INITIAL EXPLORATION
# PT Arkonin Engineering Manggala Pratama - Tender Prediction System
# ============================================================================

# Import library yang diperlukan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

print("=" * 80)
print("TAHAP 1: DATA LOADING & INITIAL EXPLORATION (REVISED)")
print("=" * 80)

# ============================================================================
# 1. LOAD DATA DENGAN MULTIPLE HEADER DETECTION
# ============================================================================
print("\n[1] Loading data dari Excel dengan deteksi header otomatis...")

file_path = "PP PT ARKONIN EMP Thn 2022 sd 2025.xlsx"

try:
    # Baca dulu beberapa baris untuk deteksi header
    df_preview = pd.read_excel(file_path, sheet_name=0, nrows=10, header=None)
    
    print("\nüìã Preview 10 Baris Pertama (Raw):")
    print(df_preview)
    
    # Deteksi baris header (baris yang memiliki paling banyak non-null values)
    header_row = None
    max_non_null = 0
    
    for idx in range(5):  # Check first 5 rows
        non_null_count = df_preview.iloc[idx].notna().sum()
        if non_null_count > max_non_null:
            max_non_null = non_null_count
            header_row = idx
    
    print(f"\nüîç Header terdeteksi di baris: {header_row}")
    
    # Baca ulang dengan header yang benar
    df_raw = pd.read_excel(file_path, sheet_name=0, header=header_row)
    
    # Cleaning column names
    df_raw.columns = df_raw.columns.str.strip()  # Remove whitespace
    
    print(f"\n‚úì Data berhasil dimuat!")
    print(f"  Jumlah baris: {len(df_raw)}")
    print(f"  Jumlah kolom: {len(df_raw.columns)}")
    
except FileNotFoundError:
    print(f"‚úó Error: File '{file_path}' tidak ditemukan!")
    exit()
except Exception as e:
    print(f"‚úó Error saat membaca file: {str(e)}")
    exit()


TAHAP 1: DATA LOADING & INITIAL EXPLORATION (REVISED)

[1] Loading data dari Excel dengan deteksi header otomatis...

üìã Preview 10 Baris Pertama (Raw):
    0                                                  1                     2                                                  3   4   5   6   7   8                            9          10  11      12             13  14             15                                                 16  17  18  19  20  21      22    23  24            25                                         26  27           28            29  30                    31         32  33                   34
0  NaN  DATA PENGALAMAN PERUSAHAAN PT. ARKONIN ENGINEE...                   NaN                                                NaN NaN NaN NaN NaN NaN                          NaN        NaN NaN     NaN            NaN NaN            NaN                                                NaN NaN NaN NaN NaN NaN     NaN   NaN NaN           NaN                              

In [25]:
# ============================================================================
# 2. IDENTIFIKASI KOLOM BERDASARKAN KONTEN
# ============================================================================
print("\n[2] Identifikasi Kolom Berdasarkan Keyword...")
print("-" * 80)

# Dictionary untuk mapping keyword ke kolom target
keyword_mapping = {
    'Nama Paket': ['nama', 'paket', 'pekerjaan', 'project'],
    'Klasifikasi': ['klasifikasi', 'sub klasifikasi', 'jenis'],
    'Lokasi': ['lokasi', 'tempat', 'kota', 'provinsi'],
    'Pengguna Jasa': ['pengguna', 'jasa', 'klien', 'owner'],
    'Nilai Kontrak': ['nilai', 'kontrak', 'harga', 'rp'],
    'Tanggal Mulai': ['tanggal', 'mulai', 'start', 'awal'],
    'Tanggal Selesai': ['selesai', 'akhir', 'finish', 'end']
}

detected_columns = {}

for target_col, keywords in keyword_mapping.items():
    for col in df_raw.columns:
        col_lower = str(col).lower()
        if any(keyword in col_lower for keyword in keywords):
            detected_columns[target_col] = col
            break

print("\n‚úì Kolom Terdeteksi:")
for target, actual in detected_columns.items():
    print(f"  ‚Ä¢ {target:20s} ‚Üí {actual}")

if len(detected_columns) == 0:
    print("\n‚ö†Ô∏è PERINGATAN: Tidak ada kolom yang terdeteksi otomatis!")
    print("   Menampilkan semua nama kolom untuk mapping manual:\n")
    for idx, col in enumerate(df_raw.columns, 1):
        print(f"   {idx}. {col}")


[2] Identifikasi Kolom Berdasarkan Keyword...
--------------------------------------------------------------------------------

‚úì Kolom Terdeteksi:
  ‚Ä¢ Nama Paket           ‚Üí Nama
  ‚Ä¢ Nilai Kontrak        ‚Üí Nilai (Rp.)
  ‚Ä¢ Tanggal Mulai        ‚Üí No./Tanggal
  ‚Ä¢ Tanggal Selesai      ‚Üí BA Peny. Lap. Akhir


In [26]:
# ============================================================================
# 3. CLEANING DATA AWAL
# ============================================================================
print("\n[3] Cleaning Data Awal...")
print("-" * 80)

# Remove rows yang semuanya NaN
df_clean = df_raw.dropna(how='all')
print(f"‚úì Baris kosong dihapus: {len(df_raw) - len(df_clean)} baris")

# Remove rows yang sepertinya header duplikat
header_keywords = ['nama', 'paket', 'lokasi', 'nilai', 'tanggal']
mask = df_clean.apply(
    lambda row: any(str(val).lower().strip() in header_keywords 
                    for val in row if pd.notna(val)), 
    axis=1
)
df_clean = df_clean[~mask]
print(f"‚úì Header duplikat dihapus")

# Reset index
df_clean = df_clean.reset_index(drop=True)

print(f"\n‚úì Data setelah cleaning:")
print(f"  Total baris: {len(df_clean)}")
print(f"  Total kolom: {len(df_clean.columns)}")



[3] Cleaning Data Awal...
--------------------------------------------------------------------------------
‚úì Baris kosong dihapus: 150 baris
‚úì Header duplikat dihapus

‚úì Data setelah cleaning:
  Total baris: 115
  Total kolom: 38


In [27]:
# ============================================================================
# 4. PREVIEW DATA BERSIH
# ============================================================================
print("\n[4] Preview Data Bersih...")
print("-" * 80)

print("\nüìã 5 Baris Pertama:")
print(df_clean.head())

print("\nüìä Info Tipe Data:")
print(df_clean.info())


[4] Preview Data Bersih...
--------------------------------------------------------------------------------

üìã 5 Baris Pertama:
  Unnamed: 0  Unnamed: 1  Unnamed: 2                                         Unnamed: 3  Unnamed: 4  Unnamed: 5  Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9 Unnamed: 10  Unnamed: 11  Unnamed: 12    Unnamed: 13  Unnamed: 14  Nama                                        Unnamed: 16  Unnamed: 17  Unnamed: 18  Unnamed: 19  Unnamed: 20  Unnamed: 21  Alamat Unnamed: 23  Unnamed: 24  No./Tanggal                                Unnamed: 26  Unnamed: 27  Nilai (Rp.)   Unnamed: 29  Unnamed: 30  Kontrak Unnamed: 32  Unnamed: 33  BA Peny. Lap. Akhir Unnamed: 35  Unnamed: 36 Unnamed: 37
0         1.         NaN         2.0                                                NaN         NaN         NaN         NaN         NaN         NaN         3.0         NaN          NaN          4.0            NaN          NaN   5.0                                                NaN    

In [28]:
# ============================================================================
# 5. IDENTIFIKASI MISSING VALUES
# ============================================================================
print("\n[5] Identifikasi Missing Values...")
print("-" * 80)

missing_data = pd.DataFrame({
    'Kolom': df_clean.columns,
    'Missing Count': df_clean.isnull().sum(),
    'Missing Percentage': (df_clean.isnull().sum() / len(df_clean) * 100).round(2)
})

missing_data = missing_data[missing_data['Missing Count'] > 0].sort_values(
    'Missing Percentage', ascending=False
).reset_index(drop=True)

if len(missing_data) > 0:
    print("\n‚ö†Ô∏è Kolom dengan Missing Values (Top 10):")
    print(missing_data.head(10).to_string(index=False))
else:
    print("\n‚úì Tidak ada missing values dalam dataset!")



[5] Identifikasi Missing Values...
--------------------------------------------------------------------------------

‚ö†Ô∏è Kolom dengan Missing Values (Top 10):
      Kolom  Missing Count  Missing Percentage
 Unnamed: 1            115               100.0
 Unnamed: 5            115               100.0
 Unnamed: 4            115               100.0
 Unnamed: 8            115               100.0
 Unnamed: 7            115               100.0
 Unnamed: 6            115               100.0
Unnamed: 20            115               100.0
Unnamed: 21            115               100.0
Unnamed: 27            115               100.0
Unnamed: 11            115               100.0


In [29]:
# ============================================================================
# 6. STATISTIK KOLOM PENTING
# ============================================================================
print("\n[6] Statistik Kolom Penting...")
print("-" * 80)

if detected_columns:
    for target_col, actual_col in detected_columns.items():
        if actual_col in df_clean.columns:
            print(f"\nüìä {target_col} ({actual_col}):")
            print(f"  - Total values: {df_clean[actual_col].notna().sum()}")
            print(f"  - Unique values: {df_clean[actual_col].nunique()}")
            
            # Sample values
            sample_values = df_clean[actual_col].dropna().head(3).tolist()
            print(f"  - Sample: {sample_values}")



[6] Statistik Kolom Penting...
--------------------------------------------------------------------------------

üìä Nama Paket (Nama):
  - Total values: 1
  - Unique values: 1
  - Sample: [5.0]

üìä Nilai Kontrak (Nilai (Rp.)):
  - Total values: 1
  - Unique values: 1
  - Sample: [8.0]

üìä Tanggal Mulai (No./Tanggal):
  - Total values: 1
  - Unique values: 1
  - Sample: [7.0]

üìä Tanggal Selesai (BA Peny. Lap. Akhir):
  - Total values: 1
  - Unique values: 1
  - Sample: [10.0]


In [30]:
# ============================================================================
# 7. EXPORT HASIL
# ============================================================================
print("\n[7] Menyimpan Hasil...")
print("-" * 80)

# Simpan cleaned data
df_clean.to_excel('01_data_cleaned.xlsx', index=False)
print("‚úì Data bersih disimpan ke: 01_data_cleaned.xlsx")

# Simpan mapping kolom
if detected_columns:
    with open('01_column_mapping.txt', 'w', encoding='utf-8') as f:
        f.write("COLUMN MAPPING\n")
        f.write("=" * 50 + "\n\n")
        for target, actual in detected_columns.items():
            f.write(f"{target:20s} ‚Üí {actual}\n")
    print("‚úì Column mapping disimpan ke: 01_column_mapping.txt")

# Report
summary_report = f"""
{'=' * 80}
LAPORAN EKSPLORASI DATA AWAL (REVISED)
PT Arkonin Engineering Manggala Pratama
Tanggal: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{'=' * 80}

1. INFORMASI DATASET
   - File: {file_path}
   - Header Row: {header_row}
   - Total Records (setelah cleaning): {len(df_clean)}
   - Total Kolom: {len(df_clean.columns)}

2. KOLOM TERDETEKSI
   {len(detected_columns)} kolom target berhasil diidentifikasi

3. DATA QUALITY
   - Missing Values: {df_clean.isnull().sum().sum()}
   - Completeness: {((1 - df_clean.isnull().sum().sum() / (len(df_clean) * len(df_clean.columns))) * 100):.1f}%

4. NEXT STEPS
   - Validasi column mapping
   - Data preprocessing lengkap
   - Feature engineering
   - Model development

{'=' * 80}
"""

print(summary_report)

with open('01_exploration_report_revised.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print("‚úì Report disimpan ke: 01_exploration_report_revised.txt")



[7] Menyimpan Hasil...
--------------------------------------------------------------------------------
‚úì Data bersih disimpan ke: 01_data_cleaned.xlsx
‚úì Column mapping disimpan ke: 01_column_mapping.txt

LAPORAN EKSPLORASI DATA AWAL (REVISED)
PT Arkonin Engineering Manggala Pratama
Tanggal: 2025-12-03 15:24:10

1. INFORMASI DATASET
   - File: PP PT ARKONIN EMP Thn 2022 sd 2025.xlsx
   - Header Row: 4
   - Total Records (setelah cleaning): 115
   - Total Kolom: 38

2. KOLOM TERDETEKSI
   4 kolom target berhasil diidentifikasi

3. DATA QUALITY
   - Missing Values: 3801
   - Completeness: 13.0%

4. NEXT STEPS
   - Validasi column mapping
   - Data preprocessing lengkap
   - Feature engineering
   - Model development


‚úì Report disimpan ke: 01_exploration_report_revised.txt


In [31]:
# ============================================================================
# 8. REKOMENDASI MANUAL MAPPING (JIKA PERLU)
# ============================================================================
if len(detected_columns) < 7:
    print("\n" + "=" * 80)
    print("‚ö†Ô∏è PERHATIAN: DIPERLUKAN MANUAL MAPPING")
    print("=" * 80)
    print("\nBeberapa kolom tidak terdeteksi otomatis.")
    print("Silakan periksa file Excel dan identifikasi kolom berikut:\n")
    
    required_cols = [
        'Nama Paket Pekerjaan',
        'Klasifikasi/Sub Klasifikasi',
        'Lokasi',
        'Pengguna Jasa',
        'Nilai Kontrak (Rp)',
        'Tanggal Mulai',
        'Tanggal Selesai'
    ]
    
    for col in required_cols:
        if col not in detected_columns:
            print(f"  ‚ùå {col}")
        else:
            print(f"  ‚úì {col}")
    
    print("\nüìù Daftar lengkap kolom dalam data:")
    for idx, col in enumerate(df_clean.columns, 1):
        print(f"   {idx:2d}. {col}")

print("\n" + "=" * 80)
print("‚úì TAHAP 1 (REVISED) SELESAI!")
print("=" * 80)


‚ö†Ô∏è PERHATIAN: DIPERLUKAN MANUAL MAPPING

Beberapa kolom tidak terdeteksi otomatis.
Silakan periksa file Excel dan identifikasi kolom berikut:

  ‚ùå Nama Paket Pekerjaan
  ‚ùå Klasifikasi/Sub Klasifikasi
  ‚ùå Lokasi
  ‚ùå Pengguna Jasa
  ‚ùå Nilai Kontrak (Rp)
  ‚úì Tanggal Mulai
  ‚úì Tanggal Selesai

üìù Daftar lengkap kolom dalam data:
    1. Unnamed: 0
    2. Unnamed: 1
    3. Unnamed: 2
    4. Unnamed: 3
    5. Unnamed: 4
    6. Unnamed: 5
    7. Unnamed: 6
    8. Unnamed: 7
    9. Unnamed: 8
   10. Unnamed: 9
   11. Unnamed: 10
   12. Unnamed: 11
   13. Unnamed: 12
   14. Unnamed: 13
   15. Unnamed: 14
   16. Nama
   17. Unnamed: 16
   18. Unnamed: 17
   19. Unnamed: 18
   20. Unnamed: 19
   21. Unnamed: 20
   22. Unnamed: 21
   23. Alamat
   24. Unnamed: 23
   25. Unnamed: 24
   26. No./Tanggal
   27. Unnamed: 26
   28. Unnamed: 27
   29. Nilai (Rp.)
   30. Unnamed: 29
   31. Unnamed: 30
   32. Kontrak
   33. Unnamed: 32
   34. Unnamed: 33
   35. BA Peny. Lap. Akhir
   36