In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from google.colab import files
import io
import re

# ==========================================
# 1. UPLOAD FILE
# ==========================================
print("=== STEP 1: DATA INGESTION ===")
uploaded = files.upload()

# Reset up front so a cancelled upload or a failed parse never leaves a frame
# from an earlier run behind for the next cell to pick up.
df_raw = None

if len(uploaded) > 0:
    # Only the first uploaded file is processed.
    filename = next(iter(uploaded))
    print(f"\nSedang memproses file: {filename}...")
    file_content = io.BytesIO(uploaded[filename])

    # --- SMART LOADER LOGIC ---
    # Parsing strategies, tried in order until one succeeds:
    #   1. let the python engine sniff the separator,
    #   2. force ';',
    #   3. force ',' and skip malformed rows (emergency mode).
    load_attempts = [
        ("🔄 Mencoba membaca dengan Auto-Detect Separator...",
         "✅ Berhasil dengan Auto-Detect!",
         {'sep': None}),
        ("🔄 Mencoba membaca dengan Titik Koma (;)...",
         "✅ Berhasil dengan Titik Koma!",
         {'sep': ';'}),
        ("⚠️ Mencoba Mode Darurat (Skip Bad Lines)...",
         "✅ Berhasil (Baris error dilewati).",
         {'sep': ',', 'on_bad_lines': 'skip'}),
    ]

    for start_msg, success_msg, read_kwargs in load_attempts:
        file_content.seek(0)  # every attempt must start from the first byte
        print(start_msg)
        try:
            # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading byte-order
            # mark; the recorded output of the next cell shows one attached to
            # the first column name.
            df_raw = pd.read_csv(file_content, engine='python',
                                 encoding='utf-8-sig', **read_kwargs)
        except Exception as err:
            # Report why this strategy failed instead of swallowing it silently.
            print(f"   ↳ Gagal: {type(err).__name__}: {err}")
            continue
        print(success_msg)
        break
    else:
        # No strategy produced a frame; df_raw is still None.
        print("❌ GAGAL TOTAL. File tidak terbaca.")

    if df_raw is not None:
        print(f"\n📊 Data Loaded: {len(df_raw)} Baris, {len(df_raw.columns)} Kolom")
else:
    print("❌ Tidak ada file yang diupload.")

=== STEP 1: DATA INGESTION ===


Saving 20260123 Raw Data Pelanggan Aktiif v.1.1.csv to 20260123 Raw Data Pelanggan Aktiif v.1.1 (5).csv

Sedang memproses file: 20260123 Raw Data Pelanggan Aktiif v.1.1 (5).csv...
🔄 Mencoba membaca dengan Auto-Detect Separator...
✅ Berhasil dengan Auto-Detect!

📊 Data Loaded: 118316 Baris, 83 Kolom


In [None]:
print("=== STEP 2: DATA CLEANING & STANDARDIZATION ===")

if 'df_raw' in locals() and df_raw is not None:
    df = df_raw.copy()

    # 1. COLUMN MAPPING (give columns standard names)
    # ---------------------------------------------------------
    # Standard name -> accepted spellings (compared lower-cased and trimmed).
    target_columns = {
        'Customer_Name': ['namapelanggan', 'nama pelanggan', 'customer name', 'nama_pelanggan'],
        'Revenue': ['hargapelanggan', 'harga pelanggan', 'revenue', 'amount', 'total revenue', 'harga'],
        'Bandwidth_Mbps': ['bandwidth', 'bw', 'kapasitas'],
        'Contract_End_Date': ['tanggalakhirkontrak', 'tgl akhir kontrak', 'contract end date'],
        'Status': ['statuslayanan', 'status layanan', 'status'],
        'Region': ['wilayah', 'region', 'area'],
        'Product_Name': ['namalayanan', 'nama layanan', 'product name'],
        'Tier': ['kelompok tier', 'tier'],
        'Industry_Segment': ['segmenicon', 'segmen icon', 'segment']
    }

    print("🔍 Sedang mencocokkan nama kolom...")

    # Flatten to alias -> standard name. setdefault keeps the first target that
    # lists an alias, matching the first-match-wins order of the mapping above.
    alias_to_target = {}
    for target, variations in target_columns.items():
        for alias in variations:
            alias_to_target.setdefault(alias, target)

    # A standard name already present verbatim is reserved, and each standard
    # name is handed out only once: renaming two source columns to the same
    # label would make df['Revenue'] return a DataFrame instead of a Series.
    claimed_targets = {target for target in target_columns if target in df.columns}
    rename_map = {}
    for col in df.columns:
        if col in claimed_targets:
            continue
        # Drop a stray byte-order mark before comparing; str.strip() leaves it in place.
        col_key = str(col).replace('\ufeff', '').lower().strip()
        target = alias_to_target.get(col_key)
        if target is None:
            continue
        if target in claimed_targets:
            print(f"⚠️ Kolom '{col}' juga cocok dengan {target}; dilewati karena {target} sudah ada.")
            continue
        claimed_targets.add(target)
        rename_map[col] = target

    # Single rename instead of rebuilding the frame once per matched column.
    df = df.rename(columns=rename_map)

    # 2. FUNGSI CLEANING NAMA (Advanced)
    # ---------------------------------------------------------
    def clean_name_complete(text):
        """Normalize a customer name so spelling variants compare equal.

        Steps: trim, upper-case, remove legal-entity words (PT, CV, TBK,
        PERSERO, UD, YAYASAN, KOPERASI, PERUM, DINAS, LTD), replace remaining
        punctuation with spaces and collapse repeated whitespace.

        Args:
            text: Raw cell value; may be NaN/None or a non-string.

        Returns:
            The cleaned upper-case name, or "UNKNOWN" when the value is missing
            or nothing is left after cleaning (e.g. "PT." or "-").
        """
        if pd.isna(text):
            return "UNKNOWN"

        # [A]+[B] Trim leading/trailing whitespace, then upper-case.
        text = str(text).strip().upper()

        # [C] Remove legal-entity words.
        remove_patterns = [
            r'\bPT\.?\b', r'\bCV\.?\b', r'\bTBK\.?\b',
            r'\(PERSERO\)', r'\bPERSERO\b', r'\bUD\.?\b',
            r'\bYAYASAN\b', r'\bKOPERASI\b', r'\bPERUM\b',
            r'\bDINAS\b', r'\bLTD\.?\b'
        ]
        combined_pattern = '|'.join(remove_patterns)
        text = re.sub(combined_pattern, '', text)

        # [D] Replace leftover symbols (.,-) with spaces and collapse double spaces.
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        # A name that cleans down to nothing is treated like a missing one, so
        # placeholders do not form their own empty-string customer.
        return text or "UNKNOWN"

    if 'Customer_Name' in df.columns:
        print("🧹 Membersihkan Nama Pelanggan (Hapus PT, Spasi Awal, dll)...")
        df['Customer_Name'] = df['Customer_Name'].apply(clean_name_complete)

    # 3. NUMERIC CLEANING (Revenue & Bandwidth)
    # ---------------------------------------------------------
    def parse_revenue(val):
        """Parse one revenue cell into a float amount.

        Stripping every non-digit (the previous approach) also removed the
        decimal point, so 55135135.14 was read as 5513513514. This parser keeps
        the fractional part and accepts both Indonesian ("1.500.000,50") and
        international ("1,500,000.50") grouping.

        Args:
            val: Raw cell value (number, string such as "Rp 975.000", or NaN).

        Returns:
            The amount as float; 0.0 when the cell is missing or has no digits.
        """
        if pd.isna(val):
            return 0.0
        # Values pandas already parsed as numbers need no text processing.
        if isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(val, bool):
            return float(val)

        # Keep only digits, separators and the sign; drops "Rp", spaces, letters.
        text = re.sub(r'[^\d.,-]', '', str(val))
        negative = text.startswith('-')
        # Remaining dashes and trailing separators are decoration, as in "500.000,-".
        text = text.replace('-', '').strip('.,')
        if not any(ch.isdigit() for ch in text):
            return 0.0

        last_dot, last_comma = text.rfind('.'), text.rfind(',')
        decimal_sep = ''
        if last_dot != -1 and last_comma != -1:
            # Both kinds present: whichever comes last is the decimal mark.
            decimal_sep = '.' if last_dot > last_comma else ','
        elif last_dot != -1 or last_comma != -1:
            sep = '.' if last_dot != -1 else ','
            if text.count(sep) == 1:
                head, tail = text.split(sep)
                # A single separator followed by exactly three digits after a
                # 1-3 digit head ("975.000") is read as thousands grouping;
                # anything else ("975000.5", "55135135.14") as a decimal mark.
                if not (len(tail) == 3 and 1 <= len(head) <= 3):
                    decimal_sep = sep
            # A separator that repeats ("1.500.000") can only be grouping.

        if decimal_sep:
            whole, frac = text.rsplit(decimal_sep, 1)
        else:
            whole, frac = text, ''
        whole = re.sub(r'\D', '', whole) or '0'
        frac = re.sub(r'\D', '', frac) or '0'
        amount = float(f"{whole}.{frac}")
        return -amount if negative else amount

    if 'Revenue' in df.columns:
        # Strip "Rp" / letters and normalise separators; unparseable cells become 0.
        df['Revenue'] = df['Revenue'].apply(parse_revenue).astype(float)

    if 'Bandwidth_Mbps' in df.columns:
        def clean_bw(val):
            """Convert a free-text bandwidth value to a number of Mbps.

            The first number in the text is used (a decimal comma is accepted).
            The unit is taken from the token directly after that number
            (k/kb/kbps, m/mb/mbps, g/gb/gbps, with or without a space). When no
            unit follows the number, the text is searched for 'gb' / 'kb' as
            before. A value without a recognisable unit is assumed to be Mbps.

            Args:
                val: Raw cell value, e.g. "200 MBPS", "1G", "2,5 Gb", "Tidak Ada".

            Returns:
                Bandwidth in Mbps, or 0 when the text contains no number.
            """
            val = str(val).lower()
            normalized = val.replace(',', '.')

            # First number that appears in the text.
            number_match = re.search(r"[-+]?\d*\.\d+|\d+", normalized)
            if number_match is None:
                return 0
            num = float(number_match.group())

            # Unit attached to that number. The trailing lookahead stops ordinary
            # words that merely start with k/m/g from being read as units.
            unit_match = re.match(
                r"\s*([kmg])(?:bps|bit/s|bits|bit|b/s|b)?(?![a-z])",
                normalized[number_match.end():],
            )
            if unit_match is not None:
                prefix = unit_match.group(1)
            elif 'gb' in val:
                prefix = 'g'   # unit written elsewhere in the cell
            elif 'kb' in val:
                prefix = 'k'
            else:
                prefix = 'm'   # default: Mbps

            # Unit conversion to Mbps.
            if prefix == 'g':
                return num * 1000
            if prefix == 'k':
                return num / 1000
            return num

        df['Bandwidth_Mbps'] = df['Bandwidth_Mbps'].apply(clean_bw)

    # 4. FILTERING & DATES
    # ---------------------------------------------------------
    if 'Status' in df.columns:
        status_text = df['Status'].astype(str)
        # "AKTIF" as a word of its own: the lookarounds reject NONAKTIF / INAKTIF.
        says_active = status_text.str.contains(
            r'(?<![A-Z])AKTIF(?![A-Z])', case=False, na=False, regex=True)
        # A plain substring test also kept negated statuses such as "TIDAK AKTIF".
        says_inactive = status_text.str.contains(
            r'\b(?:TIDAK|TDK|NON|NOT|BELUM)[\s\-_]*AKTIF', case=False, na=False, regex=True)
        # .copy() so the column assignments below write to an independent frame
        # instead of a slice of the unfiltered one.
        df = df[says_active & ~says_inactive].copy()

    if 'Contract_End_Date' in df.columns:
        # NOTE(review): no explicit format is given, so ambiguous day-first
        # strings may be read month-first; confirm the source format.
        parsed_end = pd.to_datetime(df['Contract_End_Date'], errors='coerce')
        missing_dates = int(parsed_end.isna().sum())
        if missing_dates:
            print(f"ℹ️ {missing_dates} tanggal kontrak kosong/tidak valid diisi 2030-12-31.")
        # Blank or unparseable dates become 31 Dec 2030 (auto-renewal assumption).
        df['Contract_End_Date'] = parsed_end.fillna(pd.Timestamp('2030-12-31'))

    print("✅ Step 2 Selesai! Data sudah bersih.")
    display(df.head(3))

else:
    print("❌ Error: Harap jalankan CELL 1 terlebih dahulu.")

=== STEP 2: DATA CLEANING & STANDARDIZATION ===
🔍 Sedang mencocokkan nama kolom...
🧹 Membersihkan Nama Pelanggan (Hapus PT, Spasi Awal, dll)...
✅ Step 2 Selesai! Data sudah bersih.


Unnamed: 0,﻿idPerusahaan,noCustomerAx,idCustomerSap,idPelanggan,Customer_Name,emailPerusahaan,nomorKontrak,availabilityPerNode,latestMutasi,sid,...,Is it pullout 2024? (Adev),Is it Pullout (RAW)?,Kode Kategori,Status Pelanggan,Lama_Langganan,Kategori_Baru,Tier,Region,Margin,Bandwidth Fix
0,65773,0,200027394,131190003338,KANTOR PERTANAHAN KAB PURWOREJO SEKRETARIAT JE...,-,052/SPK-33.06.UP.03.01/XII/2024,999,AKTIVASI,131303002267,...,0,0,0-0-0,Aktif,1,Digital Infrastructure,DI-SDS-TS,JAWA-BALI,0,200 MBPS
1,65773,0,200027394,131190003338,KANTOR PERTANAHAN KAB PURWOREJO SEKRETARIAT JE...,-,052/SPK-33.06.UP.03.01/XII/2024,0,AKTIVASI,131610005797,...,0,0,0-0-0,Aktif,1,Technology Services,DI-SDS-TS,JAWA-BALI,0,Tidak Ada
2,65773,0,200027394,131190003338,KANTOR PERTANAHAN KAB PURWOREJO SEKRETARIAT JE...,-,052/SPK-33.06.UP.03.01/XII/2024,0,AKTIVASI,131610005805,...,0,0,0-0-0,Aktif,1,Technology Services,DI-SDS-TS,JAWA-BALI,0,Tidak Ada


In [None]:
print("=== STEP 3: STRATEGY & VISUALIZATION ===")

if 'df' in locals():
    # 1. MEDIAN THRESHOLDS
    # ---------------------------------------------------------
    # Only strictly positive values take part, so zero-valued rows do not pull
    # the thresholds down.
    median_rev = df.loc[df['Revenue'] > 0, 'Revenue'].median()
    median_bw = df.loc[df['Bandwidth_Mbps'] > 0, 'Bandwidth_Mbps'].median()

    # The median of an empty selection is NaN; use 0 in that case.
    median_rev = 0 if pd.isna(median_rev) else median_rev
    median_bw = 0 if pd.isna(median_bw) else median_bw

    print("📊 Market Thresholds:")
    print(f"   - Median Revenue: Rp {median_rev:,.0f}")
    print(f"   - Median Bandwidth: {median_bw:,.0f} Mbps")

    # 2. TENTUKAN KUADRAN (Logika Pak Hendi)
    # ---------------------------------------------------------
    def get_strategy(row, rev_threshold=None, bw_threshold=None):
        """Classify one customer row into a quadrant and its sales strategy.

        Args:
            row: Mapping or Series exposing 'Revenue' and 'Bandwidth_Mbps'.
            rev_threshold: Revenue cut-off; defaults to the notebook's
                `median_rev` when omitted.
            bw_threshold: Bandwidth cut-off; defaults to the notebook's
                `median_bw` when omitted.

        Returns:
            A (quadrant, strategy) tuple. A value equal to its threshold counts
            as "high".
        """
        # Fall back to the thresholds computed earlier in this cell so existing
        # one-argument calls keep working.
        if rev_threshold is None:
            rev_threshold = median_rev
        if bw_threshold is None:
            bw_threshold = median_bw

        is_high_rev = bool(row['Revenue'] >= rev_threshold)
        is_high_bw = bool(row['Bandwidth_Mbps'] >= bw_threshold)

        # (high revenue?, high bandwidth?) -> (quadrant, strategy)
        quadrants = {
            (True, True): ("Star Client", "RETENTION"),
            (True, False): ("Risk Area", "CROSS-SELL (High Margin)"),
            (False, True): ("Sniper Zone", "UPSELL (Fix Price)"),
            (False, False): ("Incubator", "AUTOMATION"),
        }
        return quadrants[(is_high_rev, is_high_bw)]

    # Label every row. Only the two columns get_strategy reads are passed, as a
    # plain dict, which avoids building a pandas Series per row (slow on large
    # frames) and also works when the frame is empty.
    strategy_pairs = [
        get_strategy({'Revenue': rev, 'Bandwidth_Mbps': bw})
        for rev, bw in zip(df['Revenue'], df['Bandwidth_Mbps'])
    ]
    df['Quadrant'] = [quadrant for quadrant, _ in strategy_pairs]
    df['Strategy'] = [strategy for _, strategy in strategy_pairs]

    # 3. SCATTER PLOT
    # ---------------------------------------------------------
    # One fixed colour per quadrant label.
    quadrant_palette = {
        "Star Client": "gold",
        "Risk Area": "red",
        "Sniper Zone": "blue",
        "Incubator": "grey",
    }
    fig = px.scatter(
        df,
        x='Bandwidth_Mbps',
        y='Revenue',
        color='Quadrant',
        color_discrete_map=quadrant_palette,
        hover_data=['Customer_Name', 'Strategy'],
        # Logarithmic axes on both dimensions so the points spread out.
        log_x=True,
        log_y=True,
        title="Strategic Matrix: Revenue vs Bandwidth",
    )
    # Dashed divider lines at the two median thresholds.
    fig.add_vline(x=median_bw, line_dash="dash", annotation_text="Med BW")
    fig.add_hline(y=median_rev, line_dash="dash", annotation_text="Med Rev")
    fig.show()

else:
    print("❌ Error: Harap jalankan CELL 2 terlebih dahulu.")

=== STEP 3: STRATEGY & VISUALIZATION ===
📊 Market Thresholds:
   - Median Revenue: Rp 975,000
   - Median Bandwidth: 20 Mbps


In [None]:
print("=== STEP 4: ACTIONABLE INSIGHTS ===")

if 'df' in locals() and 'median_rev' in locals():
    # Midnight of the current day. With a time-of-day component the floored
    # day difference to a date-only contract end comes out one day short, which
    # hid contracts ending tomorrow behind the `> 0` filter.
    today = pd.Timestamp.now().normalize()

    # ---------------------------------------------------------
    # REPORT 1: DEFENSE (High Value Churn Risk)
    # ---------------------------------------------------------
    print("\n🛡️ [DEFENSE] Pelanggan Kaya yang Kontraknya Mau Habis (<90 Hari):")

    if {'Revenue', 'Contract_End_Date'}.issubset(df.columns):
        df['Days_Expire'] = (df['Contract_End_Date'] - today).dt.days

        churn_risk = df[
            # ">=" matches the high-revenue definition used by get_strategy.
            (df['Revenue'] >= median_rev) &
            # Ends today (0) through 89 days from now; already expired rows are excluded.
            df['Days_Expire'].between(0, 89)
        ].sort_values('Revenue', ascending=False)

        if not churn_risk.empty:
            # Show only the report columns that exist in this dataset.
            report_cols = [
                c for c in ['Customer_Name', 'Revenue', 'Days_Expire', 'Region']
                if c in churn_risk.columns
            ]
            display(churn_risk[report_cols].head(5))
        else:
            print("✅ Aman. Tidak ada pelanggan High Value expired < 90 hari.")
    else:
        print("⚠️ Kolom Revenue / Contract_End_Date tidak tersedia; laporan dilewati.")
else:
    print("❌ Error: Harap jalankan CELL 3 terlebih dahulu.")

=== STEP 4: ACTIONABLE INSIGHTS ===

🛡️ [DEFENSE] Pelanggan Kaya yang Kontraknya Mau Habis (<90 Hari):


Unnamed: 0,Customer_Name,Revenue,Days_Expire,Region
40726,KEMENTERIAN ENERGI DAN SUMBER DAYA MINERAL,5513513514,60,JABODETABEK
40643,KEMENTERIAN ENERGI DAN SUMBER DAYA MINERAL,5513513514,60,JABODETABEK
117353,UNIVERSITAS PENDIDIKAN GANESHA,5234774774,1,JAWA-BALI
45688,POLISI DAERAH ACEH,5001914788,1,SUMATERA
46969,POLRES BINTAN KEPOLISIAN NEGARA REPUBLIK INDON...,3333333333,29,SUMATERA


In [None]:
# ---------------------------------------------------------
# REPORT 2: OFFENSE (AI Recommendation)
# ---------------------------------------------------------
# The cell carries its own guard. It used to start with an indented body and
# end in a dangling `else`, which raised IndentationError when run on its own.
if 'df' in locals() and 'Quadrant' in df.columns:
    print("\n⚔️ [OFFENSE] AI Product Recommendation (Look-alike Model):")

    # Categorical features available for the look-alike comparison.
    ml_cols = ['Region', 'Tier', 'Industry_Segment', 'Product_Name']
    available_ml = [c for c in ml_cols if c in df.columns]

    if len(available_ml) >= 2 and 'Product_Name' in df.columns:
        # Simple ML engine: one-hot encode the categorical features.
        df_ml = df[available_ml].fillna('Unknown').astype(str)
        encoded = OneHotEncoder().fit_transform(df_ml)

        # Row positions of the first 10 "Risk Area" customers (cross-sell targets).
        target_positions = np.flatnonzero((df['Quadrant'] == 'Risk Area').to_numpy())[:10]
        recommendations = []

        if len(target_positions) > 0:
            # Compare only the targets against all rows. The all-pairs matrix
            # grows with the square of the row count and does not fit in memory
            # for a dataset of this size.
            target_sims = cosine_similarity(encoded[target_positions], encoded)

            for sim_row, pos in zip(target_sims, target_positions):
                # Most similar first; the stable sort keeps ties in row order.
                ranked = np.argsort(-sim_row, kind='stable')
                # Remove the target itself explicitly: with tied scores it is
                # not guaranteed to be ranked first. Keep the 5 closest peers.
                peer_positions = ranked[ranked != pos][:5]

                customer = df.iloc[pos]
                curr_prod = customer['Product_Name']
                peer_prods = df.iloc[peer_positions]['Product_Name'].value_counts()

                # Most common peer product the customer does not already have.
                rec = next((p for p in peer_prods.index if p != curr_prod), None)

                if rec is not None:
                    if 'Region' in df.columns:
                        reason = f"Used by peers in {customer['Region']}"
                    else:
                        reason = "Used by look-alike peers"
                    recommendations.append({
                        'Customer': customer.get('Customer_Name', 'UNKNOWN'),
                        'Current Product': curr_prod,
                        'Recommended': rec,
                        'Reason': reason
                    })

        if recommendations:
            display(pd.DataFrame(recommendations))
        else:
            print("⚠️ Belum cukup pola data untuk rekomendasi.")
    else:
        print("⚠️ Kolom tidak cukup untuk ML (Butuh Region, Industry, Product_Name).")

else:
    print("❌ Error: Harap jalankan CELL 3 terlebih dahulu.")

IndentationError: unexpected indent (ipython-input-4281329652.py, line 4)