<a href="https://colab.research.google.com/github/RITabayuni/Skripsi_Perbandingan_K-Prototypes_Agglomerative-Gower/blob/main/Prepocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**LIBRARY PREPARATION**

In [None]:
# Install kmodes (provides the KPrototypes class imported below) and openpyxl.
!pip install kmodes openpyxl

Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl.metadata (8.1 kB)
Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2


In [None]:
# Install gower (imported below); kmodes and openpyxl are listed again here.
!pip install gower kmodes openpyxl

Collecting gower
  Downloading gower-0.1.2-py3-none-any.whl.metadata (3.7 kB)
Downloading gower-0.1.2-py3-none-any.whl (5.2 kB)
Installing collected packages: gower
Successfully installed gower-0.1.2


In [None]:
from google.colab import drive, files
import glob, os, pandas as pd
import numpy as np
import math
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import gower
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from collections import defaultdict
from scipy.cluster.hierarchy import dendrogram

**NORMALISASI KOLOM**

In [None]:

# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Folder that is searched for the input files.
FOLDER = "Data"

# Collect every CSV/Excel file directly inside FOLDER. set() removes paths
# matched by more than one pattern; sorted() gives a deterministic order.
paths = sorted(set(
    glob.glob(os.path.join(FOLDER, "*.csv")) +
    glob.glob(os.path.join(FOLDER, "*.CSV")) +
    glob.glob(os.path.join(FOLDER, "*.xlsx")) +
    glob.glob(os.path.join(FOLDER, "*.xls"))
))
print(f"Found {len(paths)} files")

# Drop policy:
DROP_POLICY = "either"        # a row is removed if system or location is missing
TREAT_UNKNOWN_AS_MISSING = False  # "Unknown" is not treated as missing

# Columns that will be removed
DROP_COLS = [
    "accept_date","domain","ip_address","web_server","reason",
    "hackmode","mirror_image","state","def_grade","defacement_id"
]

# Strings that count as an empty cell; is_missing_series compares them against
# the stripped, lower-cased cell text.
BLANK_TOKENS = {"", "nan", "none", "null", "-", "--", "n/a", "na", "?"}

def read_any(p):
    """Load one input file into a DataFrame, whatever its format.

    Excel workbooks (``.xlsx`` / ``.xls``) are read with ``pd.read_excel``.
    Every other file is parsed as CSV, trying several encodings in turn and,
    as a last resort, the python engine with undecodable bytes ignored.

    Parameters
    ----------
    p : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    Exception
        Whatever the final fallback ``pd.read_csv`` call raises when no
        attempt succeeds.
    """
    if p.lower().endswith((".xlsx", ".xls")):
        return pd.read_excel(p)

    # ISO-8859-1 can decode *any* byte sequence, so once it is tried nothing
    # after it is ever reached for decoding problems. A UTF-16 file would thus
    # be read as garbage. Detect the UTF-16 byte-order mark up front and, when
    # present, try utf-16 first.
    with open(p, "rb") as fh:
        head = fh.read(2)
    if head in (b"\xff\xfe", b"\xfe\xff"):
        encodings = ("utf-16", "utf-8", "ISO-8859-1")
    else:
        encodings = ("utf-8", "ISO-8859-1", "utf-16")

    for enc in encodings:
        try:
            return pd.read_csv(p, encoding=enc)
        except (UnicodeError, pd.errors.ParserError):
            # Only decoding/tokenising problems justify trying the next
            # encoding; other errors (missing file, empty file) propagate.
            continue
    return pd.read_csv(p, engine="python", encoding_errors="ignore")

def norm_colnames(df):
    """Normalise the column labels of ``df`` in place.

    Each label is converted to text, stripped of surrounding whitespace,
    lower-cased, and has its spaces replaced by underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for call chaining.
    """
    # str(c) keeps non-string labels (e.g. numeric headers) from raising
    # AttributeError on .strip().
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df



Mounted at /content/drive
Found 34 files


**DROP MISSING VALUES**

In [None]:
def is_missing_series(s: pd.Series, treat_unknown=False):
    """Build a boolean mask marking the cells of ``s`` that count as missing.

    A cell is missing when it is null, or when its text (stripped and
    lower-cased) is one of ``BLANK_TOKENS``. With ``treat_unknown=True`` the
    text ``"unknown"`` is flagged as well.

    Parameters
    ----------
    s : pandas.Series or None
        Column to inspect. ``None`` yields an empty Series.
    treat_unknown : bool, default False
        Whether ``"unknown"`` is also treated as missing.

    Returns
    -------
    pandas.Series
        Mask aligned with ``s``.
    """
    if s is None:
        return pd.Series(True, index=range(0))

    # Compare on a normalised text view so " N/A " and "n/a" behave alike.
    normalized = s.astype(str).str.strip().str.lower()
    flagged = s.isna() | normalized.isin(BLANK_TOKENS)
    return (flagged | (normalized == "unknown")) if treat_unknown else flagged

# Cleaned frame of every input file, concatenated after the loop.
monthly_frames = []
# One dict of row counts per file (rows before / dropped / kept).
summary = []

for p in paths:
    print("Reading", p)
    df = read_any(p)
    if df is None or len(df) == 0:
        print("  [skip empty]")
        continue

    df = norm_colnames(df)

    # Guarantee both key columns exist; a column added here is entirely missing.
    for col in ["system", "location"]:
        if col not in df.columns:
            df[col] = pd.NA

    # Row count before any row is dropped.
    n0 = len(df)

    # drop the columns that are not used
    to_drop = [c for c in DROP_COLS if c in df.columns]
    if to_drop:
        df = df.drop(columns=to_drop, errors="ignore")

    # add file metadata: "ym" is the file name without extension
    base = os.path.basename(p)
    df["ym"] = os.path.splitext(base)[0]
    df["source_file"] = base

    # detect missing values for system/location
    miss_system   = is_missing_series(df["system"],   treat_unknown=TREAT_UNKNOWN_AS_MISSING)
    miss_location = is_missing_series(df["location"], treat_unknown=TREAT_UNKNOWN_AS_MISSING)

    # "both": drop only when both fields are missing; "either": drop when at least one is.
    if DROP_POLICY == "both":
        to_drop_mask = miss_system & miss_location
    elif DROP_POLICY == "either":
        to_drop_mask = miss_system | miss_location
    else:
        raise ValueError("DROP_POLICY harus 'both' atau 'either'")

    dropped = int(to_drop_mask.sum())
    kept = n0 - dropped

    # Keep the rows that are not flagged; .copy() detaches the result from the original frame.
    df = df.loc[~to_drop_mask].copy()

    summary.append({
        "file": base,
        "rows_before": n0,
        "dropped": dropped,
        "kept": kept,
        "drop_policy": DROP_POLICY,
        "treat_unknown_as_missing": TREAT_UNKNOWN_AS_MISSING
    })

    monthly_frames.append(df)

# Stack all files into one frame with a fresh 0..n-1 index (empty frame if nothing was loaded).
raw = pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()
print("Total rows loaded after drop:", len(raw))


Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202301.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202302.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202303.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202304.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202305.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202306.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202307.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202308.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202309.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202310.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/Defacements202311.csv
Reading /content/drive/MyDrive/KULIAH/Skripsi/Defacements/Full/De

In [None]:
# simpan data setelah drop
# Write the combined frame (after the missing-value drop) to CSV without the index column.
raw.to_csv("full_data_after_drop_all_months.csv", index=False)
print("Saved combined full data -> full_data_after_drop_all_months.csv")

Saved combined full data -> full_data_after_drop_all_months.csv


**DEDUPLIKASI TIPE SERANGAN**

In [None]:
# Column-name constants.
COL_YM        = "ym"
COL_ATTACKER  = "attacker"
COL_LOCATION  = "location"
COL_TYPE      = "type"
COL_ADD_DATE  = "add_date"
COL_REDEF     = "redefacement"
COL_SYSTEM    = "system"


# Deduplication:
# - Mass rows are deduplicated at date level (YYYY-MM-DD) by default
# - Regular rows are deduplicated on the full timestamp by default
DEDUP_USE_ADD_DATE_DAY_MASS     = True
DEDUP_USE_ADD_DATE_DAY_REGULAR  = False


def ensure_datetime_day_key(s: pd.Series) -> pd.Series:
    """Return the calendar day of each value as text.

    Datetime columns are used directly; anything else is parsed with
    ``pd.to_datetime(..., errors="coerce")`` first, so unparseable values
    become ``NaT`` before the conversion to text.

    Parameters
    ----------
    s : pandas.Series
        Datetime column, or a column of date/time values in another form.

    Returns
    -------
    pandas.Series
        The ``.dt.date`` values converted to ``str``, aligned with ``s``.
    """
    # pandas' own dtype check is used instead of np.issubdtype, which raises
    # TypeError on pandas extension dtypes (string, categorical, tz-aware
    # datetime) because NumPy cannot interpret them.
    if pd.api.types.is_datetime64_any_dtype(s):
        return s.dt.date.astype(str)
    return pd.to_datetime(s, errors="coerce").dt.date.astype(str)


def to_string_key(s: pd.Series) -> pd.Series:
    """Return the values of ``s`` as text, for use as a full-value key.

    Timezone-naive datetime columns are cast to ``datetime64[ns]`` before
    being converted to text; every other column is converted with
    ``astype(str)`` directly.

    Parameters
    ----------
    s : pandas.Series
        Column to convert.

    Returns
    -------
    pandas.Series
        Text version of ``s``, aligned with it.
    """
    # pandas' dtype check replaces np.issubdtype, which raises TypeError on
    # pandas extension dtypes. The naive-only check keeps tz-aware columns off
    # the datetime64[ns] cast and on the plain astype(str) path.
    if pd.api.types.is_datetime64_dtype(s):
        return s.astype("datetime64[ns]").astype(str)
    return s.astype(str)


def is_mass_type(series: pd.Series) -> pd.Series:
    """Flag the values whose text contains "mass", ignoring letter case.

    Values are converted to ``str`` first, so nulls are compared as text and
    produce ``False``.
    """
    lowered = series.astype(str).str.lower()
    # "mass" has no regex metacharacters, so a literal search is equivalent.
    return lowered.str.contains("mass", regex=False)


def dedup_all_global(df: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicate rows, with separate keys for mass and regular rows.

    Rows are split by ``is_mass_type(df[COL_TYPE])``. Mass rows are
    deduplicated on attacker, add-date key, location, redefacement and system;
    regular rows use the same columns plus the type column. The add-date key is
    the calendar day or the full value, chosen per group by
    ``DEDUP_USE_ADD_DATE_DAY_MASS`` / ``DEDUP_USE_ADD_DATE_DAY_REGULAR``.

    Each group is stably sorted by its key columns and the first row of every
    key is kept. The result is the regular rows followed by the mass rows, with
    a fresh index. Before/after counts are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.

    Returns
    -------
    pandas.DataFrame
        Deduplicated rows, or ``df`` itself (after a warning) when a required
        column is absent.
    """
    needed = (COL_TYPE, COL_ATTACKER, COL_ADD_DATE, COL_LOCATION, COL_REDEF, COL_SYSTEM)
    absent = [name for name in needed if name not in df.columns]
    if absent:
        print(f"[WARN] Lewatkan dedup karena kolom wajib hilang: {absent}")
        return df

    def _collapse(selector, add_key, key_cols):
        """Return (deduplicated subset, rows before, rows after) for the selected rows."""
        part = df[selector].copy()
        if part.empty:
            return part, 0, 0
        # The temporary "_add_key" column only exists while sorting/deduplicating.
        part = part.assign(_add_key=add_key[selector]).sort_values(key_cols, kind="stable")
        rows_in = len(part)
        part = part.drop_duplicates(subset=key_cols, keep="first")
        part = part.drop(columns=["_add_key"], errors="ignore")
        return part, rows_in, len(part)

    # ----- totals before -----
    before_total = len(df)

    mask_mass = is_mass_type(df[COL_TYPE])

    # Pick day-level or full-value keys for each group.
    mass_keyer = ensure_datetime_day_key if DEDUP_USE_ADD_DATE_DAY_MASS else to_string_key
    add_key_mass = mass_keyer(df[COL_ADD_DATE])
    reg_keyer = ensure_datetime_day_key if DEDUP_USE_ADD_DATE_DAY_REGULAR else to_string_key
    add_key_reg = reg_keyer(df[COL_ADD_DATE])

    shared_cols = [COL_ATTACKER, "_add_key", COL_LOCATION, COL_REDEF, COL_SYSTEM]

    # ----- mass rows: type is not part of the key -----
    mass_part, before_mass, after_mass = _collapse(mask_mass, add_key_mass, shared_cols)

    # ----- regular rows: type is part of the key -----
    regular_part, before_reg, after_reg = _collapse(
        ~mask_mass, add_key_reg, [COL_TYPE] + shared_cols
    )

    out = pd.concat([regular_part, mass_part], ignore_index=True)

    # ----- totals after -----
    after_total = len(out)
    removed_total = before_total - after_total

    # Short report of what was removed.
    print(f"[DEDUP] Total: {before_total:,} -> {after_total:,} (removed {removed_total:,})")
    print(f"[DEDUP] Regular: {before_reg:,} -> {after_reg:,} (removed {before_reg-after_reg:,})")
    print(f"[DEDUP] Mass: {before_mass:,} -> {after_mass:,} (removed {before_mass-after_mass:,})")

    return out

# Rebind raw to its deduplicated version; re-running this cell deduplicates the already-deduplicated frame.
raw = dedup_all_global(raw)



[DEDUP] Total: 331,901 -> 141,560 (removed 190,341)
[DEDUP] Regular: 144,668 -> 131,182 (removed 13,486)
[DEDUP] Mass: 187,233 -> 10,378 (removed 176,855)


**SIMPAN DATA SETELAH DEDUPLIKASI**

In [None]:
# Write the deduplicated frame to CSV without the index column.
raw.to_csv("data_after_dedup.csv", index=False)
print("Saved combined full data -> data_after_dedup.csv")

Saved combined full data -> data_after_dedup.csv


**SIMPLE RANDOM SAMPLING DAN PENGKATEGORIAN TOP 10 ATTACKER LOCATION**

In [None]:
import os
import pandas as pd
import re

def add_year_month_from_ym(df, col_ym="ym"):
    """Add numeric ``year`` and ``month`` columns parsed from ``df[col_ym]``.

    The last six characters of each value must be digits (``YYYYMM``); the
    first four become ``year`` and the last two ``month``. Values that do not
    end in six digits produce NaN in both columns. ``df`` is modified in place
    and returned; when ``col_ym`` is absent it is returned untouched.
    """
    if col_ym in df.columns:
        digits = df[col_ym].astype(str).str.extract(r"(\d{6})$", expand=False)
        # (target column, start, stop) within the six-digit block.
        for target, start, stop in (("year", 0, 4), ("month", 4, 6)):
            df[target] = pd.to_numeric(digits.str.slice(start, stop), errors="coerce")
    return df

def topk_coverage(series, ks=(5,10,15,20,30,50)):
    """Return the percentage of rows covered by the k most frequent values.

    Nulls are counted as the category ``"NA"``. The result maps each ``k`` in
    ``ks`` to the summed share (0-100) of the ``k`` most common values.
    """
    labels = series.fillna("NA").astype(str)
    share_pct = labels.value_counts(normalize=True) * 100
    return {k: float(share_pct.head(k).sum()) for k in ks}

def apply_topk_to_newcol(df, src_col, dst_col, k=10, other="Other"):
    """Write a grouped copy of ``src_col`` into ``dst_col``.

    The ``k`` most frequent values of ``src_col`` (nulls counted as ``"NA"``,
    everything compared as text) are kept; every other value is replaced by
    ``other``. ``df`` is modified in place and returned.
    """
    values = df[src_col].fillna("NA").astype(str)
    frequent = values.value_counts().index[:k]
    is_frequent = values.isin(frequent)
    df[dst_col] = values.mask(~is_frequent, other)
    return df

def random_sample_pipeline(
    raw,
    frac,
    out_dir,
    random_state=42,
    # column names
    col_ym="ym",
    col_attacker="attacker",
    col_location="location",
    # grouping output (new columns)
    attacker_group_col="attacker_group",
    location_group_col="location_group",
    topk_attacker=10,
    topk_location=10,
    other="Other",
    # flags
    do_topk=True,
    do_coverage=True,
    # dropped before clustering
    drop_cols_for_clustering=("ym", "source_file")
):
    """Draw a simple random sample of ``raw``, enrich it, and save it as CSV.

    Steps:
      1. Sample ``round(len(raw) * frac)`` rows without replacement.
      2. Add ``year`` / ``month`` columns derived from ``col_ym``.
      3. If ``do_topk``: add grouped attacker/location columns that keep the
         top-k values *of the sample* and map everything else to ``other``.
      4. Drop ``drop_cols_for_clustering`` and write the result to
         ``<out_dir>/<pct>%_random_sampled.csv``.
      5. Print the 15 most frequent ``col_ym`` values of the sample.

    Parameters
    ----------
    raw : pandas.DataFrame
        Population to sample from.
    frac : float
        Fraction of rows to draw (for example ``0.10``).
    out_dir : str
        Output directory; created if it does not exist.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample``.
    col_ym, col_attacker, col_location : str
        Names of the source columns.
    attacker_group_col, location_group_col : str
        Names of the grouped columns that are added.
    topk_attacker, topk_location : int
        Number of most frequent values kept per grouped column.
    other : str
        Label given to values outside the top-k.
    do_topk : bool
        Whether the grouped columns are added.
    do_coverage : bool
        Accepted for compatibility; not used by this function.
    drop_cols_for_clustering : iterable of str
        Columns removed from the frame that is written to disk.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(sampled, cluster_df)``: the enriched sample, and the same frame
        without ``drop_cols_for_clustering`` (the one saved to CSV).
    """
    os.makedirs(out_dir, exist_ok=True)

    # Percentage label for the header and file name. int(frac*100) truncates
    # floating-point error (0.29*100 == 28.999999999999996 -> 28), which would
    # mislabel the sample and could overwrite the real 28% file. ":g" rounds to
    # the nearest short decimal and still prints whole percentages as "10".
    pct_label = f"{frac * 100:g}"

    # 1) random sampling
    N = len(raw)
    n_target = min(int(round(N * frac)), N)

    sampled = raw.sample(n=n_target, replace=False, random_state=random_state).copy()
    print(f"\n=== {pct_label}% RANDOM | n={len(sampled):,} ===")

    # 2) add year & month
    sampled = add_year_month_from_ym(sampled, col_ym=col_ym)

    # 3) top-k grouping (computed on the sample)
    if do_topk:
        if col_attacker in sampled.columns:
            sampled = apply_topk_to_newcol(
                sampled, src_col=col_attacker, dst_col=attacker_group_col,
                k=topk_attacker, other=other
            )
        if col_location in sampled.columns:
            sampled = apply_topk_to_newcol(
                sampled, src_col=col_location, dst_col=location_group_col,
                k=topk_location, other=other
            )

    # 4) drop the bookkeeping columns and save the clustering input
    cluster_df = sampled.drop(
        columns=[c for c in drop_cols_for_clustering if c in sampled.columns],
        errors="ignore"
    )

    out_cluster = os.path.join(out_dir, f"{pct_label}%_random_sampled.csv")
    cluster_df.to_csv(out_cluster, index=False)
    print("Saved (for clustering) ->", out_cluster)

    # 5) show the most frequent months (display only)
    if col_ym in sampled.columns:
        print("\nTop months (count) [display only]:")
        print(sampled.groupby(col_ym).size().rename("n").sort_values(ascending=False).head(15))

    return sampled, cluster_df


**PANGGIL FUNCTION**

In [None]:
# Directory that receives the sampled CSV files.
OUT_DIR = "Data"

# Draw 10%, 20% and 30% samples of raw with the same seed; each call returns (sampled, cluster_df).
s10, s10_cluster = random_sample_pipeline(raw, 0.10, OUT_DIR, random_state=42)
s20, s20_cluster = random_sample_pipeline(raw, 0.20, OUT_DIR, random_state=42)
s30, s30_cluster = random_sample_pipeline(raw, 0.30, OUT_DIR, random_state=42)



=== 10% RANDOM | n=14,156 ===
Saved (for clustering) -> /content/drive/MyDrive/KULIAH/Skripsi/Defacements/RandomSampled/10%_random_sampled.csv

Top months (count) [display only]:
ym
Defacements202510    960
Defacements202301    853
Defacements202303    764
Defacements202304    700
Defacements202302    586
Defacements202305    522
Defacements202403    522
Defacements202308    509
Defacements202509    507
Defacements202307    499
Defacements202507    460
Defacements202404    447
Defacements202405    441
Defacements202505    420
Defacements202406    412
Name: n, dtype: int64

=== 20% RANDOM | n=28,312 ===
Saved (for clustering) -> /content/drive/MyDrive/KULIAH/Skripsi/Defacements/RandomSampled/20%_random_sampled.csv

Top months (count) [display only]:
ym
Defacements202510    1922
Defacements202301    1662
Defacements202303    1570
Defacements202304    1343
Defacements202302    1166
Defacements202305    1089
Defacements202509    1056
Defacements202308    1023
Defacements202403    1018
Def

In [None]:
# Draw 15% to 19% samples in 1% steps, again with seed 42; each call writes its own CSV into OUT_DIR.
s15, s15_cluster = random_sample_pipeline(raw, 0.15, OUT_DIR, random_state=42)
s16, s16_cluster = random_sample_pipeline(raw, 0.16, OUT_DIR, random_state=42)
s17, s17_cluster = random_sample_pipeline(raw, 0.17, OUT_DIR, random_state=42)
s18, s18_cluster = random_sample_pipeline(raw, 0.18, OUT_DIR, random_state=42)
s19, s19_cluster = random_sample_pipeline(raw, 0.19, OUT_DIR, random_state=42)


=== 15% RANDOM | n=21,234 ===
Saved (for clustering) -> /content/drive/MyDrive/KULIAH/Skripsi/Defacements/RandomSampled/15%_random_sampled.csv

Top months (count) [display only]:
ym
Defacements202510    1452
Defacements202301    1253
Defacements202303    1158
Defacements202304    1040
Defacements202302     882
Defacements202305     809
Defacements202509     801
Defacements202403     790
Defacements202308     760
Defacements202307     720
Defacements202507     658
Defacements202405     653
Defacements202505     626
Defacements202404     622
Defacements202506     612
Name: n, dtype: int64

=== 16% RANDOM | n=22,650 ===
Saved (for clustering) -> /content/drive/MyDrive/KULIAH/Skripsi/Defacements/RandomSampled/16%_random_sampled.csv

Top months (count) [display only]:
ym
Defacements202510    1554
Defacements202301    1321
Defacements202303    1229
Defacements202304    1099
Defacements202302     936
Defacements202305     872
Defacements202509     856
Defacements202403     840
Defacements202