# Pemusatan dan Penyebaran Data untuk Data pengeluaran mahasiswa undergraduate di universitas yang berada di USA

## Load data dan Library

In [102]:
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
import statistics as s
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [103]:
# Location of the dataset. The original absolute path stays as the default so
# existing runs are unaffected; set the DATA_PATH environment variable to point
# the notebook at the CSV on another machine.
DATA_PATH = os.environ.get(
    'DATA_PATH',
    '/home/radit/MachineLearning/Statistika-Polinema/Tugas-1/data/data.csv'
)
df = pd.read_csv(DATA_PATH)

print('='* 80)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
df.info()
print('='* 80)
print(df.describe())
print('='* 80)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3548 entries, 0 to 3547
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Year     3548 non-null   int64 
 1   State    3548 non-null   object
 2   Type     3548 non-null   object
 3   Length   3548 non-null   object
 4   Expense  3548 non-null   object
 5   Value    3548 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 166.4+ KB
None
              Year         Value
count  3548.000000   3548.000000
mean   2016.923337  13027.720124
std       2.553910   8734.568645
min    2013.000000   1225.000000
25%    2015.000000   7756.750000
50%    2017.000000  10203.500000
75%    2019.000000  14830.750000
max    2021.000000  49152.000000


## Pemusatan Data

### Mencari Mean, Median dan Modus

In [104]:
# Central tendency of the 'Value' column: mean, median and mode.
values = df['Value'].dropna()

mean_val = np.mean(values)
median_val = np.median(values)
# Request a scalar mode explicitly (axis=None, keepdims=False) and keep only the
# value. This matches how the histogram cell computes modus_val, so the name
# always refers to a plain number, and the result does not depend on the
# keepdims default of the installed SciPy version.
modus_val = stats.mode(values, axis=None, keepdims=False).mode


print("=" * 50)
print("       STATISTIK DESKRIPTIF — KOLOM VALUE")
print("=" * 50)
print(f"  {'Jumlah Data':<25} : {len(values):>10,}")
print(f"  {'Mean':<25} : ${mean_val:>10,.2f}")
print(f"  {'Median':<25} : ${median_val:>10,.2f}")
print(f"  {'Modus':<25} : ${modus_val:>10,.2f}")

       STATISTIK DESKRIPTIF â€” KOLOM VALUE
  Jumlah Data               :      3,548
  Mean                      : $ 13,027.72
  Median                    : $ 10,203.50
  Modus                     : $ 10,043.00


In [105]:
# Histogram of 'Value' annotated with the mean, median and mode, plus a shaded
# band one sample standard deviation either side of the mean.
# `values`, `mean_val` and `median_val` are reused from the previous cell.
modus_val = stats.mode(values, axis=None, keepdims=False).mode
# The shaded band below needs the standard deviation. The name used to be
# assigned only in the later box-plot cell, so running the notebook top to
# bottom on a fresh kernel stopped here with a NameError.
std_val = np.std(values, ddof=1)

fig = go.Figure()

fig.add_trace(go.Histogram(
    x=values,
    nbinsx=30,
    name='Distribusi Value',
    marker=dict(
        color='steelblue',
        line=dict(color='white', width=0.8)
    ),
    opacity=0.75,
    hovertemplate="<b>Range:</b> %{x}<br><b>Frekuensi:</b> %{y}<extra></extra>"
))

# One vertical marker per statistic:
# (annotation label, value, colour, dash style, annotation position)
center_markers = [
    ('Mean',   mean_val,   'red',    'dash',     'top left'),
    ('Median', median_val, 'green',  'dot',      'top right'),
    ('Modus',  modus_val,  'orange', 'longdash', 'bottom right'),
]

for label, value, color, dash, position in center_markers:
    fig.add_vline(
        x=value,
        line=dict(color=color, dash=dash, width=2.5),
        annotation=dict(
            text=f"<b>{label}</b><br>${value:,.0f}",
            font=dict(color=color, size=11),
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor=color,
            borderwidth=1,
            borderpad=4,
            align='left'
        ),
        annotation_position=position
    )

fig.update_layout(
    title=dict(
        text='Distribusi Value dengan Mean, Median & Modus',
        font=dict(size=17),
        x=0.5
    ),
    xaxis=dict(
        title='Value ($)',
        tickprefix='$',
        tickformat=',.0f',
        showgrid=True,
        gridcolor='lightgrey'
    ),
    yaxis=dict(
        title='Frekuensi',
        showgrid=True,
        gridcolor='lightgrey'
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    bargap=0.08,
    height=520,
    margin=dict(t=80, b=60, l=60, r=60),
    legend=dict(
        orientation='h',
        y=-0.15,
        x=0.5,
        xanchor='center'
    ),
    showlegend=True
)

# Shade mean ± 1 standard deviation.
fig.add_vrect(
    x0=mean_val - std_val,
    x1=mean_val + std_val,
    fillcolor='red',
    opacity=0.07,
    line_width=0,
    annotation_text='±1 Std Dev',
    annotation_position='top left',
    annotation_font=dict(color='red', size=10)
)

# The vertical lines are layout shapes and get no legend entry of their own,
# so add empty dummy traces that carry the matching line styles.
for name, color, dash in [
    ('Mean', 'red', 'dash'),
    ('Median', 'green', 'dot'),
    ('Mode', 'orange', 'longdash')
]:
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='lines',
        name=name,
        line=dict(color=color, dash=dash, width=2.5)
    ))

fig.show()

Distribusi data bersifat Right Skewed (condong ke kanan), yang ditunjukkan oleh:

Mode ($10,043) < Median ($10,204) < Mean ($13,028)

Artinya sebagian besar mahasiswa mengeluarkan biaya di kisaran $10,000, namun terdapat sebagian kecil mahasiswa dengan pengeluaran sangat tinggi (hingga ~$50,000) yang menarik nilai Mean ke kanan menjadi $13,028, jauh di atas Mode dan Median.

## Penyebaran Data

### Rentang(Range)

In [106]:
# Range of the 'Value' column: smallest value, largest value and their difference.
values = df['Value'].dropna()

min_val   = np.min(values)
max_val   = np.max(values)
range_val = max_val - min_val

print("=" * 45)
print("    RANGE DATA — KOLOM VALUE")
print("=" * 45)
print(f"  {'Nilai Minimum':<20} : ${min_val:>10,.2f}")
print(f"  {'Nilai Maximum':<20} : ${max_val:>10,.2f}")
print(f"  {'Range (Max - Min)':<20} : ${range_val:>10,.2f}")
print("=" * 45)
# Short narrative summary of the same three numbers.
print("\nRentang pengeluaran mahasiswa berkisar")
print(f"antara ${min_val:,.2f} hingga ${max_val:,.2f}")
print(f"dengan selisih sebesar ${range_val:,.2f}")

    RANGE DATA â€” KOLOM VALUE
  Nilai Minimum        : $  1,225.00
  Nilai Maximum        : $ 49,152.00
  Range (Max - Min)    : $ 47,927.00

Rentang pengeluaran mahasiswa berkisar
antara $1,225.00 hingga $49,152.00
dengan selisih sebesar $47,927.00


### Interquartile Range (IQR)

In [107]:
# ============================================================
# RANGE & INTERQUARTILE RANGE (IQR) — 'Value' column
# ============================================================
values = df['Value'].dropna()

min_val   = np.min(values)
max_val   = np.max(values)
range_val = max_val - min_val
q1_val    = np.percentile(values, 25)
q3_val    = np.percentile(values, 75)
iqr_val   = q3_val - q1_val

# ============================================================
# PLOTLY TABLE
# ============================================================
metrics = ['Nilai Minimum', 'Q1 (25%)', 'Median (50%)', 'Q3 (75%)', 'Nilai Maximum', 'Range (Max - Min)', 'IQR (Q3 - Q1)']
values_list = [min_val, q1_val, np.median(values), q3_val, max_val, range_val, iqr_val]
formatted   = [f"${v:,.2f}" for v in values_list]
keterangan  = [
    'Pengeluaran terendah',
    '25% data berada di bawah nilai ini',
    '50% data berada di bawah nilai ini',
    '75% data berada di bawah nilai ini',
    'Pengeluaran tertinggi',
    'Selisih nilai Max dan Min',
    'Rentang data tengah (robust terhadap outlier)'
]

# Alternating row shading: one colour per row, the same list for each of the
# three table columns.
stripe = ['#f9f9f9' if i % 2 == 0 else '#ffffff' for i in range(len(metrics))]
row_colors = [stripe] * 3

fig_table = go.Figure(data=[go.Table(
    columnwidth=[200, 150, 350],
    header=dict(
        values=['<b>Metrik</b>', '<b>Nilai</b>', '<b>Keterangan</b>'],
        fill_color='steelblue',
        font=dict(color='white', size=13),
        align='center',
        height=40
    ),
    cells=dict(
        values=[metrics, formatted, keterangan],
        fill_color=row_colors,
        font=dict(color='#333333', size=12),
        align=['left', 'center', 'left'],
        height=35
    )
)])

fig_table.update_layout(
    title=dict(
        text='Tabel Range Data — Kolom Value',
        font=dict(size=17),
        x=0.5
    ),
    margin=dict(t=80, b=40, l=40, r=40),
    paper_bgcolor='white',
    height=380
)

fig_table.show()

Rentang pengeluaran mahasiswa sangat lebar dengan Range $47,927 (Max $49,152 − Min $1,225), namun angka ini ditarik oleh outlier ekstrem di sisi atas.


Gambaran lebih representatif ditunjukkan oleh IQR sebesar $7,074, artinya 50% mahasiswa di tengah mengeluarkan biaya antara $7,757 (Q1) hingga $14,831 (Q3) — rentang yang jauh lebih sempit dibanding Range keseluruhan.

### Varians (Variance) dan Standard Deviation

In [108]:
# ============================================================
# VARIANCE & STANDARD DEVIATION — 'Value' column
# ============================================================
values = df['Value'].dropna()

var_population = np.var(values, ddof=0)   # population variance (divide by N)
var_sample     = np.var(values, ddof=1)   # sample variance (divide by N-1)
std_population = np.std(values, ddof=0)   # population standard deviation
std_sample     = np.std(values, ddof=1)   # sample standard deviation
mean_val       = np.mean(values)
n              = len(values)

# ============================================================
# PLOTLY TABLE
# ============================================================
metrics    = [
    'Jumlah Data (n)',
    'Mean',
    'Varians Populasi (σ²)',
    'Varians Sampel (s²)',
    'Std Dev Populasi (σ)',
    'Std Dev Sampel (s)'
]
# Raw numbers behind the formatted column; kept for inspection in later cells.
vals       = [n, mean_val, var_population, var_sample, std_population, std_sample]
# Variance is measured in squared dollars, so it is labelled "$²" rather than
# given a "$" prefix like the mean and the standard deviation.
formatted  = [
    f"{n:,}",
    f"${mean_val:,.2f}",
    f"{var_population:,.2f} $²",
    f"{var_sample:,.2f} $²",
    f"${std_population:,.2f}",
    f"${std_sample:,.2f}"
]
keterangan = [
    'Total observasi dalam dataset',
    'Rata-rata pengeluaran keseluruhan',
    'Varians dengan pembagi N (seluruh populasi)',
    'Varians dengan pembagi N-1 (estimasi dari sampel)',
    'Akar kuadrat dari Varians Populasi',
    'Akar kuadrat dari Varians Sampel'
]
formula = [
    'n',
    'Σx / n',
    'Σ(x - μ)² / N',
    'Σ(x - x̄)² / (N-1)',
    '√σ²',
    '√s²'
]

# Alternating row shading: one colour per row, the same list for each of the
# four table columns.
stripe = ['#f9f9f9' if i % 2 == 0 else '#ffffff' for i in range(len(metrics))]
row_colors = [stripe] * 4

fig_table = go.Figure(data=[go.Table(
    columnwidth=[180, 120, 100, 320],
    header=dict(
        values=['<b>Metrik</b>', '<b>Nilai</b>', '<b>Formula</b>', '<b>Keterangan</b>'],
        fill_color='steelblue',
        font=dict(color='white', size=13),
        align='center',
        height=40
    ),
    cells=dict(
        values=[metrics, formatted, formula, keterangan],
        fill_color=row_colors,
        font=dict(color='#333333', size=12),
        align=['left', 'center', 'center', 'left'],
        height=35
    )
)])

fig_table.update_layout(
    title=dict(
        text='📊 Tabel Varians (Variance) — Kolom Value',
        font=dict(size=17),
        x=0.5
    ),
    margin=dict(t=80, b=40, l=40, r=40),
    paper_bgcolor='white',
    height=360
)


fig_table.show()

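Pengeluaran mahasiswa memiliki Std Dev sebesar $8,735 dari rata-rata $13,028, artinya data tersebar sangat lebar — rentang satu standar deviasi di sekitar rata-rata adalah $4,293 hingga $21,762. Menurut aturan empiris, rentang ini memuat sekitar 68% data jika distribusinya mendekati normal; karena data ini right skewed, angka tersebut hanya perkiraan kasar.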


Varians yang sangat besar (sekitar 76,292,689, dalam satuan dolar kuadrat) mengkonfirmasi bahwa pengeluaran antar mahasiswa sangat bervariasi dan tidak seragam, dengan kesenjangan biaya yang signifikan antar individu.

### Box Plot

In [109]:
# Box plot of 'Value' with a mean line and a side panel of summary statistics.
values   = df['Value'].dropna()

# ============================================================
# SUMMARY STATISTICS (shown in the annotation panel)
# ============================================================
mean_val   = np.mean(values)
median_val = np.median(values)
q1_val     = np.percentile(values, 25)
q3_val     = np.percentile(values, 75)
iqr_val    = q3_val - q1_val
min_val    = np.min(values)
max_val    = np.max(values)
std_val    = np.std(values, ddof=1)     # sample standard deviation

# ============================================================
# BOXPLOT
# ============================================================
fig = go.Figure()

fig.add_trace(go.Box(
    y=values,
    name='Value',
    marker=dict(
        color='steelblue',
        outliercolor='red',
        size=4,
        line=dict(outliercolor='red', outlierwidth=1.5)
    ),
    line=dict(color='steelblue', width=2),
    fillcolor='rgba(70,130,180,0.3)',
    boxmean='sd',                        # draw the mean and std-dev marker
    boxpoints='outliers',                # plot only the outlier points
    hovertemplate=(
        "<b>%{y:$,.0f}</b><extra></extra>"
    )
))

# Horizontal reference line at the mean
fig.add_hline(
    y=mean_val,
    line=dict(color='red', dash='dash', width=1.8),
    annotation_text=f'Mean: ${mean_val:,.0f}',
    annotation_position='top right',
    annotation_font=dict(color='red', size=11)
)

# ============================================================
# STATISTICS ANNOTATION
# ============================================================
stats_text = (
    f"<b>Statistik Deskriptif</b><br>"
    f"─────────────────<br>"
    f"Min    : ${min_val:,.0f}<br>"
    f"Q1     : ${q1_val:,.0f}<br>"
    f"Median : ${median_val:,.0f}<br>"
    f"Mean   : ${mean_val:,.0f}<br>"
    f"Q3     : ${q3_val:,.0f}<br>"
    f"Max    : ${max_val:,.0f}<br>"
    f"IQR    : ${iqr_val:,.0f}<br>"
    f"Std Dev: ${std_val:,.0f}"
)

# x > 1 in paper coordinates places the panel in the right margin, which the
# layout below widens (r=200) to make room for it.
fig.add_annotation(
    x=1.32, y=0.5,
    xref='paper', yref='paper',
    text=stats_text,
    showarrow=False,
    align='left',
    bgcolor='lightyellow',
    bordercolor='grey',
    borderwidth=1,
    borderpad=8,
    font=dict(size=11)
)

fig.update_layout(
    title=dict(
        text='Boxplot Distribusi Pengeluaran Mahasiswa (Value)',
        font=dict(size=17),
        x=0.5
    ),
    yaxis=dict(
        title='Value ($)',
        tickprefix='$',
        tickformat=',.0f',
        showgrid=True,
        gridcolor='lightgrey'
    ),
    xaxis=dict(showticklabels=False),
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=560,
    width=750,
    margin=dict(t=80, b=60, l=80, r=200),
    showlegend=False
)

fig.show()

Boxplot memperlihatkan distribusi pengeluaran yang right skewed, ditandai dengan Mean ($13,028) yang lebih tinggi dari Median ($10,204); artinya nilai tengah data sebenarnya lebih rendah daripada rata-ratanya. Sebanyak 50% mahasiswa mengeluarkan biaya dalam rentang $7,757 (Q1) hingga $14,831 (Q3) dengan IQR $7,074, namun terdapat outlier ekstrem yang mencapai $49,152, terlihat jelas sebagai titik-titik di atas whisker atas.

### Hapus Outlier

In [110]:
# ============================================================
# REMOVE OUTLIERS WITH THE IQR RULE — 1.5 × IQR fences
# ============================================================
Q1  = df['Value'].quantile(0.25)
Q3  = df['Value'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows inside the fences (inclusive); .copy() so later edits to
# df_clean never touch df.
df_clean = df[(df['Value'] >= lower_bound) & (df['Value'] <= upper_bound)].copy()

# ============================================================
# BEFORE / AFTER COMPARISON FIGURE
# ============================================================
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Boxplot — Sebelum (Original)',
        'Boxplot — Sesudah (Clean)',
        'Histogram — Sebelum (Original)',
        'Histogram — Sesudah (Clean)'
    ),
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# ============================================================
# BOX PLOT — BEFORE
# ============================================================
fig.add_trace(go.Box(
    y=df['Value'],
    name='Original',
    marker=dict(color='crimson', outliercolor='red', size=3),
    fillcolor='rgba(220,50,50,0.2)',
    line=dict(color='crimson', width=2),
    boxmean='sd',
    boxpoints='outliers',
    hovertemplate="<b>$%{y:,.0f}</b><extra>Original</extra>"
), row=1, col=1)

# ============================================================
# BOX PLOT — AFTER
# ============================================================
fig.add_trace(go.Box(
    y=df_clean['Value'],
    name='Clean',
    marker=dict(color='steelblue', size=3),
    fillcolor='rgba(70,130,180,0.2)',
    line=dict(color='steelblue', width=2),
    boxmean='sd',
    boxpoints='outliers',
    hovertemplate="<b>$%{y:,.0f}</b><extra>Clean</extra>"
), row=1, col=2)

# ============================================================
# HISTOGRAM — BEFORE
# ============================================================
fig.add_trace(go.Histogram(
    x=df['Value'],
    nbinsx=40,
    name='Original',
    marker=dict(color='crimson', line=dict(color='white', width=0.5)),
    opacity=0.75,
    hovertemplate="<b>Range:</b> $%{x:,.0f}<br><b>Frekuensi:</b> %{y}<extra>Original</extra>"
), row=2, col=1)

# ============================================================
# HISTOGRAM — AFTER
# ============================================================
fig.add_trace(go.Histogram(
    x=df_clean['Value'],
    nbinsx=40,
    name='Clean',
    marker=dict(color='steelblue', line=dict(color='white', width=0.5)),
    opacity=0.75,
    hovertemplate="<b>Range:</b> $%{x:,.0f}<br><b>Frekuensi:</b> %{y}<extra>Clean</extra>"
), row=2, col=2)

# ============================================================
# LOWER & UPPER FENCE LINES
# ============================================================
# Box plots have the value on the y axis, so the fences are horizontal there...
for col in [1, 2]:
    fig.add_hline(y=lower_bound, line=dict(color='orange', dash='dash', width=1.5),
                  annotation_text=f'Lower: ${lower_bound:,.0f}',
                  annotation_font=dict(color='orange', size=9),
                  row=1, col=col)
    fig.add_hline(y=upper_bound, line=dict(color='orange', dash='dash', width=1.5),
                  annotation_text=f'Upper: ${upper_bound:,.0f}',
                  annotation_font=dict(color='orange', size=9),
                  row=1, col=col)

# ...and vertical on the histograms, where the value is on the x axis.
for col in [1, 2]:
    fig.add_vline(x=lower_bound, line=dict(color='orange', dash='dash', width=1.5),
                  row=2, col=col)
    fig.add_vline(x=upper_bound, line=dict(color='orange', dash='dash', width=1.5),
                  row=2, col=col)

# ============================================================
# LAYOUT
# ============================================================
fig.update_yaxes(tickprefix='$', tickformat=',.0f', showgrid=True, gridcolor='lightgrey')
fig.update_xaxes(tickprefix='$', tickformat=',.0f', showgrid=False, row=2)

# Row counts before/after and the share removed go into the subtitle.
fig.update_layout(
    title=dict(
        text=(
            f'Perbandingan Sebelum vs Sesudah Hapus Outlier (IQR × 1.5)<br>'
            f'<sup>Data Awal: {len(df):,} baris  →  Data Clean: {len(df_clean):,} baris  '
            f'| Outlier Dihapus: {len(df) - len(df_clean):,} baris '
            f'({(len(df) - len(df_clean)) / len(df) * 100:.2f}%)</sup>'
        ),
        font=dict(size=16),
        x=0.5
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=720,
    margin=dict(t=100, b=60, l=80, r=60),
    showlegend=False
)

fig.show()

Proses outlier removal berhasil menghapus 420 baris (11.84%) dari total 3,548 data, menyisakan 3,128 data bersih yang lebih representatif.


Perubahan signifikan yang terlihat: Boxplot — Setelah cleaning, rentang data menyempit drastis dari hampir $50,000 menjadi maksimal sekitar $25,430.


Box (IQR) terlihat lebih proporsional dan whisker lebih seimbang, menandakan distribusi yang lebih simetris dan stabil. Histogram — Data original menunjukkan ekor panjang ke kanan yang ekstrem (right skewed), sedangkan data clean memperlihatkan distribusi yang lebih terpusat dan rapi di sekitar $8,000–$12,000 meski tetap sedikit right skewed.




### Shifting (Pergeseran)

In [111]:
# ============================================================
# DATA SHIFTING — three reference points (mean, min, median)
# ============================================================
shift_mean   = df_clean['Value'].mean()
shift_min    = df_clean['Value'].min()
shift_median = df_clean['Value'].median()

# One independent copy of the cleaned data per shifting method.
df_shifted_mean   = df_clean.copy()
df_shifted_min    = df_clean.copy()
df_shifted_median = df_clean.copy()

df_shifted_mean['Value']   = df_clean['Value'] - shift_mean      # mean becomes 0
df_shifted_min['Value']    = df_clean['Value'] - shift_min       # minimum becomes 0
df_shifted_median['Value'] = df_clean['Value'] - shift_median    # median becomes 0

print("=" * 55)
print("     DATA SHIFTING — KOLOM VALUE")
print("=" * 55)
print(f"  {'Shift Mean':<25} : ${shift_mean:>10,.2f}")
print(f"  {'Shift Min':<25} : ${shift_min:>10,.2f}")
print(f"  {'Shift Median':<25} : ${shift_median:>10,.2f}")
print("-" * 55)
# Min / max / mean of each shifted series, in the same layout for all three.
for method, shifted in [
    ('Mean',   df_shifted_mean),
    ('Min',    df_shifted_min),
    ('Median', df_shifted_median),
]:
    print(f"\n  Setelah Shifting {method}:")
    print(f"  {'  Min':<25} : ${shifted['Value'].min():>10,.2f}")
    print(f"  {'  Max':<25} : ${shifted['Value'].max():>10,.2f}")
    print(f"  {'  Mean':<25} : ${shifted['Value'].mean():>10,.2f}")
print("=" * 55)


# ============================================================
# COMPARISON FIGURE
# ============================================================
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Original Clean Data',
        'Shift by Mean (Mean = 0)',
        'Shift by Min (Min = 0)',
        'Shift by Median (Median = 0)'
    ),
    vertical_spacing=0.18,
    horizontal_spacing=0.1
)

# (series, trace name, colour, subplot row, subplot column)
configs = [
    (df_clean['Value'],          'Clean',          'steelblue',  1, 1),
    (df_shifted_mean['Value'],   'Shift Mean',     'royalblue',  1, 2),
    (df_shifted_min['Value'],    'Shift Min',      'seagreen',   2, 1),
    (df_shifted_median['Value'], 'Shift Median',   'darkorange', 2, 2),
]

for data, name, color, row, col in configs:
    # Histogram
    fig.add_trace(go.Histogram(
        x=data,
        nbinsx=40,
        name=name,
        marker=dict(color=color, line=dict(color='white', width=0.5)),
        opacity=0.80,
        hovertemplate=f"<b>Range:</b> %{{x:,.0f}}<br><b>Frekuensi:</b> %{{y}}<extra>{name}</extra>"
    ), row=row, col=col)

    # Mean line
    fig.add_vline(
        x=data.mean(),
        line=dict(color='red', dash='dash', width=1.8),
        annotation_text=f"Mean: {data.mean():,.0f}",
        annotation_font=dict(color='red', size=9),
        annotation_position='top right',
        row=row, col=col
    )

    # Median line
    fig.add_vline(
        x=data.median(),
        line=dict(color='gold', dash='dot', width=1.8),
        annotation_text=f"Median: {data.median():,.0f}",
        annotation_font=dict(color='goldenrod', size=9),
        annotation_position='top left',
        row=row, col=col
    )

fig.update_xaxes(tickformat=',.0f', showgrid=False)
fig.update_yaxes(title_text='Frekuensi', showgrid=True, gridcolor='lightgrey')

fig.update_layout(
    title=dict(
        text=(
            'Data Shifting — Perbandingan 3 Metode<br>'
            f'<sup>n = {len(df_clean):,} | '
            f'Shift Mean = ${shift_mean:,.2f} | '
            f'Shift Min = ${shift_min:,.2f} | '
            f'Shift Median = ${shift_median:,.2f}</sup>'
        ),
        font=dict(size=16),
        x=0.5
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=680,
    margin=dict(t=110, b=60, l=60, r=60),
    showlegend=False
)

fig.show()

     DATA SHIFTING â€” KOLOM VALUE
  Shift Mean                : $ 10,448.17
  Shift Min                 : $  1,225.00
  Shift Median              : $  9,678.00
-------------------------------------------------------

  Setelah Shifting Mean:
    Min                     : $ -9,223.17
    Max                     : $ 14,981.83
    Mean                    : $     -0.00

  Setelah Shifting Min:
    Min                     : $      0.00
    Max                     : $ 24,205.00
    Mean                    : $  9,223.17

  Setelah Shifting Median:
    Min                     : $ -8,453.00
    Max                     : $ 15,752.00
    Mean                    : $    770.17


Ketiga metode shifting tidak mengubah bentuk distribusi — pola right skewed tetap konsisten di semua panel, hanya posisi data di sumbu X yang bergeser.


Shift by Mean (Mean = 0) — Seluruh data digeser sehingga Mean tepat di angka 0, menghasilkan nilai negatif di sisi kiri hingga sekitar -$9,223 dan positif di sisi kanan hingga sekitar +$14,982. Paling cocok untuk PCA dan algoritma berbasis jarak.


Shift by Min (Min = 0) — Nilai terkecil dijadikan titik nol sehingga semua nilai menjadi tidak negatif (0 hingga ~$24,000). Bentuk distribusi identik dengan original, hanya digeser ke kiri. Cocok ketika nilai negatif tidak diperbolehkan.


Shift by Median (Median = 0) — Median dijadikan titik referensi nol, menghasilkan rentang -$8,453 hingga +$15,752. Mean yang tersisa sebesar $770 menunjukkan data masih sedikit right skewed. Pilihan paling robust untuk data yang memiliki outlier.

### Scaling (Penyekalaan)

In [112]:
# ============================================================
# SCALING — Standard Scaler & Min-Max Normalization
# ============================================================
# scikit-learn scalers expect a 2-D array, hence the single-column reshape.
values_clean = df_clean['Value'].values.reshape(-1, 1)

# Standard Scaler (z-score): mean = 0, std = 1
scaler_standard          = StandardScaler()
values_standard          = scaler_standard.fit_transform(values_clean).flatten()

# Min-Max Normalization: range [0, 1]
scaler_minmax            = MinMaxScaler()
values_minmax            = scaler_minmax.fit_transform(values_clean).flatten()

# Store each scaled series in its own copy of the cleaned frame
df_standard              = df_clean.copy()
df_standard['Value']     = values_standard

df_normalized            = df_clean.copy()
df_normalized['Value']   = values_minmax

# ============================================================
# COMPARISON SUMMARY
# ============================================================
# All three standard deviations use ddof=0. pandas .std() defaults to ddof=1
# while NumPy .std() defaults to ddof=0, so the defaults would put two
# different definitions in one column.
summary = {
    'Metode'   : ['Clean (Original)', 'Standard Scaler', 'Min-Max Normalization'],
    'Min'      : [df_clean['Value'].min(),    values_standard.min(),  values_minmax.min()],
    'Max'      : [df_clean['Value'].max(),    values_standard.max(),  values_minmax.max()],
    'Mean'     : [df_clean['Value'].mean(),   values_standard.mean(), values_minmax.mean()],
    'Std Dev'  : [df_clean['Value'].std(ddof=0), values_standard.std(ddof=0), values_minmax.std(ddof=0)],
    'Median'   : [df_clean['Value'].median(), np.median(values_standard), np.median(values_minmax)],
}

# The summary used to be built and then never shown; print it as a table.
summary_df = pd.DataFrame(summary).set_index('Metode')
print(summary_df.round(4).to_string())

# ============================================================
# FIGURE
# ============================================================
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=(
        'Clean (Original)',
        'Standard Scaler',
        'Min-Max Normalization',
        'Boxplot — Clean',
        'Boxplot — Standard Scaler',
        'Boxplot — Min-Max Normalization'
    ),
    vertical_spacing=0.18,
    horizontal_spacing=0.1
)

# (data, trace name, colour, subplot row, subplot column)
configs = [
    (df_clean['Value'],  'Original',   'steelblue',  1, 1),
    (values_standard,    'Standard',   'royalblue',  1, 2),
    (values_minmax,      'Min-Max',    'seagreen',   1, 3),
]

# ============================================================
# HISTOGRAMS (top row)
# ============================================================
for data, name, color, row, col in configs:
    fig.add_trace(go.Histogram(
        x=data,
        nbinsx=40,
        name=name,
        marker=dict(color=color, line=dict(color='white', width=0.5)),
        opacity=0.80,
        hovertemplate=f"<b>Range:</b> %{{x:,.4f}}<br><b>Frekuensi:</b> %{{y}}<extra>{name}</extra>"
    ), row=row, col=col)

    # Mean line
    fig.add_vline(
        x=np.mean(data),
        line=dict(color='red', dash='dash', width=1.8),
        annotation_text=f"Mean: {np.mean(data):.3f}",
        annotation_font=dict(color='red', size=9),
        annotation_position='top right',
        row=row, col=col
    )

    # Median line
    fig.add_vline(
        x=np.median(data),
        line=dict(color='gold', dash='dot', width=1.8),
        annotation_text=f"Median: {np.median(data):.3f}",
        annotation_font=dict(color='goldenrod', size=9),
        annotation_position='top left',
        row=row, col=col
    )

# ============================================================
# BOX PLOTS (bottom row)
# ============================================================
box_configs = [
    (df_clean['Value'], 'Original', 'steelblue', 2, 1),
    (values_standard,   'Standard', 'royalblue', 2, 2),
    (values_minmax,     'Min-Max',  'seagreen',  2, 3),
]

# Translucent fill matching each line colour; any other colour falls back to
# the sea-green fill, as the previous conditional chain did.
box_fill = {
    'steelblue': 'rgba(70,130,180,0.2)',
    'royalblue': 'rgba(65,105,225,0.2)',
    'seagreen':  'rgba(46,139,87,0.2)',
}

for data, name, color, row, col in box_configs:
    fig.add_trace(go.Box(
        y=data,
        name=name,
        marker=dict(color=color, outliercolor='red', size=3),
        fillcolor=box_fill.get(color, 'rgba(46,139,87,0.2)'),
        line=dict(color=color, width=2),
        boxmean='sd',
        boxpoints='outliers',
        hovertemplate=f"<b>%{{y:,.4f}}</b><extra>{name}</extra>"
    ), row=row, col=col)

# ============================================================
# LAYOUT
# ============================================================
fig.update_xaxes(tickformat=',.2f', showgrid=False)
fig.update_yaxes(showgrid=True, gridcolor='lightgrey')

fig.update_layout(
    title=dict(
        text=(
            'Perbandingan: Clean Data vs Standard Scaler vs Min-Max Normalization<br>'
            f'<sup>n = {len(df_clean):,} | '
            f'Original Range: ${df_clean["Value"].min():,.0f} – ${df_clean["Value"].max():,.0f} | '
            f'Standard: {values_standard.min():.2f} – {values_standard.max():.2f} | '
            f'Min-Max: {values_minmax.min():.2f} – {values_minmax.max():.2f}</sup>'
        ),
        font=dict(size=15),
        x=0.5
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=720,
    margin=dict(t=110, b=60, l=60, r=60),
    showlegend=False
)

fig.show()

Clean Data (Original) memiliki skala pengeluaran dalam satuan dollar dengan rentang $1,225 hingga $25,430, Mean $10,448, dan Median $9,678.


Setelah diterapkan Standard Scaler, skala berubah menjadi rentang -1.79 hingga +2.90 dengan Mean = 0 dan Std Dev = 1.


Setelah diterapkan Min-Max Normalization, skala berubah menjadi rentang 0.00 hingga 1.00 dengan Mean = 0.381 dan Median = 0.349 — seluruh nilai kini dalam bentuk proporsi di mana $1,225 menjadi 0.00 dan $25,430 menjadi 1.00.