In [25]:
import pandas as pd
import numpy as np
from scipy.stats import t  # distribusi t, untuk hitung p-value manual

# Read the full feature table from a single CSV file.
df = pd.read_csv('train_features.csv')

# Columns removed from both groups so that only the remaining columns are
# iterated as features later; errors='ignore' tolerates absent columns.
non_feature_cols = ['Set_ID', 'Label', 'jumlah_motion_artifacts', 'siklus_motion_artifacts']

# Row masks for the two Label values.
label_is_low = df['Label'] == 0
label_is_high = df['Label'] == 1

# Split the rows into the two groups: Label 0 -> low, Label 1 -> high.
low = df.loc[label_is_low].drop(columns=non_feature_cols, errors='ignore')
high = df.loc[label_is_high].drop(columns=non_feature_cols, errors='ignore')

# Hand-written sample variance (no library variance helper is used).
def sample_variance(data):
    """Return the sample variance of *data* using the n - 1 denominator.

    The arithmetic mean is computed first, then the sum of squared
    deviations from that mean is divided by ``len(data) - 1``.
    *data* must support ``len()`` and iteration over numeric values.
    """
    count = len(data)
    average = sum(data) / count
    total_sq_dev = sum((value - average) ** 2 for value in data)
    return total_sq_dev / (count - 1)

# Map a two-sided p-value to the verdict label stored in the results table.
# Defined once here instead of being re-created on every loop iteration.
def interpret_p(p):
    """Return the significance label for p-value *p*.

    p < 0.005 is labelled "very significant", p < 0.05 "significant", and
    everything else (including NaN, for which both comparisons are False)
    "not significant".
    """
    if p < 0.005:
        return "✨ Sangat signifikan (bagus!)"
    if p < 0.05:
        return "⚠️ Signifikan"
    return "❌ Tidak signifikan"


# One dict per feature that could actually be tested.
results = []

# Run Welch's unequal-variance t-test (low vs. high) on every feature column.
for col in low.columns:
    x1 = low[col].dropna().values
    x2 = high[col].dropna().values
    n1, n2 = len(x1), len(x2)

    # The sample variance needs at least two observations in each group.
    if n1 < 2 or n2 < 2:
        print(f"⚠️ Fitur '{col}' tidak cukup data untuk uji t.")
        continue

    m1, m2 = np.mean(x1), np.mean(x2)
    s1_sq = sample_variance(x1)
    s2_sq = sample_variance(x2)

    # Squared standard error of the difference between the two means.
    se_sq = s1_sq / n1 + s2_sq / n2

    # Both groups constant -> zero standard error: the statistic and the
    # degrees of freedom would be 0/0 (or x/0), so the test is undefined.
    if se_sq == 0:
        print(f"⚠️ Fitur '{col}' konstan di kedua kelompok (variansi nol); uji t dilewati.")
        continue

    # Welch's t-statistic.
    t_stat = (m1 - m2) / np.sqrt(se_sq)

    # Welch–Satterthwaite approximation of the degrees of freedom.
    numerator = se_sq ** 2
    denominator = ((s1_sq / n1) ** 2) / (n1 - 1) + ((s2_sq / n2) ** 2) / (n2 - 1)
    dfree = numerator / denominator

    # Two-sided p-value. The survival function is used instead of
    # 1 - cdf because the subtraction cancels catastrophically for large
    # |t| and would round small p-values to exactly 0.
    p_val = 2 * t.sf(abs(t_stat), dfree)

    results.append({
        'Fitur': col,
        'T_stat': t_stat,
        'p_TTest': p_val,
        'Keterangan_TTest': interpret_p(p_val)
    })

# Collect the per-feature rows into a DataFrame.
df_result = pd.DataFrame(results)

# Display formatting: fixed 6 decimals for floats (global pandas option).
# NOTE: p-values below 5e-7 are therefore printed as 0.000000 even though
# the stored value is non-zero.
pd.set_option('display.float_format', '{:.6f}'.format)
print(df_result.head(20))


        Fitur   T_stat  p_TTest              Keterangan_TTest
0  slope_mean 7.409043 0.000000  ✨ Sangat signifikan (bagus!)
1   slope_std 6.147867 0.000000  ✨ Sangat signifikan (bagus!)
2    auc_mean 7.407277 0.000000  ✨ Sangat signifikan (bagus!)
3     auc_std 8.172386 0.000000  ✨ Sangat signifikan (bagus!)
