In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

pd.set_option('display.max_columns', 20)
plt.style.use('seaborn-v0_8-whitegrid')

train_path = "dataset/SemgHandSubjectCh2_TRAIN.ts"
test_path  = "dataset/SemgHandSubjectCh2_TEST.ts"

def load_ts_file(file_path):
    encodings = ['utf-8', 'latin-1'] 
    data = []
    
    for enc in encodings:
        try:
            with open(file_path, 'r', encoding=enc) as file:
                is_data = False
                for line in file:
                    line = line.strip()
                    if not line or line.startswith(('#', '%')): continue
                        
                    if line.lower().startswith("@data"):
                        is_data = True
                        continue
                    
                    if not is_data: continue

                    if ':' in line:
                        parts = line.split(':')
                        features_str = parts[0]
                        label_str = parts[-1]   
                        
                        values = features_str.split(',')
                        values.append(label_str) 
                    else:
                        # Jika tidak ada titik dua, asumsi dipisah koma semua
                        values = line.split(',')
                        
                    data.append(values)
            break 
        except UnicodeDecodeError:
            continue
            
    if not data:
        raise ValueError(f"Gagal membaca file {file_path}.")

    # Konversi ke DataFrame
    df = pd.DataFrame(data)
    
    X = df.iloc[:, :-1].astype(float) 
    y = df.iloc[:, -1] 
    
    return X, y

X_train, y_train = load_ts_file(train_path)
X_test, y_test = load_ts_file(test_path)

In [13]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
print("   TAHAP DATA PREPARATION   ")

le = LabelEncoder()

y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test) 

print("\n")
print(" Label Encoding Selesai.")
print(f"   Mapping Kelas: {dict(zip(le.classes_, le.transform(le.classes_)))}")
print(f"   Contoh Label Lama : {y_train.iloc[:5].values}")
print(f"   Contoh Label Baru : {y_train_encoded[:5]}")

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

print("\n Feature Scaling (StandardScaler) Selesai.")
print(f"   Mean sebelum scaling : {X_train.values.mean():.4f}")
print(f"   Mean setelah scaling : {X_train_scaled.mean():.4f}")
print(f"   Std  setelah scaling : {X_train_scaled.std():.4f} ")

X_train_final = X_train_scaled
X_test_final  = X_test_scaled
y_train_final = y_train_encoded
y_test_final  = y_test_encoded

print("\n DATA SIAP UNTUK PEMODELAN (MODELING)!")
print(f"   Dimensi X_train_final: {X_train_final.shape}")
print(f"   Dimensi y_train_final: {y_train_final.shape}")

   TAHAP DATA PREPARATION   


 Label Encoding Selesai.
   Mapping Kelas: {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
   Contoh Label Lama : ['1' '1' '1' '1' '1']
   Contoh Label Baru : [0 0 0 0 0]

 Feature Scaling (StandardScaler) Selesai.
   Mean sebelum scaling : 16.8005
   Mean setelah scaling : -0.0000
   Std  setelah scaling : 1.0000 

 DATA SIAP UNTUK PEMODELAN (MODELING)!
   Dimensi X_train_final: (450, 1500)
   Dimensi y_train_final: (450,)


In [15]:
import numpy as np
print("Label :", np.unique(y_train_encoded))

Label : [0 1 2 3 4]


In [9]:
X_before = X_train.iloc[:5, :10]
X_after  = pd.DataFrame(
    X_train_final[:5, :10],
    columns=X_train.columns[:10],
    index=X_train.index[:5]
)

print("\n Data Sebelum Scaling:")
display(X_before)

print("\n Data Sesudah Scaling (StandardScaler):")
display(X_after)


 Data Sebelum Scaling:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.727125,2.797737,3.636139,2.746511,0.971014,2.210892,3.344643,0.584697,2.450385,2.185442
1,3.710899,2.4274,1.62462,0.83539,1.268488,1.498597,0.799706,1.25965,0.412866,0.928109
2,3.691784,7.646312,4.573417,6.589344,1.606932,5.105149,3.388973,6.454872,2.436382,6.15985
3,3.659098,1.34878,2.660728,1.951933,1.336938,2.802475,2.054903,2.29893,1.00757,0.718818
4,6.352287,3.688895,5.923694,4.516546,3.978157,4.106319,4.735417,5.000411,3.958317,3.482435



 Data Sesudah Scaling (StandardScaler):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.040111,-0.331952,-0.098083,-0.194462,-0.943621,-0.379479,0.123906,-1.062592,-0.20153,-0.364214
1,-0.219762,-0.420839,-0.509532,-0.612577,-0.819651,-0.685947,-1.038461,-0.764704,-1.197992,-0.904067
2,-0.223141,0.831791,0.093633,0.646274,-0.678606,0.865786,0.144153,1.528182,-0.208378,1.342253
3,-0.228919,-0.679727,-0.2976,-0.3683,-0.791124,-0.124948,-0.465166,-0.306023,-0.907148,-0.993928
4,0.247189,-0.118058,0.369827,0.192786,0.309588,0.436035,0.759124,0.886263,0.535935,0.192668


In [10]:
comparison_stats = pd.DataFrame({
    "Mean Sebelum": X_train.iloc[:, :10].mean(),
    "Std Sebelum": X_train.iloc[:, :10].std(),
    "Mean Sesudah": X_train_final[:, :10].mean(axis=0),
    "Std Sesudah": X_train_final[:, :10].std(axis=0)
})

display(comparison_stats)

Unnamed: 0,Mean Sebelum,Std Sebelum,Mean Sesudah,Std Sesudah
0,4.95402,5.66297,-6.315935e-17,1.0
1,4.180769,4.170998,-1.578984e-16,1.0
2,4.115655,4.894315,-3.157968e-17,1.0
3,3.635357,4.575887,3.157968e-17,1.0
4,3.235284,2.402225,-1.184238e-16,1.0
5,3.092881,2.326796,-1.815831e-16,1.0
6,3.073359,2.19188,-2.3684760000000003e-17,1.0
7,2.992316,2.268322,0.0,1.0
8,2.862463,2.047029,7.894919e-18,1.0
9,3.033705,2.33162,0.0,1.0


In [None]:
import numpy as np
import os
import joblib


print("   MENYIMPAN DATA (SAVING)   ")

save_dir = "processed_data"
os.makedirs(save_dir, exist_ok=True)

np.save(os.path.join(save_dir, 'X_train_final.npy'), X_train_final)
np.save(os.path.join(save_dir, 'X_test_final.npy'), X_test_final)
np.save(os.path.join(save_dir, 'y_train_final.npy'), y_train_final)
np.save(os.path.join(save_dir, 'y_test_final.npy'), y_test_final)

print(f" Data berhasil disimpan di folder: '{save_dir}/'")
print(f"   - {save_dir}/X_train_final.npy")
print(f"   - {save_dir}/X_test_final.npy")
print(f"   - {save_dir}/y_train_final.npy")
print(f"   - {save_dir}/y_test_final.npy")

os.makedirs("models", exist_ok=True)
joblib.dump(scaler, 'models/scaler.pkl')

print("Scaler berhasil disimpan di: models/scaler.pkl")

   MENYIMPAN DATA (SAVING)   
 Data berhasil disimpan di folder: 'processed_data/'
   - processed_data/X_train_final.npy
   - processed_data/X_test_final.npy
   - processed_data/y_train_final.npy
   - processed_data/y_test_final.npy
Scaler berhasil disimpan di: models/scaler.pkl
