In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Location of the test-result CSV (the path looks like a Colab Google Drive
# mount - adjust it when running elsewhere).
file_path = "/content/drive/MyDrive/Dataset Skripsi/Hasil_Tes_Atlet_Porda.csv"

# The file is semicolon-separated and read as UTF-8.
data = pd.read_csv(
    file_path,
    sep=";",
    encoding="utf-8",
)



In [10]:
columns_to_convert = [
    'Kelentukan', 'Kelincahan', 'Kecepatan', 'Hand Grip kanan',
    'Hand Grip Kiri', 'Vo2 max', 'Berat Badan', 'Power Otot Tungkai'
]

# Clean each column and convert it to an integer type.
# NOTE(review): .astype(int) drops the fractional part of every measurement
# (e.g. 4.7 becomes 4) and missing/unparseable values become 0; later cells
# divide by 'Hand Grip Kiri' and 'Kecepatan', so confirm both are intended.
for column in columns_to_convert:
    cleaned = (
        data[column]
        .astype(str)
        .str.replace(',', '.', regex=True)         # decimal comma -> decimal point
        .str.replace(r'[^0-9.]', '', regex=True)   # keep only digits and dots
    )
    numeric = pd.to_numeric(cleaned, errors='coerce')  # unparseable text -> NaN
    data[column] = numeric.fillna(0).astype(int)       # NaN -> 0, then cast to int

# Round 'Usia' to the nearest whole number and store it as int.
data['Usia'] = data['Usia'].round().astype(int)
print(data.dtypes)

index                   int64
Gender                 object
Kelompok               object
Berat Badan             int64
Usia                    int64
Vo2 max                 int64
Power Otot Tungkai      int64
Kecepatan Reaksi      float64
Kelentukan              int64
Kelincahan              int64
Kecepatan               int64
Hand Grip kanan         int64
Hand Grip Kiri          int64
Koordinasi            float64
dtype: object


In [11]:
# Print the column dtypes again, then the first rows of the converted columns.
print(data.dtypes)
print(data[columns_to_convert].head())

index                   int64
Gender                 object
Kelompok               object
Berat Badan             int64
Usia                    int64
Vo2 max                 int64
Power Otot Tungkai      int64
Kecepatan Reaksi      float64
Kelentukan              int64
Kelincahan              int64
Kecepatan               int64
Hand Grip kanan         int64
Hand Grip Kiri          int64
Koordinasi            float64
dtype: object
   Kelentukan  Kelincahan  Kecepatan  Hand Grip kanan  Hand Grip Kiri  \
0          31          15          4               33              20   
1          21          17          5               39              31   
2          36          18          5               22              26   
3          37          17          5               49              37   
4          37          20          6               21              25   

   Vo2 max  Berat Badan  Power Otot Tungkai  
0       41           50                  50  
1       41           53            

In [12]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Gender' and 'Kelompok' columns as integer labels
# (one encoder per column so each mapping can be inspected separately).
label_encoder_gender = LabelEncoder()
label_encoder_kelompok = LabelEncoder()

data['Gender_encoded'] = label_encoder_gender.fit_transform(data['Gender'])
data['Kelompok_encoded'] = label_encoder_kelompok.fit_transform(data['Kelompok'])

# Last expression of the cell: show which integer each class was mapped to.
{
    f"Encoded {column} Mapping": dict(
        zip(encoder.classes_, encoder.transform(encoder.classes_))
    )
    for column, encoder in (
        ("Gender", label_encoder_gender),
        ("Kelompok", label_encoder_kelompok),
    )
}

{'Encoded Gender Mapping': {'Pria': 0, 'Wanita': 1},
 'Encoded Kelompok Mapping': {'Dewasa': 0, 'Remaja': 1}}

In [13]:
# Outlier detection on the numerical columns using z-scores
from scipy.stats import zscore

# Numerical columns to screen
numerical_cols = [
    'Power Otot Tungkai', 'Kecepatan Reaksi', 'Kelentukan',
    'Kelincahan', 'Kecepatan', 'Hand Grip kanan', 'Hand Grip Kiri'
]

# Absolute z-score of every value in the selected columns
z_scores = np.abs(zscore(data[numerical_cols]))
# Positions (rows, columns) whose |z| exceeds 3, a common outlier cut-off
outliers = np.where(z_scores > 3)

# De-duplicate the row positions and translate column positions into names.
# The two collections are independent: they do not record which row was
# flagged in which column.
outlier_indices = list(set(outliers[0]))
outlier_columns = [numerical_cols[position] for position in set(outliers[1])]

# Collect the results under printable headings
outlier_results = {
    "Outlier Indices": outlier_indices,
    "Outlier Columns": outlier_columns
}

separator = "-" * 30
for heading, entries in outlier_results.items():
    print(f"{heading}:")
    for entry in entries:
        print(f"- {entry}")
    print(f"\n{separator}\n")

Outlier Indices:
- 256
- 226
- 196

------------------------------

Outlier Columns:
- Kecepatan Reaksi

------------------------------



In [14]:
from sklearn.preprocessing import MinMaxScaler
# === Power ===
# Leg power: 2.21 * 'Berat Badan' * ('Power Otot Tungkai' / 100)
# NOTE(review): the 2.21 factor resembles the Lewis power formula, which takes
# the square root of the jump height in metres - confirm whether a sqrt is
# missing here or this linear form is intended.
data['Leg Power'] = (
    data['Berat Badan']
    .mul(2.21)
    .mul(data['Power Otot Tungkai'].div(100))
)

# Hand power: ratio of right-hand grip to left-hand grip.
# A 'Hand Grip Kiri' of 0 yields inf (or NaN for 0/0) here.
data['Hand Power'] = data['Hand Grip kanan'].div(data['Hand Grip Kiri'])


# === Speed ===
# 20 divided by 'Kecepatan' (presumably a 20 m sprint time - TODO confirm).
# A 'Kecepatan' of 0 yields inf.
data['Speed'] = data['Kecepatan'].rdiv(20)

In [15]:
#Penentuan norma untuk klasifikasi atlit

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold


# Endurance
def classify_vo2max(row):
    """Classify endurance from ``row['Vo2 max']`` using gender-specific bands.

    Bands (lower bound inclusive, upper bound exclusive):

    * 'Pria':   Beginner < 25 <= Intermediate < 37 <= Advanced < 50
    * 'Wanita': Beginner < 23 <= Intermediate < 34 <= Advanced < 46

    The bands are contiguous, so fractional readings such as 24.5 or 36.5 are
    classified instead of falling between two integer-only ranges. For whole
    numbers the result is the same as with the ranges <=24 / 25-36 / 37-49
    (Pria) and <=22 / 23-33 / 34-45 (Wanita).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'Gender'
            and 'Vo2 max'.

    Returns:
        'Beginner', 'Intermediate' or 'Advanced'; 'Uncategorized' when the
        gender is neither 'Pria' nor 'Wanita', when the value is NaN, or when
        it is at or above the upper limit of the 'Advanced' band.
    """
    gender = row['Gender']
    if gender == 'Pria':
        intermediate_min, advanced_min, upper_limit = 25, 37, 50
    elif gender == 'Wanita':
        intermediate_min, advanced_min, upper_limit = 23, 34, 46
    else:
        return 'Uncategorized'

    vo2max = row['Vo2 max']
    if vo2max < intermediate_min:
        return 'Beginner'
    if vo2max < advanced_min:
        return 'Intermediate'
    if vo2max < upper_limit:
        return 'Advanced'
    # NOTE(review): values >= upper_limit (50 / 46) stay 'Uncategorized', as
    # before - confirm whether they should count as 'Advanced'.
    # NaN also ends up here because every comparison above is False.
    return 'Uncategorized'

# Run classify_vo2max on every row (axis=1) and store the label per row.
data['Endurance Category'] =data.apply(classify_vo2max, axis=1)

# Speed
def classify_speed(row):
    """Classify ``row['Speed']`` into a level using gender-specific thresholds.

    Thresholds, checked in this order:

    * 'Pria':   >= 3.70 Beginner; 3.31 <= x < 3.50 Intermediate; < 3.11 Advanced
    * 'Wanita': >= 3.90 Beginner; 3.51 <= x < 3.70 Intermediate; < 3.50 Advanced

    Anything else - another gender, NaN, or a value between two bands -
    returns 'Uncategorized'.

    NOTE(review): the bands are not contiguous. For 'Pria', values in
    [3.11, 3.31) and [3.50, 3.70) are 'Uncategorized'; for 'Wanita', values in
    [3.50, 3.51) and [3.70, 3.90). Also, larger 'Speed' values map to
    'Beginner'; the thresholds look like sprint times rather than velocities -
    confirm against the norm table that was used.

    Args:
        row: Mapping-like record providing 'Gender' and 'Speed'.

    Returns:
        'Beginner', 'Intermediate', 'Advanced' or 'Uncategorized'.
    """
    gender = row['Gender']
    if gender == 'Pria':
        beginner_min, mid_low, mid_high, advanced_below = 3.70, 3.31, 3.50, 3.11
    elif gender == 'Wanita':
        beginner_min, mid_low, mid_high, advanced_below = 3.90, 3.51, 3.70, 3.50
    else:
        return 'Uncategorized'

    speed = row['Speed']
    if speed >= beginner_min:
        return 'Beginner'
    if mid_low <= speed < mid_high:
        return 'Intermediate'
    if speed < advanced_below:
        return 'Advanced'
    return 'Uncategorized'

# Run classify_speed on every row (axis=1) and store the label per row.
data['Speed Category'] =data.apply(classify_speed, axis=1)

# Leg Power Classification (the 'Wanita' thresholds are an estimate)
def classify_leg_power(row):
    """Classify ``row['Leg Power']`` into a level using gender-specific thresholds.

    * 'Pria':   >= 79 Advanced; 65 <= x < 79 Intermediate; < 65 Beginner
    * 'Wanita': >= 59 Advanced; 49 <= x < 59 Intermediate; < 49 Beginner

    Args:
        row: Mapping-like record providing 'Gender' and 'Leg Power'.

    Returns:
        'Advanced', 'Intermediate' or 'Beginner'; 'Uncategorized' when the
        gender is neither 'Pria' nor 'Wanita' or when the value is NaN. This
        matches the other classify_* functions; there is no implicit ``None``
        for an unknown gender and NaN is not labelled 'Beginner'.
    """
    leg_power = row['Leg Power']
    gender = row['Gender']

    if gender == 'Pria':
        advanced_min, intermediate_min = 79, 65
    elif gender == 'Wanita':
        advanced_min, intermediate_min = 59, 49
    else:
        return 'Uncategorized'

    if leg_power >= advanced_min:
        return 'Advanced'
    if leg_power >= intermediate_min:
        return 'Intermediate'
    if leg_power < intermediate_min:
        return 'Beginner'
    # Only NaN reaches this point: every comparison above is False for it.
    return 'Uncategorized'

# Run classify_leg_power on every row (axis=1) and store the label per row.
data['Leg Power Category'] = data.apply(classify_leg_power, axis=1)

# Hand Power
def classify_handpower(row):
    """Classify ``row['Hand Power']`` using gender-specific bands.

    With (low, mid, high) = (0.85, 1.15, 1.30) for 'Pria' and
    (0.80, 1.10, 1.25) for 'Wanita':

    * value >= high or value < low  -> 'Beginner'
    * low <= value < mid            -> 'Intermediate'
    * mid <= value < high           -> 'Advanced'

    Args:
        row: Mapping-like record providing 'Gender' and 'Hand Power'.

    Returns:
        One of the three levels above, or 'Uncategorized' for any other
        gender or a NaN value.
    """
    gender = row['Gender']
    if gender == 'Pria':
        low, mid, high = 0.85, 1.15, 1.30
    elif gender == 'Wanita':
        low, mid, high = 0.80, 1.10, 1.25
    else:
        return 'Uncategorized'

    ratio = row['Hand Power']
    if ratio >= high or ratio < low:
        return 'Beginner'
    # From here on a non-NaN ratio is known to lie in [low, high).
    if ratio < mid:
        return 'Intermediate'
    if ratio < high:
        return 'Advanced'
    # Only NaN reaches this point (every comparison above is False for it).
    return 'Uncategorized'

# Run classify_handpower on every row (axis=1) and store the label per row.
data['Hand Power Category'] =data.apply(classify_handpower, axis=1)




In [16]:
# Combine the four per-attribute categories into one classification target:
# the most frequent label in each row (column 0 of the row-wise mode).
# NOTE(review): when labels tie, column 0 is whichever mode pandas lists
# first (appears to be sorted order) - confirm this tie-break is intended.
data['Overall Category'] = data[
    [
        'Endurance Category',
        'Speed Category',
        'Leg Power Category',
        'Hand Power Category',
    ]
].mode(axis=1)[0]

# Encode the combined category as integers for the classifier
label_encoder = LabelEncoder()
data['Overall Category Encoded'] = label_encoder.fit_transform(
    data['Overall Category']
)

In [17]:
# Feature selection: four predictors and the encoded overall category as target
features = data[
    ['Leg Power', 'Hand Power', 'Speed', 'Vo2 max']
]
target = data['Overall Category Encoded']

# Standardise the features.
# Note: the scaler is fitted on all rows here, i.e. before the rows are split
# into folds in the cross-validation cells that follow.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [18]:
# Choose the number of folds automatically: the integer part of sqrt(N),
# limited to the range 2..10.
N = len(features_scaled)
k = int(np.clip(int(np.sqrt(N)), 2, 10))
kf = KFold(
    n_splits=k,
    shuffle=True,
    random_state=42,
)

In [19]:
# Train and evaluate one model per fold, keeping the metrics of every fold
fold_evaluations = []

for train_index, test_index in kf.split(features_scaled):
    X_train_fold, X_test_fold = features_scaled[train_index], features_scaled[test_index]
    y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]

    # Train a fresh model on this fold's training split
    rf_model_fold = RandomForestClassifier(
        n_estimators=30,
        max_depth=3,
        min_samples_split=30,
        min_samples_leaf=15,
        max_features="sqrt",
        bootstrap=True,
        class_weight='balanced',
        random_state=42
    )

    rf_model_fold.fit(X_train_fold, y_train_fold)
    y_pred_fold = rf_model_fold.predict(X_test_fold)

    # Evaluate the model on this fold's held-out split.
    # zero_division=0 is used for all three averaged metrics: a class that is
    # never predicted then contributes 0 to precision (not a perfect 1), which
    # is consistent with the 0 it contributes to F1, and no
    # undefined-metric warning is emitted.
    fold_accuracy = accuracy_score(y_test_fold, y_pred_fold)
    fold_precision = precision_score(y_test_fold, y_pred_fold, average='weighted', zero_division=0)
    fold_recall = recall_score(y_test_fold, y_pred_fold, average='weighted', zero_division=0)
    fold_f1 = f1_score(y_test_fold, y_pred_fold, average='weighted', zero_division=0)

    fold_evaluations.append({
        "Accuracy": fold_accuracy,
        "Precision": fold_precision,
        "Recall": fold_recall,
        "F1-Score": fold_f1
    })

In [20]:
# Average every metric over all folds
average_evaluation = {
    f"Mean {metric}": np.mean([fold[metric] for fold in fold_evaluations])
    for metric in ("Accuracy", "Precision", "Recall", "F1-Score")
}

In [21]:
# Print the evaluation results of the basic K-Fold run
metric_names = ("Accuracy", "Precision", "Recall", "F1-Score")

print("\n**K-Fold Evaluations per Fold:**\n")
for fold_number, eval_metrics in enumerate(fold_evaluations, start=1):
    print(f"🔹 Fold {fold_number}:")
    for metric in metric_names:
        # Pad "<metric>:" to 10 characters so all values start in one column
        label = f"{metric}:"
        print(f"   - {label:<10} {eval_metrics[metric]:.4f}")
    print("-" * 40)

print("\n**Mean Evaluation Metrics (Averaged Across Folds):**\n")
for metric in metric_names:
    mean_key = f"Mean {metric}"
    # Pad "Mean <metric>:" to 15 characters for the same column alignment
    label = f"{mean_key}:"
    print(f"✅ {label:<15} {average_evaluation[mean_key]:.4f}")


**K-Fold Evaluations per Fold:**

🔹 Fold 1:
   - Accuracy:  0.7333
   - Precision: 0.7465
   - Recall:    0.7333
   - F1-Score:  0.7219
----------------------------------------
🔹 Fold 2:
   - Accuracy:  0.6333
   - Precision: 0.6536
   - Recall:    0.6333
   - F1-Score:  0.6287
----------------------------------------
🔹 Fold 3:
   - Accuracy:  0.9000
   - Precision: 0.9074
   - Recall:    0.9000
   - F1-Score:  0.9023
----------------------------------------
🔹 Fold 4:
   - Accuracy:  0.8000
   - Precision: 0.8480
   - Recall:    0.8000
   - F1-Score:  0.8129
----------------------------------------
🔹 Fold 5:
   - Accuracy:  0.8333
   - Precision: 0.8396
   - Recall:    0.8333
   - F1-Score:  0.8342
----------------------------------------
🔹 Fold 6:
   - Accuracy:  0.8333
   - Precision: 0.8479
   - Recall:    0.8333
   - F1-Score:  0.8311
----------------------------------------
🔹 Fold 7:
   - Accuracy:  0.9000
   - Precision: 0.9083
   - Recall:    0.9000
   - F1-Score:  0.8967
-----

In [23]:
import pickle

from sklearn.base import clone

# Refit an unfitted copy of the cross-validated estimator (same
# hyper-parameters) on all rows, so the saved model is not limited to the
# training split of the last fold.
final_model = clone(rf_model_fold).fit(features_scaled, target)

# Save the trained model. The file must hold the estimator itself, not the
# `data` DataFrame. The model expects inputs transformed by `scaler` and
# predicts labels encoded by `label_encoder`.
with open("random_forest_model.pkl", "wb") as file:
    pickle.dump(final_model, file)

print("Model berhasil disimpan sebagai 'random_forest_model.pkl'")

Model berhasil disimpan sebagai 'random_forest_model.pkl'
