## Import & Config

In [24]:
# ==========================================
# 1. SETUP ENVIRONMENT
# ==========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib # Untuk menyimpan model

# Display settings: never truncate the column list when a DataFrame is shown,
# and render every float with exactly two decimal places.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.2f}'.format)

print("✅ Library imported successfully.")

✅ Library imported successfully.


## Load Data

In [25]:
# ==========================================
# 2. LOAD DATASET
# ==========================================
# The path is relative to the notebook's working directory: go up one level
# ('..') and then into 'data/raw'. Adjust it if the notebook lives elsewhere.
raw_data_path = '../data/raw/male_players.csv'

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on wide CSV files.
df = pd.read_csv(
    raw_data_path,
    low_memory=False,
)

print(f"✅ Data Loaded. Shape: {df.shape}")

✅ Data Loaded. Shape: (180021, 109)


## Cleaning & Feature Selection

In [26]:
# ==========================================
# 3. DATA CLEANING
# ==========================================

# A. Remove duplicates: keep one row per `long_name`. Rows are sorted by age in
#    descending order first, so keep='first' retains the row with the highest age.
# NOTE(review): keying on `long_name` merges distinct players who share the same
# full name; consider de-duplicating on `player_id` instead — confirm intent.
df = df.sort_values(by=['long_name', 'age'], ascending=[True, False])
df = df.drop_duplicates(subset=['long_name'], keep='first')

# B. Position filter: drop every row whose `player_positions` contains 'GK'.
#    na=False treats a missing position as "no match", so those rows are kept.
df = df[~df['player_positions'].str.contains('GK', na=False)].copy()

# C. Columns to keep
# Metadata (identity and extra info). 'player_url' used to be listed twice,
# which produced two identically named columns in the exported CSV.
# NOTE(review): the second entry may have been meant to be a different URL
# column — confirm against the raw dataset.
meta_cols = [
    'player_id', 'short_name', 'long_name', 'player_positions',
    'age', 'club_name', 'nationality_name',
    'overall', 'potential', 'value_eur', 'wage_eur',
    'player_url',
    'preferred_foot',
]

# Skill and physical features (37 columns fed to the model)
feature_cols = [
    # Physical attributes
    'height_cm', 'weight_kg',

    # Technical skills
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
    'skill_ball_control',
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
    'power_long_shots',
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle',
]

# Fail fast when a feature column is absent: the scaling step indexes the
# frame with the full `feature_cols` list, so silently skipping a column here
# would only postpone the failure to a less descriptive KeyError.
missing_features = [col for col in feature_cols if col not in df.columns]
if missing_features:
    raise KeyError(f"Feature columns missing from dataset: {missing_features}")

# Combine both lists, drop repeated names while preserving order, and keep
# only the columns that actually exist in the raw data (metadata is optional).
final_cols = list(dict.fromkeys(meta_cols + feature_cols))
existing_cols = [col for col in final_cols if col in df.columns]
df_clean = df[existing_cols].copy()

# D. Drop rows with a missing value in any feature column; metadata gaps are
#    tolerated because only the features are used for training.
valid_features = [c for c in feature_cols if c in df_clean.columns]
df_clean = df_clean.dropna(subset=valid_features)

# Reset the index so row labels run 0..n-1 after the filtering above
df_clean.reset_index(drop=True, inplace=True)

print(f"✅ Cleaning Selesai. Data Bersih: {df_clean.shape[0]} Pemain")

✅ Cleaning Selesai. Data Bersih: 41275 Pemain


## Preprocessing

In [27]:
# ==========================================
# 4. PREPROCESSING (SCALE THE 37 FEATURES)
# ==========================================

# Feature matrix: only the numeric skill/physical columns, selected in exactly
# the order given by `feature_cols`.
X = df_clean.loc[:, feature_cols]

# Fit the scaler on X, then transform the same data with the fitted scaler.
print("⏳ Sedang melatih Scaler...")
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


⏳ Sedang melatih Scaler...


## Model Training (KNN)

In [28]:
# ==========================================
# 5. MODEL TRAINING (KNN)
# ==========================================

# NearestNeighbors is an unsupervised neighbour search: there are no labels,
# the model only indexes the scaled feature vectors. Distances are Euclidean
# (straight-line) and neighbours are found by brute-force search.
print("⏳ Sedang melatih KNN...")
knn_model = NearestNeighbors(
    n_neighbors=50,
    algorithm='brute',
    metric='euclidean',
).fit(X_scaled)

print("✅ Training Selesai!")

⏳ Sedang melatih KNN...
✅ Training Selesai!


## Export & Save Results

In [29]:
# ==========================================
# 6. SAVE FILES (EXPORT)
# ==========================================
from pathlib import Path

# Create the output folders up front: on a fresh checkout they may not exist,
# and both DataFrame.to_csv and joblib.dump fail on a missing parent directory.
for output_dir in ('../data/processed', '../models'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Save the cleaned dataset (CSV), without the index column
df_clean.to_csv('../data/processed/players_clean.csv', index=False)

# Save the fitted scaler and KNN model (joblib pickle files)
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(knn_model, '../models/knn_model.pkl')

print("✅ Semua file berhasil disimpan:")
print("   - data/processed/players_clean.csv (Dataset baru)")
print("   - models/scaler.pkl (Support 37 fitur)")
print("   - models/knn_model.pkl (Support 37 fitur)")

✅ Semua file berhasil disimpan:
   - data/processed/players_clean.csv (Dataset baru)
   - models/scaler.pkl (Support 37 fitur)
   - models/knn_model.pkl (Support 37 fitur)
