# In [None]:  (notebook cell marker left over from the export)
"""
🏃‍♂️ Halfmarathon Wrocław - Model Training Pipeline

This notebook contains the complete pipeline for:
1. Loading data from DigitalOcean Spaces
2. Cleaning and preparing the data
3. Feature engineering
4. Training the prediction model
5. Evaluating the model
6. Saving the model locally and to DigitalOcean
7. Saving plots to the data/ directory
"""

# ============ 1. IMPORT BIBLIOTEK I KONFIGURACJA ============

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import boto3
from dotenv import load_dotenv
import os
import sys
import json
from pathlib import Path
from datetime import datetime

# Dodanie ≈õcie≈ºki do modu≈Ç√≥w utils
sys.path.append('..')
from utils.data_preprocessing import clean_data_for_modeling, prepare_features_for_model, merge_years_data

# Load variables from a .env file into the process environment; the
# storage credentials are read later through os.getenv().
load_dotenv()

# Display defaults: seaborn grid style, default figure size, and no
# truncation of wide DataFrames when they are printed.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.set_option('display.max_columns', None)

print("‚úÖ Biblioteki za≈Çadowane")

# ============ PLOT MANAGER - PLOT FILE MANAGEMENT ============

class PlotManager:
    """Save figures to disk and keep a JSON manifest describing them.

    Figures are written to ``<base_dir>/training_plots`` and every saved
    figure is recorded in ``<base_dir>/plots_manifest.json``.
    """

    def __init__(self, base_dir='data'):
        """Create the output directories and load (or start) the manifest.

        Args:
            base_dir: Directory holding the manifest file and the
                ``training_plots`` sub-directory. Missing parent
                directories are created as well.
        """
        self.base_dir = Path(base_dir)
        self.plots_dir = self.base_dir / 'training_plots'
        self.manifest_path = self.base_dir / 'plots_manifest.json'

        # parents=True also creates base_dir (and anything above it), so a
        # nested base_dir such as "out/run1/data" works on a fresh checkout.
        self.plots_dir.mkdir(parents=True, exist_ok=True)

        self.manifest = self._load_manifest()
        print(f"‚úÖ PlotManager gotowy. Katalog: {self.plots_dir}")

    def _load_manifest(self):
        """Return the manifest stored on disk, or a fresh empty one.

        Returns:
            dict with at least a ``'plots'`` list.
        """
        if self.manifest_path.exists():
            with open(self.manifest_path, 'r', encoding='utf-8') as f:
                manifest = json.load(f)
            # save_plot() appends to manifest['plots']; make sure the list
            # exists even if the file on disk was written without it.
            manifest.setdefault('plots', [])
            return manifest
        return {
            'created': datetime.now().isoformat(),
            'plots': []
        }

    def save_plot(self, figure, filename, title, description):
        """Write *figure* to the plots directory and record it in the manifest.

        Args:
            figure: Object exposing ``savefig`` (a matplotlib figure).
            filename: File name, relative to the plots directory.
            title: Short title stored in the manifest entry.
            description: Longer description stored in the manifest entry.

        Returns:
            Path of the written image file.
        """
        plot_path = self.plots_dir / filename
        try:
            figure.savefig(plot_path, dpi=100, bbox_inches='tight')
        finally:
            # Release the figure even when saving fails.
            plt.close(figure)

        plot_entry = {
            'filename': filename,
            'title': title,
            'description': description,
            'saved_at': datetime.now().isoformat(),
            'path': str(plot_path.relative_to(self.base_dir))
        }

        # The image file was overwritten, so any earlier entry for the same
        # filename is stale; replace it instead of accumulating duplicates
        # every time the script is re-run.
        self.manifest['plots'] = [
            entry for entry in self.manifest['plots']
            if entry.get('filename') != filename
        ]
        self.manifest['plots'].append(plot_entry)
        self._save_manifest()

        print(f"   ‚úÖ Zapisano: {filename}")
        return plot_path

    def _save_manifest(self):
        """Write the in-memory manifest to the manifest JSON file."""
        with open(self.manifest_path, 'w', encoding='utf-8') as f:
            json.dump(self.manifest, f, indent=4, ensure_ascii=False)

# Module-level PlotManager instance; plots are stored under ./data
plot_manager = PlotManager(base_dir='data')

# ============ 2. LOADING DATA FROM DIGITALOCEAN SPACES ============

BUCKET_NAME = "dane-modul9"

# S3-compatible client; credentials and endpoint come from the environment.
s3_client = boto3.client(
    "s3",
    endpoint_url=os.getenv("AWS_ENDPOINT_URL_S3"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)

print("üîó ≈ÅƒÖczenie z DigitalOcean Spaces...")

# Both result files share one naming pattern and use ";" as the separator.
data_prefix = f"s3://{BUCKET_NAME}/dane-zadanie_modul9"
wroclaw_2023_df, wroclaw_2024_df = (
    pd.read_csv(f"{data_prefix}/halfmarathon_wroclaw_{year}__final.csv", sep=";")
    for year in (2023, 2024)
)

print(f"‚úÖ Dane wczytane:")
for year, frame in ((2023, wroclaw_2023_df), (2024, wroclaw_2024_df)):
    print(f"   - {year}: {len(frame)} wierszy")

# ============ 3. DATA CLEANING AND PREPARATION ============

print("\nüßπ Czyszczenie danych...")
print("\n=== ROK 2023 ===")
df_2023_clean = clean_data_for_modeling(wroclaw_2023_df, 2023)

print("\n=== ROK 2024 ===")
df_2024_clean = clean_data_for_modeling(wroclaw_2024_df, 2024)

# Combine both editions into one frame.
# Each frame is tagged with its own edition year BEFORE concatenation.
# Deriving 'Year' from the birth year ('Rocznik' < 2010) labelled almost
# every runner as 2023 regardless of which race they actually ran.
print("\n=== ≈ÅƒÑCZENIE DANYCH ===")
df_combined = pd.concat(
    [
        df_2023_clean.assign(Year=2023),
        df_2024_clean.assign(Year=2024),
    ],
    ignore_index=True,
)

print(f"\n‚úÖ Dane po≈ÇƒÖczone: {len(df_combined)} wierszy")

# ============ 4. EXPLORING THE CLEANED DATA ============

tempo_values = df_combined['Tempo']
age_values = df_combined['Wiek']

print("\nüìä Podstawowe statystyki oczyszczonych danych:")
print(f"\nRozmiar datasetu: {df_combined.shape}")
print(f"\nRozk≈Çad p≈Çci:")
print(df_combined['P≈Çeƒá'].value_counts())
print(f"\nStatystyki wieku:")
print(age_values.describe())
print(f"\nStatystyki tempo:")
print(tempo_values.describe())

# 2x2 grid: two histograms on top, scatter and box plot below.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_tempo, ax_age), (ax_scatter, ax_box) = axes

# Histograms of pace and age, each with a dashed median marker.
histogram_specs = (
    (ax_tempo, tempo_values, 50, 'steelblue', 'Tempo (min/km)', 'Rozk≈Çad tempa'),
    (ax_age, age_values, 30, 'coral', 'Wiek', 'Rozk≈Çad wieku'),
)
for ax, values, n_bins, colour, x_label, heading in histogram_specs:
    ax.hist(values, bins=n_bins, color=colour, edgecolor='black', alpha=0.7)
    ax.set(xlabel=x_label, ylabel='Liczba', title=heading)
    ax.axvline(values.median(), color='red', linestyle='--', label='Mediana')
    ax.legend()

# Pace against age.
ax_scatter.scatter(age_values, tempo_values, alpha=0.3, s=10)
ax_scatter.set(xlabel='Wiek', ylabel='Tempo (min/km)', title='Tempo vs Wiek')

# Pace grouped by gender; the figure-level title is blanked right after
# the pandas boxplot call, as in the original flow.
df_combined.boxplot(column='Tempo', by='P≈Çeƒá', ax=ax_box)
ax_box.set(xlabel='P≈Çeƒá', ylabel='Tempo (min/km)', title='Tempo wed≈Çug p≈Çci')
fig.suptitle('')

fig.tight_layout()

plot_manager.save_plot(
    figure=fig,
    filename='exploratory_distributions.png',
    title='Rozk≈Çady danych wej≈õciowych',
    description='Analiza rozk≈Çad√≥w tempa, wieku i ich korelacji',
)

# ============ 5. PREPARING FEATURES AND TARGET ============

# Model inputs. Per the original author's notes, Has_Team and
# First_5km_Fast were dropped after an earlier analysis.
# NOTE(review): the importance percentages quoted below are hard-coded
# text from that earlier analysis; they are not computed in this run.
feature_columns = [
    'Gender_Numeric',        # gender (0=K, 1=M per the original note)
    'Wiek',                  # age
    '5 km Tempo',            # pace over the first 5 km
    'Tempo Stabilno≈õƒá',      # pace stability
]

feature_notes = (
    "\nüìä U≈ºyte feature'y (po optymalizacji):",
    "   ‚úÖ 5 km Tempo: 87.7% importance - KLUCZOWY",
    "   ‚úÖ Tempo Stabilno≈õƒá: 11.9% importance - WA≈ªNY",
    "   ‚ö†Ô∏è  Wiek: 0.3% importance - personalizacja",
    "   ‚ö†Ô∏è  Gender_Numeric: 0.0% importance - demographics",
    "\nüí° Usuniƒôto:",
    "   ‚ùå Has_Team: redundantny (ju≈º w 5 km Tempo)",
    "   ‚ùå First_5km_Fast: korelacja -0.786 z 5 km Tempo",
)
for note in feature_notes:
    print(note)

# Target: the 'Tempo' column.
target_column = 'Tempo'

# Keep only the modelling columns and drop rows with any missing value.
df_model = df_combined[[*feature_columns, target_column]].dropna()

print(f"\nüìä Dane do modelowania: {len(df_model)} wierszy")
print(f"Feature'y: {feature_columns}")
print(f"Target: {target_column}")

# Feature matrix and target vector.
X, y = df_model[feature_columns], df_model[target_column]

print(f"\n‚úÖ X shape: {X.shape}")
print(f"‚úÖ y shape: {y.shape}")

# ============ 6. TRAIN/TEST SPLIT ============

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

n_total = len(X)
print(f"\nüìä Podzia≈Ç danych:")
for label, part in (('Train', X_train), ('Test', X_test)):
    print(f"   - {label}: {len(part)} wierszy ({len(part)/n_total*100:.1f}%)")

# Standardisation statistics are learned from the training split only and
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n‚úÖ Dane przygotowane do trenowania")

# ============ 7. MODEL TRAINING ============

print("\nüöÄ Trenowanie modeli...")

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# These models are fitted on the standardised features; all others use
# the raw feature values.
scaled_model_names = {'Linear Regression', 'Ridge Regression'}

results = {}

for name, model in models.items():
    print(f"\nüìà Trenowanie: {name}...")

    # Pick the matching train/test matrices, then fit and predict once.
    if name in scaled_model_names:
        train_inputs, test_inputs = X_train_scaled, X_test_scaled
    else:
        train_inputs, test_inputs = X_train, X_test
    model.fit(train_inputs, y_train)
    y_pred = model.predict(test_inputs)

    # Hold-out metrics on the test split.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = dict(model=model, mae=mae, rmse=rmse, r2=r2, predictions=y_pred)

    for metric_label, metric_value in (('MAE', mae), ('RMSE', rmse), ('R¬≤', r2)):
        print(f"   ‚úÖ {metric_label}: {metric_value:.4f}")

# ============ 8. MODEL COMPARISON ============

# One row per model, ordered by ascending MAE (best first).
results_df = pd.DataFrame(
    [
        {
            'Model': model_name,
            'MAE': scores['mae'],
            'RMSE': scores['rmse'],
            'R¬≤': scores['r2'],
        }
        for model_name, scores in results.items()
    ],
    columns=['Model', 'MAE', 'RMSE', 'R¬≤'],
).sort_values('MAE')

print("\nüìä Por√≥wnanie modeli:")
print(results_df.to_string(index=False))

# One bar chart per metric, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
metric_panels = (
    ('MAE', 'steelblue', 'Mean Absolute Error (ni≈ºszy = lepszy)'),
    ('RMSE', 'coral', 'Root Mean Squared Error (ni≈ºszy = lepszy)'),
    ('R¬≤', 'green', 'R¬≤ Score (wy≈ºszy = lepszy)'),
)
for ax, (metric, colour, heading) in zip(axes, metric_panels):
    ax.bar(results_df['Model'], results_df[metric], color=colour, alpha=0.7)
    ax.set_ylabel(metric)
    ax.set_title(heading)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()

plot_manager.save_plot(
    figure=fig,
    filename='models_comparison.png',
    title='Por√≥wnanie wydajno≈õci modeli',
    description='Metryki MAE, RMSE, R¬≤ dla wszystkich wytrenowanych modeli',
)

# The first row after sorting is the model with the lowest MAE.
best_model_name = results_df['Model'].iloc[0]
best_scores = results[best_model_name]
best_model = best_scores['model']

print(f"\nüèÜ Najlepszy model: {best_model_name}")
print(f"   MAE: {best_scores['mae']:.4f} min/km")
print(f"   RMSE: {best_scores['rmse']:.4f} min/km")
print(f"   R¬≤: {best_scores['r2']:.4f}")

# ============ 9. FEATURE IMPORTANCE (Random Forest) ============

rf_entry = results.get('Random Forest')
if rf_entry is not None:
    rf_model = rf_entry['model']

    # Importances reported by the fitted forest, largest first.
    feature_importance = pd.DataFrame(
        {'Feature': feature_columns, 'Importance': rf_model.feature_importances_}
    ).sort_values('Importance', ascending=False)

    print("\nüìä Feature Importance (Random Forest):")
    print(feature_importance.to_string(index=False))

    # Horizontal bars; the y axis is inverted so the first row is on top.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(
        feature_importance['Feature'],
        feature_importance['Importance'],
        color='steelblue',
        alpha=0.7,
    )
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importance - Random Forest')
    ax.invert_yaxis()
    fig.tight_layout()

    plot_manager.save_plot(
        figure=fig,
        filename='feature_importance.png',
        title='Wa≈ºno≈õƒá cech (Feature Importance)',
        description='Analiza wp≈Çywu poszczeg√≥lnych zmiennych na predykcjƒô',
    )

# ============ 10. PREDICTION ANALYSIS ============

# Test-set predictions of the best model and their residuals
# (actual minus predicted).
best_predictions = results[best_model_name]['predictions']
errors = y_test - best_predictions

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_scatter, ax_errors = axes

# Predicted vs actual, with a dashed y = x reference line.
value_range = [y_test.min(), y_test.max()]
ax_scatter.scatter(y_test, best_predictions, alpha=0.3, s=10)
ax_scatter.plot(value_range, value_range, 'r--', lw=2)
ax_scatter.set_xlabel('Rzeczywiste tempo (min/km)')
ax_scatter.set_ylabel('Przewidywane tempo (min/km)')
ax_scatter.set_title(f'Predykcje vs Rzeczywiste - {best_model_name}')
ax_scatter.grid(True, alpha=0.3)

# Residual histogram with a marker at zero error.
ax_errors.hist(errors, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax_errors.axvline(0, color='red', linestyle='--', lw=2)
ax_errors.set_xlabel('B≈ÇƒÖd predykcji (min/km)')
ax_errors.set_ylabel('Liczba')
ax_errors.set_title('Rozk≈Çad b≈Çƒôd√≥w predykcji')
ax_errors.grid(True, alpha=0.3)

fig.tight_layout()

plot_manager.save_plot(
    figure=fig,
    filename='predictions_analysis.png',
    title='Analiza dok≈Çadno≈õci predykcji',
    description='Por√≥wnanie warto≈õci przewidywanych z rzeczywistymi oraz rozk≈Çad b≈Çƒôd√≥w',
)

print(f"\nüìä Statystyki b≈Çƒôd√≥w:")
print(f"   ≈öredni b≈ÇƒÖd: {errors.mean():.4f} min/km")
print(f"   Mediana b≈Çƒôdu: {errors.median():.4f} min/km")
print(f"   Odchylenie std b≈Çƒôdu: {errors.std():.4f} min/km")

# ============ 11. SAVING THE MODEL LOCALLY ============

# Create the models directory if it does not exist yet.
os.makedirs('models', exist_ok=True)

# Persist the best model together with the fitted scaler.
model_filename = f'models/halfmarathon_model_{best_model_name.replace(" ", "_").lower()}.pkl'
scaler_filename = 'models/scaler.pkl'

joblib.dump(best_model, model_filename)
joblib.dump(scaler, scaler_filename)

print(f"\n‚úÖ Model zapisany lokalnie: {model_filename}")
print(f"‚úÖ Scaler zapisany lokalnie: {scaler_filename}")

# Metadata stored next to the model.
model_info = {
    'model_name': best_model_name,
    'features': feature_columns,
    'mae': results[best_model_name]['mae'],
    'rmse': results[best_model_name]['rmse'],
    'r2': results[best_model_name]['r2'],
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'training_samples': len(X_train),
    # The scaler is saved unconditionally, but only the two linear models
    # were fitted on scaler-transformed features in the training loop.
    # Recording that here tells a consumer whether scaler.pkl must be
    # applied before calling predict().
    'requires_scaling': best_model_name in ('Linear Regression', 'Ridge Regression'),
}

with open('models/model_info.json', 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=4)

print(f"‚úÖ Informacje o modelu zapisane: models/model_info.json")

# ============ 12. UPLOADING THE MODEL TO DIGITALOCEAN SPACES ============

print("\n‚òÅÔ∏è Wysy≈Çanie modelu na DigitalOcean Spaces...")

def upload_file_to_spaces(local_file, remote_file):
    """Upload one local file to the configured bucket.

    Best-effort: any exception is reported on stdout and turned into a
    ``False`` return value instead of propagating.

    Args:
        local_file: Path of the file to send.
        remote_file: Object key to store it under in ``BUCKET_NAME``.

    Returns:
        True when the upload was carried out, False when it raised.
    """
    succeeded = False
    try:
        s3_client.upload_file(local_file, BUCKET_NAME, remote_file)
        print(f"   ‚úÖ Wys≈Çano: {remote_file}")
        succeeded = True
    except Exception as e:
        print(f"   ‚ùå B≈ÇƒÖd: {e}")
    return succeeded

# Upload the model, the scaler and the model metadata.
# upload_file_to_spaces() reports failure through its return value, so the
# results are collected and the success message is printed only when every
# upload went through (previously it was printed unconditionally).
artifact_uploads = [
    (
        model_filename,
        f'models/halfmarathon_model_{best_model_name.replace(" ", "_").lower()}.pkl',
    ),
    (scaler_filename, 'models/scaler.pkl'),
    ('models/model_info.json', 'models/model_info.json'),
]

# Every upload is attempted even if an earlier one fails.
failed_uploads = [
    remote_key
    for local_path, remote_key in artifact_uploads
    if not upload_file_to_spaces(local_path, remote_key)
]

if failed_uploads:
    print(f"\n‚ùå Upload niekompletny - nieudane pliki: {', '.join(failed_uploads)}")
else:
    print("\n‚úÖ Model wys≈Çany na DigitalOcean Spaces!")

# ============ 13. SUMMARY ============

# Final report: metrics of the selected model and where the artefacts are.
separator = "=" * 60
best_metrics = results[best_model_name]

summary_lines = [
    "\n" + separator,
    "üéâ PIPELINE ZAKO≈ÉCZONY POMY≈öLNIE!",
    separator,
    f"\nüìä Podsumowanie:",
    f"   - Wytrenowano {len(models)} modeli",
    f"   - Najlepszy model: {best_model_name}",
    f"   - MAE: {best_metrics['mae']:.4f} min/km",
    f"   - RMSE: {best_metrics['rmse']:.4f} min/km",
    f"   - R¬≤: {best_metrics['r2']:.4f}",
    f"\nüíæ Model zapisany:",
    f"   - Lokalnie: {model_filename}",
    f"   - DigitalOcean: models/halfmarathon_model_*.pkl",
    f"\nüìä Wykresy zapisane:",
    f"   - Katalog: data/training_plots/",
    f"   - Manifest: data/plots_manifest.json",
    f"\nüöÄ Gotowe do wdro≈ºenia w aplikacji Streamlit!",
    separator,
]
print("\n".join(summary_lines))