In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,  confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class WorldCupFinalistPredictor:
    """Train and compare classifiers that flag likely World Cup finalists.

    The ``is_finalist`` target is not an observed outcome: it is derived by
    :meth:`create_finalist_target` from thresholds on the same team
    statistics that are later used as model features. Evaluation scores
    therefore describe how well a model reproduces those rules.

    Typical use: ``prepare_data()`` -> ``build_models()`` ->
    ``predict_finalists()``. The later steps run the earlier ones on demand
    if they were skipped.
    """

    # Candidate feature columns; only those present in the data are used.
    FEATURE_COLUMNS = (
        'win_rate', 'goal_ratio', 'world_cup_experience', 'Appearances',
        'points_per_game', 'goal_diff_per_game', 'GF_per_game', 'GA_per_game',
        'xG_per_game', 'xGA_per_game', 'xGD_per_game',
        'W_per_game', 'D_per_game', 'L_per_game',
        'attack_power', 'defense_strength', 'team_consistency',
        'performance_efficiency',
    )

    # Models that are fitted on standardized features.
    SCALED_MODELS = frozenset({'Logistic Regression'})

    # Upper bound on cross-validation folds (see _cv_folds).
    MAX_CV_FOLDS = 5

    def __init__(self, data_file):
        """Load the team statistics workbook.

        Args:
            data_file: Path (or file-like object) accepted by
                ``pandas.read_excel``.
        """
        self.df = pd.read_excel(data_file)
        self.X = None
        self.y = None
        # Populated by prepare_data(); None means "not prepared yet".
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
        self.X_train_scaled = self.X_test_scaled = None
        self.models = {}
        self.results = {}
        self.scaler = StandardScaler()

    def create_finalist_target(self):
        """Add the rule-based ``is_finalist`` column (1/0) to ``self.df``.

        A team is marked 1 when it satisfies at least one of three rules
        (elite performance, experienced powerhouse, very dominant). Rows
        with NaN in a compared column fail that comparison.

        Returns:
            The modified ``self.df``.
        """
        df = self.df

        # Elite performance criteria
        elite = (
            (df['win_rate'] > 0.6)
            & (df['goal_ratio'] > 1.5)
            & (df['world_cup_experience'] >= 3)
        )
        # Historical powerhouses with consistent performance
        powerhouse = (
            (df['world_cup_experience'] >= 4)
            & (df['win_rate'] > 0.6)
            & (df['points_per_game'] > 1.65)
        )
        # Very dominant teams (high goal ratio)
        dominant = (df['goal_ratio'] > 2.0) & (df['win_rate'] > 0.55)

        df['is_finalist'] = (elite | powerhouse | dominant).astype(int)

        print("Created finalist target:")
        finalists = df.loc[df['is_finalist'] == 1, 'Squad'].tolist()
        print(f"   Finalist candidates: {finalists}")

        return df

    def feature(self):
        """Impute missing feature values and return the usable feature names.

        Each candidate column present in ``self.df`` has its NaNs replaced
        by that column's median (modifies ``self.df`` in place). A column
        with no observed values cannot be imputed and is left out of the
        returned list; keeping it would make the later ``dropna()`` in
        ``prepare_data`` discard every row.

        NOTE: the median is computed over all rows, i.e. before the
        train/test split.

        Returns:
            List of feature column names, in ``FEATURE_COLUMNS`` order.
        """
        usable = []
        for col in self.FEATURE_COLUMNS:
            if col not in self.df.columns:
                continue
            median = self.df[col].median()
            if pd.isna(median):
                print(f"   Skipping feature '{col}': no observed values to impute from")
                continue
            self.df[col] = self.df[col].fillna(median)
            usable.append(col)
        return usable

    def prepare_data(self):
        """Build ``X``/``y``, make a stratified 80/20 split and scale features.

        Features are imputed first, then the target is created if it is
        not already present. The scaler is fitted on the training split only.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)`` (unscaled).
        """
        feature_columns = self.feature()

        if 'is_finalist' not in self.df.columns:
            self.create_finalist_target()

        clean_df = self.df[feature_columns + ['is_finalist']].dropna()
        self.X = clean_df[feature_columns]
        self.y = clean_df['is_finalist']

        # Stratify so both splits keep the class proportions.
        split = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y
        )
        self.X_train, self.X_test, self.y_train, self.y_test = split

        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        return self.X_train, self.X_test, self.y_train, self.y_test

    def _ensure_prepared(self):
        """Run prepare_data() if no train/test split exists yet."""
        if getattr(self, 'X_train', None) is None:
            self.prepare_data()

    def _matrices_for(self, name):
        """Return the (train, test) feature matrices the named model uses."""
        if name in self.SCALED_MODELS:
            return self.X_train_scaled, self.X_test_scaled
        return self.X_train, self.X_test

    def _cv_folds(self):
        """Return the fold count: MAX_CV_FOLDS capped by the rarest class size.

        Stratified folds need at least one sample of each class per fold
        for ROC-AUC to be defined; never returns fewer than 2.
        """
        smallest_class = int(self.y_train.value_counts().min())
        return max(2, min(self.MAX_CV_FOLDS, smallest_class))

    @staticmethod
    def _binary_scores(tp, fp, fn):
        """Return (precision, recall, f1); each is 0 when its denominator is 0."""
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0
        return precision, recall, f1

    def build_models(self):
        """Grid-search each candidate model and score it on the test split.

        Successful models are stored in ``self.models`` and their metrics
        in ``self.results``. A model that raises during tuning or scoring
        is reported and skipped so the remaining models still train.

        Returns:
            Dict mapping model name to the best fitted estimator.
        """
        self._ensure_prepared()

        candidates = {
            'Logistic Regression': {
                'model': LogisticRegression(
                    class_weight='balanced', random_state=42, max_iter=1000
                ),
                'params': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']},
            },
            'Random Forest': {
                'model': RandomForestClassifier(
                    class_weight='balanced', random_state=42
                ),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [10, 15, None],
                },
            },
            'XGBoost': {
                'model': XGBClassifier(random_state=42, eval_metric='logloss'),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [3, 6, 9],
                    'learning_rate': [0.01, 0.1],
                },
            },
        }

        folds = self._cv_folds()
        # ROC-AUC is undefined when the held-out labels contain one class.
        auc_defined = self.y_test.nunique() > 1

        for name, config in candidates.items():
            X_fit, X_eval = self._matrices_for(name)
            try:
                grid_search = GridSearchCV(
                    config['model'], config['params'],
                    cv=folds, scoring='roc_auc', n_jobs=-1,
                )
                grid_search.fit(X_fit, self.y_train)
                best_model = grid_search.best_estimator_

                y_pred_proba = best_model.predict_proba(X_eval)[:, 1]
                y_pred = best_model.predict(X_eval)

                accuracy = accuracy_score(self.y_test, y_pred)
                if auc_defined:
                    auc_roc = roc_auc_score(self.y_test, y_pred_proba)
                else:
                    auc_roc = float('nan')
            except Exception as e:
                # Best effort: one failing model must not stop the others.
                print(f"Error training {name}: {e}")
                continue

            self.models[name] = best_model
            self.results[name] = {
                'accuracy': accuracy,
                'auc_roc': auc_roc,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'model': best_model,
                'best_params': grid_search.best_params_,
            }

        return self.models

    def evaluate_models(self):
        """Print and return a comparison table of all trained models.

        Computes precision, recall and F1 from the test-split confusion
        matrix plus the mean cross-validated ROC-AUC on the training split.
        Models are trained first if none are available.

        Returns:
            DataFrame with one row per model, sorted by F1-Score descending.

        Raises:
            RuntimeError: If no model could be trained.
        """
        if not self.results:
            self.build_models()
        if not self.results:
            raise RuntimeError(
                "No models were trained successfully; nothing to evaluate."
            )

        print("\n  MODEL EVALUATION")
        print("=" * 60)

        folds = self._cv_folds()
        comparison = []

        for name, metrics in self.results.items():
            # labels=[0, 1] guarantees a 2x2 matrix even if a class is
            # absent from the test split, so the 4-way unpacking is safe.
            cm = confusion_matrix(
                self.y_test, metrics['predictions'], labels=[0, 1]
            )
            _, fp, fn, tp = cm.ravel()
            precision, recall, f1 = self._binary_scores(tp, fp, fn)

            X_fit, _ = self._matrices_for(name)
            cv_scores = cross_val_score(
                metrics['model'], X_fit, self.y_train,
                cv=folds, scoring='roc_auc',
            )

            comparison.append({
                'Model': name,
                'Accuracy': metrics['accuracy'],
                'AUC-ROC': metrics['auc_roc'],
                'Precision': precision,
                'Recall': recall,
                'F1-Score': f1,
                'CV_AUC_Mean': cv_scores.mean(),
            })

        results_df = pd.DataFrame(comparison)
        results_df = results_df.sort_values('F1-Score', ascending=False)

        print("\n Performance Comparison:")
        print(results_df.round(3).to_string(index=False))

        best_model_row = results_df.iloc[0]
        print(f"\n BEST MODEL: {best_model_row['Model']}")
        print(f" F1-Score: {best_model_row['F1-Score']:.3f}, AUC-ROC: {best_model_row['AUC-ROC']:.3f}")

        return results_df

    def predict_finalists(self):
        """Score every team with the best model and print the top candidates.

        The best model is the first row of :meth:`evaluate_models` (highest
        F1-Score). Probabilities are computed for all rows of ``self.df``
        that have complete feature values, including rows used in training.

        Returns:
            DataFrame of teams sorted by ``Finalist_Probability`` descending.
        """
        print("\n PREDICTING WORLD CUP FINALISTS")
        print("_" * 60)

        best_model_name = self.evaluate_models().iloc[0]['Model']
        best_model = self.models[best_model_name]

        print(f" Using best model: {best_model_name}")

        feature_columns = self.X.columns.tolist()
        X_all = self.df[feature_columns].dropna()
        team_indices = X_all.index
        teams = self.df.loc[team_indices, 'Squad']

        # Feed the model the same representation it was trained on.
        if best_model_name in self.SCALED_MODELS:
            model_input = self.scaler.transform(X_all)
        else:
            model_input = X_all
        probabilities = best_model.predict_proba(model_input)[:, 1]

        results = pd.DataFrame({
            'Team': teams,
            'Finalist_Probability': probabilities,
            'Actual_Finalist': self.df.loc[team_indices, 'is_finalist'],
            'Win_Rate': self.df.loc[team_indices, 'win_rate'],
            'World_Cup_Experience': self.df.loc[team_indices, 'world_cup_experience'],
        })
        results = results.sort_values('Finalist_Probability', ascending=False)

        print("\n TOP PREDICTED FINALISTS:")
        print("-" * 50)

        for _, row in results.head(10).iterrows():
            # "Actual" here refers to the rule-derived is_finalist label.
            status = "ACTUAL FINALIST" if row['Actual_Finalist'] == 1 else " PREDICTED"
            print(f"   {row['Team']:20} | Prob: {row['Finalist_Probability']:.3f} | {status}")

        # The two highest-probability teams form the final prediction.
        finalists = results.head(2)

        print("\n FINAL PREDICTION - WORLD CUP FINALISTS:")

        for i, (_, team) in enumerate(finalists.iterrows(), 1):
            print(f" {i}. {team['Team']}")
            print(f" Probability: {team['Finalist_Probability']:.3f}")
            print(f" Win Rate: {team['Win_Rate']:.3f}")
            print(f" WC Experience: {team['World_Cup_Experience']} tournaments")
            print()

        return results

      
def main(data_file='aggregated_team_stats.xlsx'):
    """Run the full pipeline: load data, train models, predict finalists.

    Args:
        data_file: Path to the Excel workbook with aggregated team
            statistics. Defaults to the file name the script has always used.

    Returns:
        Tuple ``(predictor, predictions)``: the fitted
        ``WorldCupFinalistPredictor`` and the DataFrame returned by its
        ``predict_finalists()`` method.
    """
    print("WORLD CUP FINALIST PREDICTION SYSTEM")
    print("=" * 50)

    predictor = WorldCupFinalistPredictor(data_file)

    predictor.prepare_data()
    predictor.build_models()
    predictions = predictor.predict_finalists()

    return predictor, predictions

if __name__ == "__main__":
    # Bind both results at module level so they remain available for
    # inspection after the script (or notebook cell) has run.
    predictor, predictions = main()

WORLD CUP FINALIST PREDICTION SYSTEM
Created finalist target:
   Finalist candidates: ['ar Argentina', 'be Belgium', 'co Colombia', 'de Germany', 'fr France', 'nl Netherlands']

 PREDICTING WORLD CUP FINALISTS

  MODEL EVALUATION

 Performance Comparison:
              Model  Accuracy  AUC-ROC  Precision  Recall  F1-Score
      Random Forest     1.000      1.0      1.000     1.0       1.0
Logistic Regression     0.833      1.0      0.333     1.0       0.5
            XGBoost     0.917      1.0      0.000     0.0       0.0

 BEST MODEL: Random Forest
 F1-Score: 1.000, AUC-ROC: 1.000
 Using best model: Random Forest

 TOP PREDICTED FINALISTS:
--------------------------------------------------
   co Colombia          | Prob: 0.970 | ACTUAL FINALIST
   de Germany           | Prob: 0.970 | ACTUAL FINALIST
   nl Netherlands       | Prob: 0.950 | ACTUAL FINALIST
   be Belgium           | Prob: 0.920 | ACTUAL FINALIST
   fr France            | Prob: 0.860 | ACTUAL FINALIST
   ar Argentina     