In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import os
import copy

In [11]:
# Add this cell before using train_and_evaluate_sheet

# Multipliers that apply_position_weights applies to each feature column,
# keyed first by position code ('OFF', 'MID', 'DEF') and then by feature name.
POSITION_FEATURE_WEIGHTS = {
    'OFF': dict(finishing=4, creativity=2, distribution=1, defense=1, duels=2),
    'MID': dict(finishing=2, creativity=3, distribution=3, defense=2, duels=2),
    'DEF': dict(finishing=1, creativity=1, distribution=2, defense=4, duels=3),
}

def apply_position_weights(df, sheet_name, feature_cols):
    """
    Scale feature columns by the position-specific weights for a sheet.

    The position is inferred from ``sheet_name`` with a case-insensitive
    substring match, tried in the order 'off', 'mid', 'def'; the first match
    wins. Weights are looked up in ``POSITION_FEATURE_WEIGHTS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified; all new columns go on a copy.
    sheet_name : str
        Name of the sheet the data came from; used only to pick the position.
    feature_cols : iterable of str
        Candidate feature column names.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The copied frame and the column names to train on:

        * If no position can be determined, a warning is printed and the
          untouched copy is returned together with a copy of ``feature_cols``.
        * Otherwise, each feature present in the frame that has a weight gets
          a new ``'<feature>_weighted'`` column (value * weight) and that new
          name is listed. A feature present in the frame but absent from the
          weight table is listed under its original name (implicit weight 1)
          and reported in a warning, instead of being silently dropped.
          Features that are not columns of the frame are skipped.
    """
    df_weighted = df.copy()

    # Determine position from sheet name (same precedence as before: off, mid, def)
    lowered_name = sheet_name.lower()
    position = next(
        (code for code in ('OFF', 'MID', 'DEF') if code.lower() in lowered_name),
        None,
    )
    if position is None:
        print(f"Warning: Could not determine position from sheet name '{sheet_name}'.")
        print("Using equal weights for all features.")
        # Return a copy so callers cannot mutate the list they passed in through us.
        return df_weighted, list(feature_cols)

    weights = POSITION_FEATURE_WEIGHTS[position]
    weighted_feature_cols = []
    unweighted_features = []

    for feature in feature_cols:
        if feature not in df_weighted.columns:
            continue
        if feature in weights:
            weighted_name = f'{feature}_weighted'
            df_weighted[weighted_name] = df_weighted[feature] * weights[feature]
            weighted_feature_cols.append(weighted_name)
        else:
            # No weight defined: keep the feature as-is rather than losing it.
            weighted_feature_cols.append(feature)
            unweighted_features.append(feature)

    if unweighted_features:
        print(f"Warning: No {position} weight defined for {unweighted_features}; keeping them unweighted.")

    return df_weighted, weighted_feature_cols

In [14]:
def plot_importance(model, feature_names, title):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Reads ``model.feature_importances_``, sorts it in ascending order and
    plots one bar per feature on a new 10x6 pyplot figure, labelling each bar
    with the matching entry of ``feature_names``. The figure is neither shown,
    saved nor closed here, and nothing is returned.
    """
    scores = model.feature_importances_
    order = np.argsort(scores)
    bar_positions = range(len(order))

    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, scores[order], align='center')
    # Labels follow the sorted order so each tick names the bar drawn at it.
    tick_labels = [feature_names[idx] for idx in order]
    plt.yticks(bar_positions, tick_labels)
    plt.title(title)
    plt.xlabel("Importance")
    plt.tight_layout()

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

def _fit_and_score_model(model_factory, X, y, label_prefix, sheet_name, feature_names):
    """Fit one model on an 80/20 split, print hold-out R²/RMSE and save its importance plot."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = model_factory()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    print(f"--- {sheet_name} - {label_prefix} ---")
    print(f"R²: {r2:.3f}, RMSE: {rmse:.3f}")
    plot_importance(model, feature_names, f"{label_prefix} ({sheet_name})")
    try:
        plt.savefig(f"{label_prefix.replace(' ', '_')}_{sheet_name.replace(' ', '_')}.png")
    finally:
        # Always release the figure, even if writing the PNG fails.
        plt.close()

    return model, r2, rmse, model.feature_importances_


def train_and_evaluate_sheet(df, sheet_name, feature_cols, target_fifa, target_real, use_weighted_features=True):
    """
    Train Random Forest and Gradient Boosting regressors for one sheet.

    For every target column that exists in ``df`` (``target_fifa`` first, then
    ``target_real``) a RandomForestRegressor and a GradientBoostingRegressor
    (both ``random_state=42``) are fitted on an 80/20 train/test split. Each
    fit prints its R² and RMSE and saves a feature-importance plot named
    ``'<label>_<sheet_name>.png'`` (spaces replaced by underscores) in the
    current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Data for one sheet.
    sheet_name : str
        Used in printed output, plot titles/file names, and to pick the
        position weights.
    feature_cols : iterable of str
        Requested feature columns; those absent from ``df`` are reported and
        ignored.
    target_fifa, target_real : str
        Names of the two target columns; either may be absent.
    use_weighted_features : bool, default True
        When True, features are passed through ``apply_position_weights``
        and the columns it returns are used, unless it returns none.

    Returns
    -------
    dict or None
        ``None`` if no requested feature or no target column exists.
        Otherwise a dict with ``'sheet_name'``, ``'feature_cols'`` (the
        columns actually trained on) and, for each trained combination
        ``<t>`` in {'fifa', 'real'} and ``<m>`` in {'rf', 'gb'}, the keys
        ``'<t>_<m>_model'``, ``'<t>_<m>_r2'``, ``'<t>_<m>_rmse'`` and
        ``'<t>_<m>_importances'``. A target with fewer than two complete
        rows is skipped with a message and contributes no keys.
    """
    print(f"\n===== Processing Sheet: {sheet_name} =====")

    missing_features = [col for col in feature_cols if col not in df.columns]
    if missing_features:
        print(f"Warning: Missing features in {sheet_name}: {missing_features}")
    feature_cols = [col for col in feature_cols if col in df.columns]
    if not feature_cols:
        print("No valid features found. Skipping.")
        return None

    # (result-key prefix, label prefix, column name), in the order they are trained.
    targets = [
        (key, label, column)
        for key, label, column in (('fifa', 'FIFA', target_fifa), ('real', 'Real', target_real))
        if column in df.columns
    ]
    if not targets:
        print(f"Skipping {sheet_name}: No target columns found.")
        return None

    X = df[feature_cols]
    features_used = feature_cols
    if use_weighted_features:
        # NOTE(review): tree-based models split on thresholds, so multiplying a
        # feature by a positive constant presumably leaves their predictions and
        # importances unchanged - confirm the weighting has the intended effect.
        df_weighted, weighted_feature_cols = apply_position_weights(df, sheet_name, feature_cols)
        if weighted_feature_cols:
            X = df_weighted[weighted_feature_cols]
            features_used = weighted_feature_cols

    model_factories = [
        ('rf', 'RF', lambda: RandomForestRegressor(random_state=42)),
        ('gb', 'GB', lambda: GradientBoostingRegressor(random_state=42)),
    ]

    results = {'sheet_name': sheet_name, 'feature_cols': features_used}

    for target_key, target_label, target_col in targets:
        y = df[target_col]

        # Rows with a missing feature or target value would make the fit fail;
        # drop them for this target only and say how many were lost.
        usable = X.notna().all(axis=1) & y.notna()
        n_dropped = int((~usable).sum())
        X_fit, y_fit = X, y
        if n_dropped:
            print(f"Dropping {n_dropped} rows with missing values for target '{target_col}'.")
            X_fit, y_fit = X[usable], y[usable]

        # train_test_split needs at least one row on each side of the split.
        if len(y_fit) < 2:
            print(f"Skipping target '{target_col}' in {sheet_name}: not enough complete rows.")
            continue

        for model_key, model_label, model_factory in model_factories:
            model, r2, rmse, importances = _fit_and_score_model(
                model_factory, X_fit, y_fit,
                f"{target_label} {model_label} Rating", sheet_name, features_used,
            )
            prefix = f'{target_key}_{model_key}'
            results.update({
                f'{prefix}_model': model,
                f'{prefix}_r2': r2,
                f'{prefix}_rmse': rmse,
                f'{prefix}_importances': importances,
            })

    return results

In [17]:
# Feature and target column names expected in every sheet.
FEATURE_COLS = ['finishing', 'creativity', 'distribution', 'defense', 'duels']
TARGET_FIFA = 'Fifa Ability Overall'
TARGET_REAL = 'Rating'

# File to process - change to your normalized or weighted file
FILE_PATH = '../notebooks/Composite_Features_Output_normalized.xlsx'  # or 'General_Weighted_Features.xlsx'

# Whether to apply position-specific weights
USE_WEIGHTED_FEATURES = True

# sheet_name=None loads every sheet into a {sheet name: DataFrame} mapping.
all_sheets = pd.read_excel(FILE_PATH, sheet_name=None)

# Process all sheets, collecting one result dict per sheet that could be trained.
results = []
for sheet_name, df in all_sheets.items():
    if df.empty:
        print(f"Sheet '{sheet_name}' is empty. Skipping.")
        continue
    result = train_and_evaluate_sheet(df, sheet_name, FEATURE_COLS, TARGET_FIFA, TARGET_REAL, USE_WEIGHTED_FEATURES)
    if result is not None:
        results.append(result)

# A second call that used to follow the loop omitted sheet_name, which shifted
# every argument by one position and replaced the `results` list with None.
# The loop above already processes every sheet, so that call was removed.

# One row per sheet with the R² / RMSE of every model that was trained.
summary = pd.DataFrame([
    {'sheet': res['sheet_name'],
     **{key: value for key, value in res.items() if key.endswith(('_r2', '_rmse'))}}
    for res in results
])

print("\nProcessing complete!")
if not summary.empty:
    print("Model performance summary:")
    print(summary.to_string(index=False))
print("Feature importance plots have been saved.")



===== Processing Sheet: Data =====
Using equal weights for all features.
--- Data - FIFA RF Rating ---
R²: 0.219, RMSE: 4.684
--- Data - FIFA GB Rating ---
R²: 0.246, RMSE: 4.600
--- Data - Real RF Rating ---
R²: 0.652, RMSE: 0.169
--- Data - Real GB Rating ---
R²: 0.682, RMSE: 0.161

===== Processing Sheet: DEF =====
--- DEF - FIFA RF Rating ---
R²: 0.126, RMSE: 4.990
--- DEF - FIFA GB Rating ---
R²: 0.196, RMSE: 4.787
--- DEF - Real RF Rating ---
R²: 0.551, RMSE: 0.174
--- DEF - Real GB Rating ---
R²: 0.512, RMSE: 0.181

===== Processing Sheet: MID =====
--- MID - FIFA RF Rating ---
R²: 0.244, RMSE: 4.316
--- MID - FIFA GB Rating ---
R²: 0.260, RMSE: 4.271
--- MID - Real RF Rating ---
R²: 0.701, RMSE: 0.165
--- MID - Real GB Rating ---
R²: 0.731, RMSE: 0.157

===== Processing Sheet: OFF =====
--- OFF - FIFA RF Rating ---
R²: 0.405, RMSE: 4.863
--- OFF - FIFA GB Rating ---
R²: 0.391, RMSE: 4.924
--- OFF - Real RF Rating ---
R²: 0.812, RMSE: 0.176
--- OFF - Real GB Rating ---
R²: 0.85