In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib
import os

def normalize_sheet(df, target_columns, sheet_name):
    """
    Min-max scale the numeric feature columns of a dataframe.

    Columns named in ``target_columns`` and all non-numeric columns are
    passed through unchanged. The returned frame keeps the input's row index
    and column order, so it lines up with the source sheet.

    Args:
        df: Input dataframe to normalize. It is not modified.
        target_columns: List of columns that should not be normalized.
            Names that are not present in ``df`` are ignored.
        sheet_name: Name of the sheet; used only in the warning message.

    Returns:
        Tuple ``(normalized_df, scaler)``. ``scaler`` is the fitted
        ``MinMaxScaler``; its feature order is the left-to-right order of the
        numeric non-target columns in ``df``. When there is no numeric
        non-target column, a warning is printed and ``(df, None)`` is
        returned (the very same ``df`` object, not a copy).
    """
    # Only targets that actually exist can be excluded; the rest are ignored.
    existing_targets = [col for col in target_columns if col in df.columns]

    features = df.drop(columns=existing_targets)
    numeric_cols = features.select_dtypes(include=['number']).columns

    if len(numeric_cols) == 0:
        print(f"Warning: No numeric columns found in sheet '{sheet_name}' to normalize")
        return df, None

    numeric = features[numeric_cols]
    # Fill gaps with the column mean before fitting. A column that is
    # entirely NaN has a NaN mean and therefore stays NaN.
    numeric = numeric.fillna(numeric.mean())

    scaler_X = MinMaxScaler()
    # np.asarray keeps the positional indexing below valid even if the scaler
    # is configured to return something other than a plain ndarray.
    scaled = np.asarray(scaler_X.fit_transform(numeric))

    # Overwrite only the scaled columns on a copy. Assigning raw arrays
    # (rather than concatenating frames) keeps the original index and column
    # order and avoids any index alignment.
    df_normalized = df.copy()
    for position, col in enumerate(numeric_cols):
        df_normalized[col] = scaled[:, position]

    return df_normalized, scaler_X

def process_excel_file(input_file, target_columns, output_file=None):
    """
    Process all sheets in an Excel file, normalizing each one.

    Every sheet of ``input_file`` is written to ``output_file`` under the same
    sheet name. Sheets with numeric feature columns are written normalized;
    empty sheets and sheets with nothing to normalize are written unchanged.
    For each normalized sheet the fitted scaler is also dumped with joblib to
    ``scaler_<sheet name with spaces replaced by underscores>.pkl`` in the
    current working directory.

    Args:
        input_file: Path to the input Excel file
        target_columns: List of columns that should not be normalized
        output_file: Path to save the normalized Excel file (default: adds '_normalized' suffix)

    Returns:
        Tuple ``(output_file, scalers)`` where ``scalers`` maps sheet name to
        the fitted scaler for every sheet that was normalized.
    """
    # Default output filename if not provided
    if output_file is None:
        name_parts = os.path.splitext(input_file)
        output_file = f"{name_parts[0]}_normalized{name_parts[1]}"

    # Load every sheet in a single pass BEFORE the writer is created. That way
    # a missing or unreadable input fails without leaving a broken output file
    # behind, the workbook is parsed once instead of once per sheet, and no
    # ExcelFile handle is left open.
    sheets = pd.read_excel(input_file, sheet_name=None)

    # Store scalers for each sheet
    scalers = {}

    print(f"Processing {len(sheets)} sheets from {input_file}...")

    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for sheet_name, df in sheets.items():
            print(f"Normalizing sheet: {sheet_name}")

            # Empty sheets are copied through so the output keeps every sheet.
            if df.empty:
                print(f"  Skipping empty sheet: {sheet_name}")
                df.to_excel(writer, sheet_name=sheet_name, index=False)
                continue

            df_normalized, scaler = normalize_sheet(df, target_columns, sheet_name)

            # No scaler means nothing was normalized: write the original data.
            sheet_out = df if scaler is None else df_normalized
            sheet_out.to_excel(writer, sheet_name=sheet_name, index=False)

            if scaler is not None:
                scalers[sheet_name] = scaler

                # Save the scaler for future use
                scaler_filename = f"scaler_{sheet_name.replace(' ', '_')}.pkl"
                joblib.dump(scaler, scaler_filename)
                print(f"  Saved scaler as {scaler_filename}")

    print(f"Normalized data saved to {output_file}")
    return output_file, scalers

In [2]:
# Normalize every sheet of the workbook. No output path is passed, so
# process_excel_file derives one by adding a '_normalized' suffix to the
# input file name.
input_file = 'Composite_Features_Output.xlsx'  # Replace with your input file
# Columns listed here are excluded from scaling and passed through as-is.
target_columns = ['Fifa Ability Overall', 'Rating']  # Replace with your target columns
# Returns the path of the written workbook and a dict of fitted scalers
# keyed by sheet name (only sheets that were actually normalized).
output_file, scalers = process_excel_file(input_file, target_columns)
print(f"Output file: {output_file}")

Processing 4 sheets from Composite_Features_Output.xlsx...
Normalizing sheet: Data
  Saved scaler as scaler_Data.pkl
Normalizing sheet: DEF
  Saved scaler as scaler_DEF.pkl
Normalizing sheet: MID
  Saved scaler as scaler_MID.pkl
Normalizing sheet: OFF
  Saved scaler as scaler_OFF.pkl
Normalized data saved to Composite_Features_Output_normalized.xlsx
Output file: Composite_Features_Output_normalized.xlsx
