Initial and create copy of data 

In [20]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
data = pd.read_csv("/home/antonios/Desktop/Practica_de_vara/data-science-internship2/data/cleaned/fifa_target_leakage_cleaned.csv")
fifa = data.copy()
fifa.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18943 entries, 0 to 18942
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   player_url               18943 non-null  object 
 1   dob                      18943 non-null  object 
 2   overall                  18943 non-null  int64  
 3   potential                18943 non-null  int64  
 4   value_eur                18943 non-null  int64  
 5   real_face                18943 non-null  bool   
 6   joined                   17960 non-null  object 
 7   passing                  16860 non-null  float64
 8   dribbling                16860 non-null  float64
 9   attacking_short_passing  18943 non-null  int64  
 10  movement_reactions       18943 non-null  int64  
 11  mentality_composure      18943 non-null  int64  
 12  lcm                      18943 non-null  int64  
dtypes: bool(1), float64(2), int64(7), object(3)
memory usage: 1.8+ MB


Encode the data

In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_categorical_variables_fifa(data, target_column='value_eur'):
    """
    Encode categorical variables for FIFA regression project
    Adapted for predicting value_eur
    """
    print("=== ENCODING CATEGORICAL VARIABLES FOR FIFA ===")
    print(f"Initial data shape: {data.shape}")
    
    data = data.copy()
    label_encoders = {}
    
    # Identify categorical columns (excluding target)
    categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
    
    # Remove target variable if present
    if target_column in categorical_columns:
        categorical_columns.remove(target_column)
    
    # Classify columns based on their type and content
    special_columns = []
    binary_categorical = []
    onehot_categorical = []
    date_columns = []
    
    for col in categorical_columns:
        if col in data.columns:
            unique_count = data[col].nunique()
            
            # Check for date columns
            if col in ['dob', 'joined'] or 'date' in col.lower():
                date_columns.append(col)
            
            # Check for URL columns (high cardinality, not useful)
                special_columns.append(col)
            
            # Binary categorical 
            elif unique_count == 2:
                binary_categorical.append(col)
            
            # Small categorical 
            elif unique_count <= 10:
                onehot_categorical.append(col)
            
            # Large categorical 
            else:
                if unique_count <= 50:
                    onehot_categorical.append(col)
                else:
                    special_columns.append(col)
    
    # HANDLE DATE COLUMNS
    for col in date_columns:
        if col in data.columns:
            # Convert to datetime
            data[col + '_datetime'] = pd.to_datetime(data[col], errors='coerce')
            
            # Extract useful features
            data[col + '_year'] = data[col + '_datetime'].dt.year
            data[col + '_month'] = data[col + '_datetime'].dt.month
            data[col + '_day_of_year'] = data[col + '_datetime'].dt.dayofyear
            
            # Calculate age for dob
            if col == 'dob':
                current_year = 2025
                data['calculated_age'] = current_year - data[col + '_year']
                print(f" Created calculated_age from {col}")
            
            # Calculate years since joining
            if col == 'joined':
                current_year = 2025
                data['years_since_joined'] = current_year - data[col + '_year']
                print(f" Created years_since_joined from {col}")
            
            # Remove original columns
            data = data.drop(columns=[col, col + '_datetime'])
    
    # REMOVE SPECIAL COLUMNS
    for col in special_columns:
        if col in data.columns:
            data = data.drop(columns=[col])
            print(f"🗑️  Removed {col}")
    
    # LABEL ENCODING FOR BINARY CATEGORICAL
    for col in binary_categorical:
        if col in data.columns:
            le = LabelEncoder()
            col_values = data[col].astype(str)
            data[col] = le.fit_transform(col_values)
            label_encoders[col] = le
            
            encoding_map = dict(zip(le.classes_, le.transform(le.classes_)))
            print(f" Label encoded {col}: {encoding_map}")
    
    # ONE-HOT ENCODING FOR MULTI-CLASS CATEGORICAL
    created_dummies = []
    
    for col in onehot_categorical:
        if col in data.columns:
            dummies = pd.get_dummies(data[col], prefix=col, drop_first=True, dummy_na=True)
            created_dummies.extend(dummies.columns.tolist())
            
            data = pd.concat([data, dummies], axis=1)
            print(f" One-hot encoded {col}: created {len(dummies.columns)} dummy variables")
    
    # Remove original categorical columns after one-hot encoding
    for col in onehot_categorical:
        if col in data.columns:
            data = data.drop(columns=[col])
    
    print("\n FINAL SUMMARY:")
    print(f"   • Final data shape: {data.shape}")
    print(f"   • Total new features created: {len(created_dummies) + len(date_columns)*4}")
    
    output_path = '/home/antonios/Desktop/Practica_de_vara/data-science-internship2/data/results/fifa_encoded.csv'
    data.to_csv(output_path, index=False)
    print(f"💾 Encoded data saved to: {output_path}")
    
    return data, label_encoders


fifa_encoded, encoders = encode_categorical_variables_fifa(fifa, target_column='value_eur')

=== ENCODING CATEGORICAL VARIABLES FOR FIFA ===
Initial data shape: (18943, 13)
 Created calculated_age from dob
 Created years_since_joined from joined
🗑️  Removed player_url

 FINAL SUMMARY:
   • Final data shape: (18943, 18)
   • Total new features created: 8
💾 Encoded data saved to: /home/antonios/Desktop/Practica_de_vara/data-science-internship2/data/results/fifa_encoded.csv


MODEL CRAFTING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

def simple_fifa_regression(data_path=None, data=None, target_column='value_eur'):
    """
    Simple regression model for FIFA player value prediction
    """
    print("=== FIFA REGRESSION MODEL ===")
    
    # Load data
    if data is None:
        if data_path is None:
            data_path = '/home/antonios/Desktop/Practica_de_vara/data-science-internship2/data/results/fifa_encoded.csv'
        data = pd.read_csv(data_path)
        print(f" Loaded data from: {data_path}")
    
    print(f" Data shape: {data.shape}")
    
    # Prepare features and target
    if target_column not in data.columns:
        print(f" Error: Target column '{target_column}' not found!")
        return None
    
    # Separate features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    print(f" Target: {target_column}")
    print(f" Features: {X.shape[1]} columns")
    print(f" Target stats: Min={y.min():,.0f}, Max={y.max():,.0f}, Mean={y.mean():,.0f}")
    
    # Handle any remaining missing values
    X = X.fillna(X.mean())
    y = y.fillna(y.mean())
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    print(f" Train set: {X_train.shape[0]} samples")
    print(f" Test set: {X_test.shape[0]} samples")
    
    # Scale features for Linear Regression
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Model 1: Linear Regression
    print("\n Training Linear Regression...")
    lr_model = LinearRegression()
    lr_model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred_lr = lr_model.predict(X_test_scaled)
    
    # Evaluate Linear Regression
    lr_r2 = r2_score(y_test, y_pred_lr)
    lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
    lr_mae = mean_absolute_error(y_test, y_pred_lr)
    
    print(" Linear Regression Results:")
    print(f"   R² Score: {lr_r2:.4f}")
    print(f"   RMSE: €{lr_rmse:,.0f}")
    print(f"   MAE: €{lr_mae:,.0f}")
    
    # Model 2: Random Forest (doesn't need scaling)
    print("\n Training Random Forest...")
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Predictions
    y_pred_rf = rf_model.predict(X_test)
    
    # Evaluate Random Forest
    rf_r2 = r2_score(y_test, y_pred_rf)
    rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
    rf_mae = mean_absolute_error(y_test, y_pred_rf)
    
    print(" Random Forest Results:")
    print(f"   R² Score: {rf_r2:.4f}")
    print(f"   RMSE: €{rf_rmse:,.0f}")
    print(f"   MAE: €{rf_mae:,.0f}")
    
    # Compare models
    print("\n MODEL COMPARISON:")
    if rf_r2 > lr_r2:
        print(f" Best Model: Random Forest (R² = {rf_r2:.4f})")
        best_predictions = y_pred_rf
        best_name = "Random Forest"
    else:
        print(f" Best Model: Linear Regression (R² = {lr_r2:.4f})")
        best_predictions = y_pred_lr
        best_name = "Linear Regression"
    
    # Feature importance (Random Forest only)
    if hasattr(rf_model, 'feature_importances_'):
        print("\n TOP 10 MOST IMPORTANT FEATURES:")
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
            print(f"   {i+1:2d}. {row['feature']:<25} {row['importance']:.4f}")
    
    print("\n SAMPLE PREDICTIONS vs ACTUAL:")
    sample_indices = np.random.choice(len(y_test), 5, replace=False)
    for i, idx in enumerate(sample_indices):
        actual = y_test.iloc[idx]
        predicted = best_predictions[idx]
        error = abs(actual - predicted)
        error_pct = (error / actual) * 100 if actual > 0 else 0
        
        print(f"   {i+1}. Actual: €{actual:>9,.0f} | Predicted: €{predicted:>9,.0f} | Error: {error_pct:5.1f}%")
    
    # Save results
    results_path = '/home/antonios/Desktop/Practica_de_vara/data-science-internship2/data/results/fifa_regression_results.txt'
    with open(results_path, 'w') as f:
        f.write("FIFA Player Value Prediction Results\n")
        f.write("="*40 + "\n\n")
        f.write(f"Dataset: {data.shape[0]} players, {X.shape[1]} features\n\n")
        f.write("Linear Regression:\n")
        f.write(f"  R² Score: {lr_r2:.4f}\n")
        f.write(f"  RMSE: €{lr_rmse:,.0f}\n")
        f.write(f"  MAE: €{lr_mae:,.0f}\n\n")
        f.write("Random Forest:\n")
        f.write(f"  R² Score: {rf_r2:.4f}\n")
        f.write(f"  RMSE: €{rf_rmse:,.0f}\n")
        f.write(f"  MAE: €{rf_mae:,.0f}\n\n")
        f.write(f"Best Model: {best_name}\n")
    
    print(f"\n💾 Results saved to: {results_path}")
    
    return {
        'linear_regression': {
            'model': lr_model,
            'scaler': scaler,
            'r2': lr_r2,
            'rmse': lr_rmse,
            'mae': lr_mae
        },
        'random_forest': {
            'model': rf_model,
            'r2': rf_r2,
            'rmse': rf_rmse,
            'mae': rf_mae
        },
        'test_data': (X_test, y_test),
        'feature_names': X.columns.tolist()
    }


results = simple_fifa_regression()



=== FIFA REGRESSION MODEL ===
📁 Loaded data from: /home/antonios/Desktop/Practica_de_vara/data-science-internship2/data/results/fifa_encoded.csv
📊 Data shape: (18943, 18)
 Target: value_eur
 Features: 17 columns
 Target stats: Min=0, Max=4,050,000, Mean=1,265,326
 Train set: 15154 samples
 Test set: 3789 samples



 Training Linear Regression...
 Linear Regression Results:
   R² Score: 0.7589
   RMSE: €664,627
   MAE: €532,395

 Training Random Forest...
 Random Forest Results:
   R² Score: 0.9663
   RMSE: €248,511
   MAE: €69,783

 MODEL COMPARISON:
 Best Model: Random Forest (R² = 0.9663)

 TOP 10 MOST IMPORTANT FEATURES:
    1. overall                   0.8604
    2. potential                 0.0703
    3. dob_year                  0.0180
    4. calculated_age            0.0178
    5. joined_month              0.0054
    6. joined_day_of_year        0.0053
    7. dribbling                 0.0040
    8. movement_reactions        0.0037
    9. mentality_composure       0.0033
   10. lcm                       0.0027

 SAMPLE PREDICTIONS vs ACTUAL:
   1. Actual: €  230,000 | Predicted: €  250,350 | Error:   8.8%
   2. Actual: €1,000,000 | Predicted: €  980,500 | Error:   1.9%
   3. Actual: €1,200,000 | Predicted: €1,203,000 | Error:   0.2%
   4. Actual: €4,050,000 | Predicted: €4,050,000 | Error: