1. Import Libraries

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor


2. Data Cleaning Functions

In [24]:
def clean_data(df):
    """Drop unused columns and integer-encode the categorical ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``unused_columns`` as well as
        'Preferred Foot' and 'Position'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of ``drop``) without the unused columns and
        with 'Preferred Foot' and 'Position' replaced by LabelEncoder codes.

    Raises
    ------
    KeyError
        If any of the columns to drop or encode is missing.
    """
    unused_columns = [
        'Nationality', 'Overall', 'Club', 'Work Rate', 'Body Type',
        'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
        'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning',
        'GKReflexes', 'Release Clause', 'Positioning',
    ]
    df = df.drop(columns=unused_columns)

    # Each categorical column is fitted independently, so a fresh encoder per
    # column yields the same codes as refitting a shared one.
    for categorical_column in ('Preferred Foot', 'Position'):
        df[categorical_column] = LabelEncoder().fit_transform(df[categorical_column])

    return df


3. Conversion Functions

In [25]:
def convert_height_to_cm(height):
    """Convert a feet'inches string (e.g. "5'7") to centimetres.

    Returns the height rounded to two decimals, or None when the input
    cannot be parsed (wrong type, wrong number of parts, non-integer parts).
    """
    try:
        # Tuple-unpacking the lazy map keeps the "exactly two parts" check.
        feet, inches = map(int, height.split("'"))
        centimetres = (feet * 12 + inches) * 2.54
        return round(centimetres, 2)
    except Exception:
        # Best-effort conversion: any failure is reported as a missing value.
        return None

def convert_weight_to_kg(weight):
    """Convert a pounds string (e.g. "159lbs") to kilograms.

    Returns the weight rounded to two decimals, or None when the input
    cannot be parsed.
    """
    try:
        pounds = float(weight.replace('lbs', '').strip())
        kilograms = pounds * 0.453592
        return round(kilograms, 2)
    except Exception:
        # Best-effort conversion: any failure is reported as a missing value.
        return None

def convert_value_wage(value):
    """Convert a money string such as '€110.5M', '€565K' or '€0' to a float.

    A single leading non-numeric character (the currency symbol) is dropped
    only when present, so unprefixed inputs like '500K' are parsed correctly
    instead of silently losing their first digit. Surrounding whitespace and
    thousands separators (',') are ignored, matching the other value parser
    in this notebook.

    Parameters
    ----------
    value : str
        Amount with an optional one-character currency prefix and an optional
        'M' (millions) or 'K' (thousands) suffix.

    Returns
    -------
    float
        The amount in base currency units.

    Raises
    ------
    TypeError
        If ``value`` is not a string (e.g. a NaN from a missing cell).
    ValueError
        If ``value`` is empty or its numeric part cannot be parsed.
    """
    if not isinstance(value, str):
        raise TypeError(f"expected a string amount, got {type(value).__name__}")

    text = value.strip().replace(',', '')
    if not text:
        raise ValueError("empty amount string")

    # Drop the currency symbol only if the first character is not already
    # part of the number.
    if not (text[0].isdigit() or text[0] == '.'):
        text = text[1:]

    multipliers = {'M': 1e6, 'K': 1e3}
    scale = multipliers.get(text[-1:])
    if scale is not None:
        return float(text[:-1]) * scale
    return float(text)


In [26]:
def prepare_data_for_model(data):
    """Return a copy of ``data`` without the 'Name' column.

    The drop is tolerant: if 'Name' is absent the frame's columns are
    returned unchanged. The input frame itself is not modified.
    """
    return data.drop(columns=['Name'], errors='ignore')


4. Feature Engineering Functions

In [27]:
def feature_engineering(df):
    """Convert unit/money columns to numbers and add a 'Fitness' feature.

    Modifies ``df`` in place and also returns it:

    * 'Height' -> centimetres via ``convert_height_to_cm``
    * 'Weight' -> kilograms via ``convert_weight_to_kg``
    * 'Value', 'Wage' -> integers via ``convert_value_wage``
    * 'Fitness' -> row-wise sum of ten physical/mental attribute columns
    """
    df['Height'] = df['Height'].apply(convert_height_to_cm)
    df['Weight'] = df['Weight'].apply(convert_weight_to_kg)

    # Both money columns share the same string format and target dtype.
    for money_column in ('Value', 'Wage'):
        df[money_column] = df[money_column].apply(convert_value_wage).astype(int)

    fitness_columns = [
        'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
        'Jumping', 'Stamina', 'Strength', 'Aggression', 'Vision',
    ]
    df['Fitness'] = df[fitness_columns].sum(axis=1)

    return df


5. Model Training Functions

In [28]:
def train_xgboost_model(x_train, y_train):
    """Fit an XGBoost regressor through a 5-fold grid search and return it.

    The grid contains a single value per hyper-parameter, so the search
    evaluates exactly one configuration (scored by R2) and returns the
    estimator refitted by ``GridSearchCV``.
    """
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(objective='reg:squarederror'),
        param_grid={
            'n_estimators': [200],
            'learning_rate': [0.05],
            'max_depth': [7],
            'subsample': [0.6],
            'colsample_bytree': [1.0],
            'reg_alpha': [0.1],
            'reg_lambda': [1.5],
        },
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    return search.best_estimator_

def train_random_forest_model(x_train, y_train, random_state=42):
    """Fit a RandomForestRegressor with default hyper-parameters.

    Parameters
    ----------
    x_train, y_train
        Training features and target, passed straight to ``fit``.
    random_state : int or None, default 42
        Seed for the forest's bootstrap/feature sampling. Seeding makes the
        reported scores reproducible across runs; pass ``None`` to restore
        the previous unseeded behaviour.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    rf_model = RandomForestRegressor(random_state=random_state)
    rf_model.fit(x_train, y_train)
    return rf_model

def train_ridge_model(x_train, y_train):
    """Fit a Ridge regressor with default settings and return it."""
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly.
    return Ridge().fit(x_train, y_train)


6. Main Program Logic

Part 1: Predicting Potential

In [29]:

def clean_data(data):
    """Return ``data`` unchanged (placeholder cleaning step).

    NOTE(review): this redefines ``clean_data``. Once this cell runs, the
    earlier definition in section 2 (column drops and label encoding) is
    shadowed, so no cleaning is actually applied by this pipeline.
    """
    # Add your data cleaning logic here (e.g., handling missing values)
    return data

def feature_engineering(data):
    """Return ``data`` unchanged (placeholder feature-engineering step).

    NOTE(review): this redefines ``feature_engineering``. Once this cell runs,
    the earlier definition in section 4 (unit conversions and 'Fitness') is
    shadowed, so no features are added by this pipeline.
    """
    # Add your feature engineering logic here (e.g., creating new features)
    return data

def predict_hybrid_model(predictions):
    """Blend several models' predictions by taking their unweighted mean.

    ``predictions`` is a sequence of equally shaped prediction sets (arrays
    or scalars); the element-wise average is returned.
    """
    running_total = 0
    for model_output in predictions:
        running_total = running_total + model_output
    return running_total / len(predictions)

def main():
    """Train, evaluate and report the 'Potential' models for LB players.

    Reads "LB_position.csv", uses every numeric column except 'ID' and
    'Potential' as predictors, fits XGBoost, random-forest and ridge models on
    a 75/25 split, prints each model's test R2 together with the averaged
    (hybrid) prediction's R2, and finally prints the ten test players with the
    highest hybrid prediction. The CSV must contain a 'Name' column.
    """
    # Load the dataset and pass it through the cleaning / feature hooks.
    players = pd.read_csv("LB_position.csv")
    players = clean_data(players)
    players = feature_engineering(players)

    # Target and numeric-only predictors.
    target = players['Potential']
    features = players.drop(columns=['ID', 'Potential']).select_dtypes(include=['number'])

    # Hold out a quarter of the rows for evaluation.
    x_train_p, x_test_p, y_train_p, y_test_p = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # Fit the three base models.
    xgboost_fit = train_xgboost_model(x_train_p, y_train_p)
    forest_fit = train_random_forest_model(x_train_p, y_train_p)
    ridge_fit = train_ridge_model(x_train_p, y_train_p)

    # Score the held-out rows with each model, then blend.
    xgboost_out = xgboost_fit.predict(x_test_p)
    forest_out = forest_fit.predict(x_test_p)
    ridge_out = ridge_fit.predict(x_test_p)
    blended_out = predict_hybrid_model([xgboost_out, forest_out, ridge_out])

    # Report R2 on the test split.
    print(f'XGBoost Potential Test R2: {r2_score(y_test_p, xgboost_out)}')
    print(f'Random Forest Potential Test R2: {r2_score(y_test_p, forest_out)}')
    print(f'Ridge Potential Test R2: {r2_score(y_test_p, ridge_out)}')
    print(f'Hybrid Potential Test R2: {r2_score(y_test_p, blended_out)}')

    # Pair each test row's name with its blended prediction and true value.
    report = pd.DataFrame({
        'Name': players.loc[x_test_p.index, 'Name'],
        'Predicted Potential': blended_out,
        'Actual Potential': y_test_p,
    })

    best_ten = report.nlargest(10, 'Predicted Potential')
    print("\nTop 10 Players Based on Predicted Potential:")
    print(best_ten)

# Run the Part 1 (Potential) pipeline when this cell is executed as the main
# program; it uses the main() defined just above.
if __name__ == "__main__":
    main()


XGBoost Potential Test R2: 0.8717904686927795
Random Forest Potential Test R2: 0.8739011410724974
Ridge Potential Test R2: 0.8021997714204026
Hybrid Potential Test R2: 0.8773504958874885

Top 10 Players Based on Predicted Potential:
             Name  Predicted Potential  Actual Potential
49   Aarón Martín            83.358966                85
23   R. Rodríguez            82.802695                83
29          Jonny            82.383815                83
56        Wendell            81.871923                80
44      D. Laxalt            81.789879                80
107   M. Saracchi            81.715359                85
43      J. Mojica            81.653330                81
31   S. Kolašinac            81.286768                82
51       J. Amavi            80.905744                82
32      Mário Rui            80.750215                80


Part 2: Predicting Value

In [34]:

def clean_data(data):
    """Return ``data`` unchanged (placeholder cleaning step).

    NOTE(review): this is another redefinition of ``clean_data``; it replaces
    any earlier definition for code run after this cell.
    """
    # Add your data cleaning logic here
    return data

def feature_engineering(data):
    """Add a 'Fitness' column and convert 'Value' strings to floats.

    Modifies ``data`` in place and returns it. 'Fitness' is the element-wise
    sum of ten attribute columns; 'Value' strings such as '€1.5M' or '€500K'
    become plain numbers ('M' = millions, 'K' = thousands).
    """
    fitness_columns = [
        'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
        'Jumping', 'Stamina', 'Strength', 'Aggression', 'Vision',
    ]
    # Plain '+' accumulation (not DataFrame.sum) so missing values propagate.
    fitness = data[fitness_columns[0]]
    for column in fitness_columns[1:]:
        fitness = fitness + data[column]
    data['Fitness'] = fitness

    def convert_value(value):
        """Parse one '€…' amount string into a float."""
        # Decide the scale first; 'M' takes precedence over 'K'.
        if 'M' in value:
            suffix, scale = 'M', 1_000_000
        elif 'K' in value:
            suffix, scale = 'K', 1_000
        else:
            suffix, scale = None, None

        digits = value.replace('€', '')
        if suffix is not None:
            digits = digits.replace(suffix, '')
        amount = float(digits.replace(',', '').strip())

        return amount if scale is None else amount * scale

    data['Value'] = data['Value'].apply(convert_value)
    return data

def predict_hybrid_model(predictions):
    """Combine several models' predictions into their unweighted mean.

    ``predictions`` is a sequence of equally shaped prediction sets (arrays
    or scalars); the element-wise average is returned.
    """
    combined = 0
    for single_model_output in predictions:
        combined = combined + single_model_output
    return combined / len(predictions)

def main():
    """Train, evaluate and report the 'Value' models for LB players.

    Reads "LB_position.csv", predicts 'Value' from 'International Reputation',
    'Potential', 'Fitness' and 'Skill Moves', fits XGBoost, random-forest and
    ridge models on a 75/25 split, prints each model's test R2 together with
    the averaged (hybrid) prediction's R2, and finally prints the ten test
    players with the highest hybrid prediction. The CSV must contain a 'Name'
    column.
    """
    # Load the dataset and pass it through the cleaning / feature hooks.
    players = pd.read_csv("LB_position.csv")
    players = clean_data(players)
    players = feature_engineering(players)

    # Predictors (numeric ones only) and target.
    predictor_columns = ['International Reputation', 'Potential', 'Fitness', 'Skill Moves']
    features = players[predictor_columns].select_dtypes(include=['number'])
    target = players['Value']

    # Hold out a quarter of the rows for evaluation.
    x_train_v, x_test_v, y_train_v, y_test_v = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # Fit the three base models.
    xgboost_fit = train_xgboost_model(x_train_v, y_train_v)
    forest_fit = train_random_forest_model(x_train_v, y_train_v)
    ridge_fit = train_ridge_model(x_train_v, y_train_v)

    # Score the held-out rows with each model, then blend.
    xgboost_out = xgboost_fit.predict(x_test_v)
    forest_out = forest_fit.predict(x_test_v)
    ridge_out = ridge_fit.predict(x_test_v)
    blended_out = predict_hybrid_model([xgboost_out, forest_out, ridge_out])

    # Report R2 on the test split.
    print(f'XGBoost Value Test R2: {r2_score(y_test_v, xgboost_out)}')
    print(f'Random Forest Value Test R2: {r2_score(y_test_v, forest_out)}')
    print(f'Ridge Value Test R2: {r2_score(y_test_v, ridge_out)}')
    print(f'Hybrid Value Test R2: {r2_score(y_test_v, blended_out)}')

    # Pair each test row's name with its blended prediction and true value.
    report = pd.DataFrame({
        'Name': players.loc[x_test_v.index, 'Name'],
        'Predicted Value': blended_out,
        'Actual Value': y_test_v,
    })

    best_ten = report.nlargest(10, 'Predicted Value')
    print("\nTop 10 Players Based on Predicted Value:")
    print(best_ten)

# Run the Part 2 (Value) pipeline when this cell is executed as the main
# program; it uses the main() defined just above.
if __name__ == "__main__":
    main()


XGBoost Value Test R2: 0.7490930314645785
Random Forest Value Test R2: 0.7483544674541049
Ridge Value Test R2: 0.5183634757149471
Hybrid Value Test R2: 0.7856677125970721

Top 10 Players Based on Predicted Value:
               Name  Predicted Value  Actual Value
23     R. Rodríguez     1.382923e+07    15500000.0
29            Jonny     1.265402e+07    13500000.0
31     S. Kolašinac     1.216342e+07    13000000.0
107     M. Saracchi     1.183859e+07     9500000.0
32        Mário Rui     1.136720e+07    11500000.0
44        D. Laxalt     1.129535e+07    12500000.0
56          Wendell     1.113375e+07     9500000.0
30        J. Hector     1.068096e+07    10000000.0
51         J. Amavi     1.018841e+07    10000000.0
65   Alberto Moreno     9.410048e+06     9000000.0
