1. Import Libraries

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor


2. Data Cleaning Functions

In [69]:
def clean_data(df):
    """Remove columns the models do not use and integer-encode the categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Player table. Every column listed in ``unused_columns`` is passed to
        ``DataFrame.drop`` without ``errors='ignore'``, so a missing one raises.
        'Preferred Foot' and 'Position' must also be present.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``drop`` (not the input object), with
        'Preferred Foot' and 'Position' replaced by ``LabelEncoder`` codes.
    """
    unused_columns = [
        'Nationality', 'Overall', 'Club', 'Work Rate', 'Body Type',
        'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
        'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning',
        'GKReflexes', 'Release Clause', 'Positioning',
    ]
    trimmed = df.drop(columns=unused_columns)

    # One encoder object is refit for each column in turn, so the codes of the
    # two columns are independent of each other.
    encoder = LabelEncoder()
    for column in ('Preferred Foot', 'Position'):
        trimmed[column] = encoder.fit_transform(trimmed[column])

    return trimmed


3. Conversion Functions

In [70]:
def convert_height_to_cm(height):
    """Convert a feet'inches string such as "5'11" to centimetres.

    Returns the height rounded to two decimals, or ``None`` when the input
    cannot be parsed (not a string, wrong number of parts, non-integer parts).
    """
    try:
        parts = [int(piece) for piece in height.split("'")]
        feet, inches = parts  # anything other than exactly two parts raises
        centimetres = (feet * 12 + inches) * 2.54
        return round(centimetres, 2)
    except Exception:
        # Best-effort conversion: unparseable entries become missing values.
        return None

def convert_weight_to_kg(weight):
    """Convert a pounds string such as "159lbs" to kilograms.

    Returns the weight rounded to two decimals, or ``None`` when the input
    cannot be parsed.
    """
    try:
        pounds_text = weight.replace('lbs', '').strip()
        kilograms = float(pounds_text) * 0.453592
        return round(kilograms, 2)
    except Exception:
        # Best-effort conversion: unparseable entries become missing values.
        return None

def convert_value_wage(value):
    """Convert a money string such as "€110.5M", "€565K" or "€0" to a float.

    A trailing 'M' multiplies by one million and a trailing 'K' by one
    thousand. A single leading currency symbol is skipped; when the string
    already starts with part of the number (digit, sign or decimal point)
    nothing is skipped, so "500K" yields 500000.0 instead of losing its
    first digit.

    Parameters
    ----------
    value : str
        Non-empty amount string.

    Returns
    -------
    float

    Raises
    ------
    IndexError
        If ``value`` is an empty string.
    ValueError
        If the numeric part cannot be parsed by ``float``.
    """
    multipliers = {'M': 1e6, 'K': 1e3}

    # Skip the first character only when it is not part of the number.
    first = value[0]
    start = 0 if (first.isdigit() or first in '+-.') else 1

    scale = multipliers.get(value[-1])
    if scale is not None:
        return float(value[start:-1]) * scale
    return float(value[start:])


In [71]:
def prepare_data_for_model(data):
    """Return ``data`` without its 'Name' column.

    Only 'Name' is removed; other non-numeric columns are left in place. A
    frame that has no 'Name' column is accepted (``errors='ignore'``).
    """
    return data.drop(columns=['Name'], errors='ignore')


4. Feature Engineering Functions

In [72]:
def feature_engineering(df):
    """Convert the text measurements to numbers and add a 'Fitness' feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Height', 'Weight', 'Value' and 'Wage' still in their
        string form, plus the ten attribute columns summed into 'Fitness'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which 'Height' is in centimetres, 'Weight' in
        kilograms, 'Value' and 'Wage' are integers, and 'Fitness' is the
        row-wise sum of the ten attribute columns. The input frame is not
        modified.
    """
    # Work on a copy: converting in place would rewrite the caller's frame,
    # and a second call on that frame would then receive already-converted
    # numbers instead of the original strings.
    engineered = df.copy()

    engineered['Height'] = engineered['Height'].apply(convert_height_to_cm)
    engineered['Weight'] = engineered['Weight'].apply(convert_weight_to_kg)
    engineered['Value'] = engineered['Value'].apply(convert_value_wage).astype(int)
    engineered['Wage'] = engineered['Wage'].apply(convert_value_wage).astype(int)

    # Create additional features
    fitness_columns = ['Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
                       'Balance', 'Jumping', 'Stamina', 'Strength',
                       'Aggression', 'Vision']
    engineered['Fitness'] = engineered[fitness_columns].sum(axis=1)

    return engineered


5. Model Training Functions

In [73]:
def train_xgboost_model(x_train, y_train):
    """Fit an XGBoost regressor through a 5-fold grid search scored by R2.

    Every hyper-parameter in the grid currently has a single candidate value,
    so the search evaluates exactly one configuration.

    Returns the search's ``best_estimator_``.
    """
    search_space = dict(
        n_estimators=[200],
        learning_rate=[0.05],
        max_depth=[7],
        subsample=[0.6],
        colsample_bytree=[1.0],
        reg_alpha=[0.1],
        reg_lambda=[1.5],
    )

    base_regressor = xgb.XGBRegressor(objective='reg:squarederror')
    searcher = GridSearchCV(
        estimator=base_regressor,
        param_grid=search_space,
        scoring='r2',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    searcher.fit(x_train, y_train)

    return searcher.best_estimator_

def train_random_forest_model(x_train, y_train, random_state=42):
    """Fit a random-forest regressor with default hyper-parameters.

    Parameters
    ----------
    x_train, y_train
        Training features and target, passed straight to ``fit``.
    random_state : int or None, default 42
        Seed for the forest's randomness. It is fixed by default so that
        repeated runs give the same model and the same reported scores; the
        default matches the seed used for ``train_test_split`` in this
        notebook. Pass ``None`` for an unseeded forest.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    rf_model = RandomForestRegressor(random_state=random_state)
    rf_model.fit(x_train, y_train)
    return rf_model

def train_ridge_model(x_train, y_train):
    """Fit a ridge regressor with default settings and return the fitted model."""
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return Ridge().fit(x_train, y_train)


6. Main Program Logic

Part 1: Predicting Potential

In [74]:
def main():
    """Train the potential models, print their test R2 and return the data.

    Loads "LB_position.csv", cleans and feature-engineers it, fits the
    XGBoost, random-forest and ridge regressors on a 75/25 split with
    'Potential' as the target, and prints the test R2 of each model and of
    their unweighted average.

    Returns
    -------
    pandas.DataFrame
        The prepared frame with a 'Predicted Potential' column. Only the
        test-split rows receive a prediction; the remaining rows are NaN.
    """
    # Load the dataset
    LB = pd.read_csv("LB_position.csv")

    # Clean the data
    LB = clean_data(LB)
    LB = feature_engineering(LB)

    # Define features and targets for potential prediction
    x_potential = LB.drop(['ID', 'Potential'], axis=1)
    y_potential = LB['Potential']

    # Ensure x_potential contains only numeric values
    x_potential = x_potential.select_dtypes(include=['number'])

    # Split data
    x_train_p, x_test_p, y_train_p, y_test_p = train_test_split(
        x_potential, y_potential, test_size=0.25, random_state=42)

    # Train models for potential prediction
    best_xgboost_model = train_xgboost_model(x_train_p, y_train_p)
    rf_model = train_random_forest_model(x_train_p, y_train_p)
    ridge_model = train_ridge_model(x_train_p, y_train_p)

    # Make predictions for potential
    pred_xgboost_p = best_xgboost_model.predict(x_test_p)
    pred_rf_p = rf_model.predict(x_test_p)
    pred_ridge_p = ridge_model.predict(x_test_p)

    # Hybrid prediction for potential: unweighted mean of the three models
    hybrid_pred_p = (pred_xgboost_p + pred_rf_p + pred_ridge_p) / 3

    # Calculate R2 scores for potential
    print(f'XGBoost Potential Test R2: {r2_score(y_test_p, pred_xgboost_p)}')
    print(f'Random Forest Potential Test R2: {r2_score(y_test_p, pred_rf_p)}')
    print(f'Ridge Potential Test R2: {r2_score(y_test_p, pred_ridge_p)}')
    print(f'Hybrid Potential Test R2: {r2_score(y_test_p, hybrid_pred_p)}')

    # Add predictions to DataFrame (test rows only)
    LB.loc[x_test_p.index, 'Predicted Potential'] = hybrid_pred_p

    # Hand the frame back; otherwise the prediction column is lost when the
    # function returns.
    return LB

if __name__ == "__main__":
    # Bind the result at module level: the display cell further down reads
    # `LB` and its 'Predicted Potential' column.
    LB = main()
    


XGBoost Potential Test R2: 0.9193024635314941
Random Forest Potential Test R2: 0.9102804473410151
Ridge Potential Test R2: 0.7701605974289529
Hybrid Potential Test R2: 0.8978394819572967


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Part 2: Predicting Wage

In [75]:
# Define features and target for wage prediction
def main(LB=None):
    """Train the wage models, print their test R2 and return the data.

    Parameters
    ----------
    LB : pandas.DataFrame, optional
        A frame that has already been through ``clean_data`` and
        ``feature_engineering``. When given, a copy of it is used as-is, so
        any columns it already carries are kept. When omitted,
        "LB_position.csv" is loaded and prepared from scratch.

    Returns
    -------
    pandas.DataFrame
        The frame with a 'Predicted Wage' column. Only the test-split rows
        receive a prediction; the remaining rows are NaN.
    """
    if LB is None:
        # Load the dataset
        LB = pd.read_csv("LB_position.csv")

        # Clean the data
        LB = clean_data(LB)
        LB = feature_engineering(LB)
    else:
        # Already prepared: running the string converters again would fail on
        # the numeric columns, so only take a copy to avoid editing the
        # caller's frame.
        LB = LB.copy()

    x_Wage = LB[['International Reputation', 'Potential', 'Fitness', 'Skill Moves', 'Value']]
    y_Wage = LB['Wage']

    # Ensure x_Wage contains only numeric values
    x_Wage = x_Wage.select_dtypes(include=['number'])

    # Split data
    x_train_w, x_test_w, y_train_w, y_test_w = train_test_split(
        x_Wage, y_Wage, test_size=0.25, random_state=42)

    # Train models for Wage prediction
    best_xgboost_model_wage = train_xgboost_model(x_train_w, y_train_w)
    rf_model_wage = train_random_forest_model(x_train_w, y_train_w)
    ridge_model_wage = train_ridge_model(x_train_w, y_train_w)

    # Make predictions for Wage
    pred_xgboost_w = best_xgboost_model_wage.predict(x_test_w)
    pred_rf_w = rf_model_wage.predict(x_test_w)
    pred_ridge_w = ridge_model_wage.predict(x_test_w)

    # Hybrid prediction for Wage: unweighted mean of the three models
    hybrid_pred_w = (pred_xgboost_w + pred_rf_w + pred_ridge_w) / 3

    # Calculate R2 scores for Wage
    print(f'XGBoost Wage Test R2: {r2_score(y_test_w, pred_xgboost_w)}')
    print(f'Random Forest Wage Test R2: {r2_score(y_test_w, pred_rf_w)}')
    print(f'Ridge Wage Test R2: {r2_score(y_test_w, pred_ridge_w)}')
    print(f'Hybrid Wage Test R2: {r2_score(y_test_w, hybrid_pred_w)}')

    # Add Predicted Wage to the DataFrame (test rows only)
    LB.loc[x_test_w.index, 'Predicted Wage'] = hybrid_pred_w
    return LB  # Return the DataFrame with predictions

if __name__ == "__main__":
    # Continue from a module-level `LB` if an earlier cell left one, so its
    # columns are kept; otherwise start from the CSV. The result is bound to
    # `LB` instead of being thrown away.
    LB = main(globals().get("LB"))


XGBoost Wage Test R2: 0.6931073665618896
Random Forest Wage Test R2: 0.6688459763279542
Ridge Wage Test R2: 0.7034301313577702
Hybrid Wage Test R2: 0.7191279040448988


In [76]:


def display_top_player_info(LB):
    """Print the ten rows of ``LB`` with the largest 'Predicted Potential'.

    The rows are printed as a plain-text table without the index, preceded by
    a title line.
    """
    ranked = LB.nlargest(10, 'Predicted Potential')

    print("Top 10 Players' Information:")
    table_text = ranked.to_string(index=False)
    print(table_text)

if __name__ == "__main__":
    # Show the table for the module-level frame `LB`.
    display_top_player_info(LB)


Top 10 Players' Information:
    ID         Name  Age  Potential    Value  Wage  Preferred Foot  International Reputation  Weak Foot  Skill Moves  Position  Height  Weight  Crossing  Finishing  HeadingAccuracy  ShortPassing  Volleys  Dribbling  Curve  FKAccuracy  LongPassing  BallControl  Acceleration  SprintSpeed  Agility  Reactions  Balance  ShotPower  Jumping  Stamina  Strength  LongShots  Aggression  Interceptions  Vision  Penalties  Composure  Marking  StandingTackle  SlidingTackle  Fitness  Predicted Potential  Predicted Wage
236295 Aarón Martín   21         85 11000000 18000               0                       1.0        3.0          3.0         0  180.34   72.12      76.0       33.0             73.0          76.0     21.0       71.0   62.0        54.0         61.0         75.0          76.0         69.0     68.0       77.0     67.0       55.0     61.0     77.0      60.0       28.0        60.0           78.0    59.0       43.0       70.0     78.0            78.0           76.0