1. Import Libraries

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor


2. Data Cleaning Functions

In [44]:
def clean_data(df):
    # Drop unnecessary columns
    df = df.drop(['Nationality', 'Overall', 'Club', 'Work Rate', 'Body Type',
                  'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
                  'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 
                  'GKReflexes', 'Release Clause', 'Positioning'], axis=1)

    # Convert categorical features
    label_encoder = LabelEncoder()
    df['Preferred Foot'] = label_encoder.fit_transform(df['Preferred Foot'])
    df['Position'] = label_encoder.fit_transform(df['Position'])
    
    return df


3. Conversion Functions

In [45]:
def convert_height_to_cm(height):
    try:
        feet, inches = map(int, height.split("'"))
        total_inches = feet * 12 + inches
        return round(total_inches * 2.54, 2)
    except Exception:
        return None

def convert_weight_to_kg(weight):
    try:
        return round(float(weight.replace('lbs', '').strip()) * 0.453592, 2)
    except Exception:
        return None

def convert_value_wage(value):
    if value[-1] == 'M':
        return float(value[1:-1]) * 1e6
    elif value[-1] == 'K':
        return float(value[1:-1]) * 1e3
    return float(value[1:])


In [46]:
def prepare_data_for_model(data):
    # Drop non-numeric columns
    data = data.drop(['Name'], axis=1, errors='ignore')  # Drop 'Name' if it exists
    return data


4. Feature Engineering Functions

In [47]:
def feature_engineering(df):
    df['Height'] = df['Height'].apply(convert_height_to_cm)
    df['Weight'] = df['Weight'].apply(convert_weight_to_kg)
    df['Value'] = df['Value'].apply(convert_value_wage).astype(int)
    df['Wage'] = df['Wage'].apply(convert_value_wage).astype(int)
    
    # Create additional features
    df['Fitness'] = df[['Acceleration', 'SprintSpeed', 'Agility', 'Reactions',
                         'Balance', 'Jumping', 'Stamina', 'Strength', 
                         'Aggression', 'Vision']].sum(axis=1)
    
    return df


5. Model Training Functions

In [48]:
def train_xgboost_model(x_train, y_train):
    xgboost_model = xgb.XGBRegressor(objective='reg:squarederror')
    param_grid = {
        'n_estimators': [200],
        'learning_rate': [0.05],
        'max_depth': [7],
        'subsample': [0.6],
        'colsample_bytree': [1.0],
        'reg_alpha': [0.1],
        'reg_lambda': [1.5]
    }
    
    grid_search = GridSearchCV(xgboost_model, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(x_train, y_train)
    
    return grid_search.best_estimator_

def train_random_forest_model(x_train, y_train):
    rf_model = RandomForestRegressor()
    rf_model.fit(x_train, y_train)
    return rf_model

def train_ridge_model(x_train, y_train):
    ridge_model = Ridge()
    ridge_model.fit(x_train, y_train)
    return ridge_model


6. Main Program Logic

Part 1: Predicting Potential

In [49]:
def main():
    # Load the dataset
    LB = pd.read_csv("LB_position.csv")

    # Clean the data
    LB = clean_data(LB)
    LB = feature_engineering(LB)

    # Define features and targets for potential prediction
    x_potential = LB.drop(['ID', 'Potential'], axis=1)
    y_potential = LB['Potential']

    # Ensure x_potential contains only numeric values
    x_potential = x_potential.select_dtypes(include=['number'])

    # Split data
    x_train_p, x_test_p, y_train_p, y_test_p = train_test_split(x_potential, y_potential, test_size=0.25, random_state=42)

    # Train models for potential prediction
    best_xgboost_model = train_xgboost_model(x_train_p, y_train_p)
    rf_model = train_random_forest_model(x_train_p, y_train_p)
    ridge_model = train_ridge_model(x_train_p, y_train_p)

    # Make predictions for potential
    pred_xgboost_p = best_xgboost_model.predict(x_test_p)
    pred_rf_p = rf_model.predict(x_test_p)
    pred_ridge_p = ridge_model.predict(x_test_p)

    # Hybrid prediction for potential
    hybrid_pred_p = predict_hybrid_model([pred_xgboost_p, pred_rf_p, pred_ridge_p])

    # Calculate R2 scores for potential
    print(f'XGBoost Potential Test R2: {r2_score(y_test_p, pred_xgboost_p)}')
    print(f'Random Forest Potential Test R2: {r2_score(y_test_p, pred_rf_p)}')
    print(f'Ridge Potential Test R2: {r2_score(y_test_p, pred_ridge_p)}')
    print(f'Hybrid Potential Test R2: {r2_score(y_test_p, hybrid_pred_p)}')

    LB.loc[x_test_p.index, 'Predicted Potential'] = hybrid_pred_p

if __name__ == "__main__":
    main()
    


XGBoost Potential Test R2: 0.9193024635314941
Random Forest Potential Test R2: 0.9077495966342167
Ridge Potential Test R2: 0.7701605974289529
Hybrid Potential Test R2: 0.8974249208412445


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Part 2: Predicting Value

In [50]:
# Define features and targets for value prediction
def main():
    # Load the dataset
    LB = pd.read_csv("LB_position.csv")

    # Clean the data
    LB = clean_data(LB)
    LB = feature_engineering(LB)

    x_value = LB[['International Reputation', 'Potential', 'Fitness', 'Skill Moves']]
    y_value = LB['Value']

    # Ensure x_value contains only numeric values
    x_value = x_value.select_dtypes(include=['number'])

    # Split data
    x_train_v, x_test_v, y_train_v, y_test_v = train_test_split(x_value, y_value, test_size=0.25, random_state=42)

    # Train models for value prediction
    best_xgboost_model_value = train_xgboost_model(x_train_v, y_train_v)
    rf_model_value = train_random_forest_model(x_train_v, y_train_v)
    ridge_model_value = train_ridge_model(x_train_v, y_train_v)

    # Make predictions for value
    pred_xgboost_v = best_xgboost_model_value.predict(x_test_v)
    pred_rf_v = rf_model_value.predict(x_test_v)
    pred_ridge_v = ridge_model_value.predict(x_test_v)

    # Hybrid prediction for value
    hybrid_pred_v = predict_hybrid_model([pred_xgboost_v, pred_rf_v, pred_ridge_v])

    # Calculate R2 scores for value
    print(f'XGBoost Value Test R2: {r2_score(y_test_v, pred_xgboost_v)}')
    print(f'Random Forest Value Test R2: {r2_score(y_test_v, pred_rf_v)}')
    print(f'Ridge Value Test R2: {r2_score(y_test_v, pred_ridge_v)}')
    print(f'Hybrid Value Test R2: {r2_score(y_test_v, hybrid_pred_v)}')

    # Add Predicted Value to the DataFrame
    LB.loc[x_test_v.index, 'Predicted Value'] = hybrid_pred_v

    return LB  # Return the DataFrame with predictions

if __name__ == "__main__":
    main()


XGBoost Value Test R2: 0.7490930557250977
Random Forest Value Test R2: 0.7358263868758599
Ridge Value Test R2: 0.5183634767244403
Hybrid Value Test R2: 0.7815672523647325


In [51]:


def display_top_player_info(LB):
    # Get the top player based on predicted potential
    top_player = LB.loc[LB['Predicted Potential'].idxmax()]
    print("Top Player's Information:")
    print(top_player.to_string())  # This prints all columns for the top player without the index

if __name__ == "__main__":

    # Now you can display the top player's information
    display_top_player_info(LB)



Top Player's Information:
ID                                  236295
Name                          Aarón Martín
Age                                     21
Potential                               85
Value                           11000000.0
Wage                                  €18K
Preferred Foot                           0
International Reputation               1.0
Weak Foot                              3.0
Skill Moves                            3.0
Position                                 0
Height                                5'11
Weight                              159lbs
Crossing                              76.0
Finishing                             33.0
HeadingAccuracy                       73.0
ShortPassing                          76.0
Volleys                               21.0
Dribbling                             71.0
Curve                                 62.0
FKAccuracy                            54.0
LongPassing                           61.0
BallControl                 