In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor

import time

In [68]:
# Load the 2023 player table from CSV; row 0 of the file supplies the column names.
num_merged = pd.read_csv(
    'data/2023_num_merged_cleaned.csv',
    header=0,
)

In [69]:
# Show the loaded table as this cell's rich output.
num_merged

Unnamed: 0,games_2023,minutes_played_2023,goals_2023,assists_2023,goals_against_2023,goals_for_2023,clean_sheet_2023,position,sub_position,foot,height_in_cm,age,term_days_remaining,market_value_in_eur,highest_market_value_in_eur,yellow_cards_2023,red_cards_2023,current_club_domestic_competition_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1.0,191.0,37.0,67,400000.0,4000000.0,0.0,0.0,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1.0,189.0,37.0,67,1500000.0,8000000.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1.0,185.0,35.0,67,1200000.0,15000000.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,0.0,174.0,36.0,251,2000000.0,70000000.0,0.0,0.0,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1.0,186.0,35.0,67,800000.0,5000000.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1.0,186.0,30.0,67,14000000.0,40000000.0,0.0,0.0,1
3398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,8,1.0,175.0,29.0,67,28000000.0,45000000.0,0.0,0.0,2
3399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,9,0.0,180.0,22.0,1162,35000000.0,35000000.0,0.0,0.0,1
3400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,7,1.0,186.0,21.0,432,120000000.0,120000000.0,0.0,0.0,1


In [70]:
# The target is the player's market value; every other column is a feature.
y_col = 'market_value_in_eur'
merged_cols = np.array(num_merged.columns)
X_col = merged_cols[merged_cols != y_col]

# Feature matrix and target vector, taken from the same frame.
y = num_merged[y_col]
X = num_merged[X_col]

In [71]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=19,
)

## Attempt to GridSearch GradientBoostingRegressor

In [72]:
# Base gradient-boosting regressor handed to the grid search; the seed is fixed at 19.
gbr = GradientBoostingRegressor(random_state = 19)

In [73]:
# Search space for the GradientBoostingRegressor grid search.
# Every combination of the four lists below is evaluated.
gbr_param_grid = {
    # 100, 200, ..., 600 boosting stages
    'n_estimators': list(range(100, 700, 100)),
    'learning_rate': [
        0.01, 0.05, 0.1, 0.15,
        0.2, 0.25, 0.3,
    ],
    # tree depths 3 through 7
    'max_depth': list(range(3, 8)),
    'subsample': [0.5, 0.7, 1.0],
}

In [74]:
# Exhaustive search over gbr_param_grid with 5-fold cross-validation,
# evaluated by 4 parallel workers.
gbr_search = GridSearchCV(
    estimator=gbr,
    param_grid=gbr_param_grid,
    cv=5,
    n_jobs=4,
)

In [75]:
# Time the full grid search. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals, so the result is not affected
# by system clock adjustments the way time.time() can be.
start = time.perf_counter()

gbr_search.fit(X_train, y_train)

end = time.perf_counter()



In [76]:
# (end - start) is in seconds; dividing by 60 reports the duration in minutes.
print('time taken: ', (end - start)/60)

time taken:  10.034581780433655


In [77]:
# Hyperparameter combination selected by the GBR grid search.
best_params = gbr_search.best_params_

In [78]:
# Display the selected GBR hyperparameters.
best_params

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}

In [79]:
# Best estimator found by the search; used below for scoring and feature importances.
best_gbr = gbr_search.best_estimator_

In [80]:
# Evaluate on the held-out test split. For a scikit-learn regressor, score()
# returns R^2 (coefficient of determination), not a classification accuracy.
best_gbr_test_score = best_gbr.score(X_test, y_test)

In [81]:
# The value comes from regressor.score(), which is R^2, so it is labelled as
# R^2 rather than "accuracy" (a classification metric).
print('Best GBR Model R^2 Score: ', best_gbr_test_score)

Best GBR Model Accuracy Score:  0.8887433206671332


## Grid Search XGBoost

In [82]:
# Base XGBoost regressor built with library defaults; it is the estimator passed to the XGBoost grid search.
# NOTE(review): no random_state is passed here, unlike the GradientBoostingRegressor
# cell — confirm whether an explicit seed is wanted.
xgboost = XGBRegressor()

In [83]:
# Search space for the XGBRegressor grid search.
# 'objective' has a single candidate, so it is effectively fixed.
xg_grid = {
    'objective': ['reg:squarederror'],
    'n_estimators': [
        10, 30, 50, 70, 80,
        100, 150, 175, 200, 300,
    ],
    'learning_rate': [
        0.03, 0.05, 0.07,
        0.1, 0.15, 0.2,
    ],
    # depths and child weights both range over 3, 4, 5
    'max_depth': list(range(3, 6)),
    'min_child_weight': list(range(3, 6)),
    'subsample': [0.5, 0.7, 1.0],
}

In [84]:
# Exhaustive search over xg_grid with 5-fold cross-validation,
# evaluated by 4 parallel workers.
xgb_search = GridSearchCV(
    estimator=xgboost,
    param_grid=xg_grid,
    cv=5,
    n_jobs=4,
)

In [85]:
# Time the XGBoost grid search. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals, so the result is not affected
# by system clock adjustments the way time.time() can be.
start = time.perf_counter()

xgb_search.fit(X_train, y_train)

end = time.perf_counter()

In [86]:
# (end - start) is in seconds; dividing by 60 reports the duration in minutes.
print('time taken for xg regressor', (end-start)/60)

time taken for xg regressor 1.8839442054430644


In [87]:
# Hyperparameter combination selected by the XGBoost grid search.
# (The variable is spelled "xbg"; the display cell that follows uses the same spelling.)
xbg_best_params = xgb_search.best_params_

In [88]:
# Display the selected XGBoost hyperparameters.
xbg_best_params

{'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 4,
 'n_estimators': 80,
 'objective': 'reg:squarederror',
 'subsample': 0.7}

In [89]:
# Best XGBoost estimator found by the search; used below for scoring, importances and predictions.
xgb_best_model = xgb_search.best_estimator_

In [90]:
# Evaluate on the held-out test split. For a scikit-learn-style regressor,
# score() returns R^2 (coefficient of determination), not a classification accuracy.
xbg_score = xgb_best_model.score(X_test, y_test)

In [91]:
# The value comes from regressor.score(), which is R^2, so it is labelled as
# R^2 rather than "accuracy" (a classification metric).
print('Best XG Boost R^2 Score: ', xbg_score)

Best XG Boost Accuracy Score:  0.8837388921504618


## Load in 2022 Data


In [98]:
# Load the 2022 table and split it into features / target the same way as the
# 2023 data. NOTE: this rebinds merged_cols, X_col, X and y, which earlier
# referred to the 2023 frame; cells below that use these names see 2022 data.
y_col = 'market_value_in_eur'
last_year_df = pd.read_csv('data/2022_big5.csv')

merged_cols = np.array(last_year_df.columns)
X_col = merged_cols[merged_cols != y_col]

y = last_year_df[y_col]
X = last_year_df[X_col]

In [99]:
# R^2 of the tuned GBR on whatever X and y are currently bound to
# (the 2022 features/target when the cells are run in order).
previous_year_score_gbr = best_gbr.score(X,y)

In [100]:
# The value comes from regressor.score(), which is R^2, so it is labelled as
# R^2 rather than "accuracy" (a classification metric).
print('Best GBR Model R^2 on 2022 Data: ', previous_year_score_gbr)

Best GBR Model Accuracy on 2022 Data:  0.919657767996378


In [101]:
# R^2 of the tuned XGBoost model on whatever X and y are currently bound to
# (the 2022 features/target when the cells are run in order).
previous_year_score_xgb = xgb_best_model.score(X,y)

In [102]:
# The value comes from regressor.score(), which is R^2, so it is labelled as
# R^2 rather than "accuracy" (a classification metric).
print('Best XGB Model R^2 on 2022 Data: ', previous_year_score_xgb)

Best XGB Model Accuracy on 2022 Data:  0.9052360053152594


### Feature Weights

In [103]:
# Per-feature importances reported by the tuned GBR model.
gbr_weights = best_gbr.feature_importances_

In [104]:
# Pair each importance with the feature it was learned for. The names are taken
# from X_train (the frame the grid search was fitted on) instead of X_col:
# X_col is rebound to the 2022 file's columns by the time this cell runs, so it
# only lines up with the weights if both files share the same column order.
print("GBR Weights")
for feature_name, weight in zip(X_train.columns, gbr_weights):
    print(feature_name, 'feature weight: ', weight)

GBR Weights
games_2023 feature weight:  0.0
minutes_played_2023 feature weight:  0.0
goals_2023 feature weight:  0.0
assists_2023 feature weight:  0.0
goals_against_2023 feature weight:  0.0
goals_for_2023 feature weight:  0.0
clean_sheet_2023 feature weight:  0.0
position feature weight:  0.00121194383694798
sub_position feature weight:  0.007126990424443735
foot feature weight:  0.001247983845560803
height_in_cm feature weight:  0.0051233467105386956
age feature weight:  0.15905286920112535
term_days_remaining feature weight:  0.012586742800227466
highest_market_value_in_eur feature weight:  0.8068626927084439
yellow_cards_2023 feature weight:  0.0
red_cards_2023 feature weight:  0.0
current_club_domestic_competition_id feature weight:  0.006787430472712309


In [105]:
# Per-feature importances reported by the tuned XGBoost model.
xgb_weights = xgb_best_model.feature_importances_

In [106]:
# Pair each importance with the feature it was learned for. The names are taken
# from X_train (the frame the grid search was fitted on) instead of X_col:
# X_col is rebound to the 2022 file's columns by the time this cell runs, so it
# only lines up with the weights if both files share the same column order.
print("XGB Weights")
for feature_name, weight in zip(X_train.columns, xgb_weights):
    print(feature_name, 'feature weight: ', weight)

XGB Weights
games_2023 feature weight:  0.0
minutes_played_2023 feature weight:  0.0
goals_2023 feature weight:  0.0
assists_2023 feature weight:  0.0
goals_against_2023 feature weight:  0.0
goals_for_2023 feature weight:  0.0
clean_sheet_2023 feature weight:  0.0
position feature weight:  0.040939912
sub_position feature weight:  0.058903206
foot feature weight:  0.044227287
height_in_cm feature weight:  0.0517805
age feature weight:  0.17645197
term_days_remaining feature weight:  0.046783373
highest_market_value_in_eur feature weight:  0.5420865
yellow_cards_2023 feature weight:  0.0
red_cards_2023 feature weight:  0.0
current_club_domestic_competition_id feature weight:  0.03882733


## Read in Premade Team and Calculate Valuation with XGBoost Model

In [107]:
# Load the premade team from CSV; row 0 of the file supplies the column names.
ult_team = pd.read_csv(
    'data/ultimate_team.csv',
    header=0,
)

In [108]:
# Total of the market values listed in the team file.
ultimate_team_valuation = ult_team['market_value_in_eur'].sum()

In [109]:
# The '{:,}' format spec inserts thousands separators into the total.
print("Expected Team Valuation is ", '{:,}'.format(ultimate_team_valuation))

Expected Team Valuation is  870,000,000.0


In [110]:
# Split the team frame into features and target: the market value is the
# target and every other column is a feature.
y_col_ult = 'market_value_in_eur'
merged_cols_2 = np.array(ult_team.columns)

X_col_ult = merged_cols_2[merged_cols_2 != y_col_ult]
y_ult = ult_team[y_col_ult]
X_ult = ult_team[X_col_ult]

In [111]:
# Predict each team member's market value with the tuned XGBoost model.
predicted_valuation = xgb_best_model.predict(X_ult)

In [112]:
# Sum with NumPy and an explicit float64 accumulator instead of the built-in
# sum(): the predictions are a NumPy array (float32, judging by the printed
# values — confirm), and adding them one scalar at a time is slower and, depending on
# NumPy's scalar promotion rules, can accumulate in float32 and lose precision.
print("Predicted Team Valuation is ", '{:,}'.format(predicted_valuation.sum(dtype=np.float64)))

Predicted Team Valuation is  758,943,206.0


In [113]:
# Walk predictions and actual values together by position. zip() avoids
# y_ult[i], which is a label lookup on the Series and only coincides with
# position i while the index is the default 0..n-1 range.
for predicted_value, actual_value in zip(predicted_valuation, y_ult):
    print('Predicted Valuation: ', '{:,}'.format(predicted_value))
    print('Actual Valuation: ', '{:,}'.format(actual_value))
    print('\n')

Predicted Valuation:  66,298,996.0
Actual Valuation:  75,000,000.0


Predicted Valuation:  51,750,388.0
Actual Valuation:  70,000,000.0


Predicted Valuation:  33,619,220.0
Actual Valuation:  45,000,000.0


Predicted Valuation:  33,938,148.0
Actual Valuation:  60,000,000.0


Predicted Valuation:  89,788,784.0
Actual Valuation:  65,000,000.0


Predicted Valuation:  9,601,634.0
Actual Valuation:  7,000,000.0


Predicted Valuation:  103,352,488.0
Actual Valuation:  110,000,000.0


Predicted Valuation:  111,954,432.0
Actual Valuation:  170,000,000.0


Predicted Valuation:  99,626,680.0
Actual Valuation:  100,000,000.0


Predicted Valuation:  53,741,996.0
Actual Valuation:  48,000,000.0


Predicted Valuation:  105,270,440.0
Actual Valuation:  120,000,000.0




In [115]:
# Load the player name / market value table; row 0 of the file supplies the column names.
players = pd.read_csv(
    'data/name_plus_value.csv',
    header=0,
)

In [116]:
# Set the pandas display option for floats to use thousands separators.
# This is a global pandas option, so it affects DataFrame output shown after this cell.
pd.options.display.float_format = '{:,}'.format

In [117]:
# Show the name / market value table as this cell's rich output.
players

Unnamed: 0,name,market_value_in_eur
0,Rúben Dias,75000000.0
1,Mohamed Salah,70000000.0
2,Virgil van Dijk,45000000.0
3,Heung-min Son,60000000.0
4,Trent Alexander-Arnold,65000000.0
5,Manuel Neuer,7000000.0
6,Jamal Musiala,110000000.0
7,Erling Haaland,170000000.0
8,Pedri,100000000.0
9,Andrew Robertson,48000000.0
