In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score, accuracy_score
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
import time
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

## Read in the dataset and select the feature columns (zero-weighted columns are excluded; `highest_market_value_in_eur` is currently kept as a feature)

In [2]:
# Load the dataset from CSV; header=0 takes the column names from the first row.
num_merged = pd.read_csv("data/2023_num_merged_cleaned.csv", header=0)

In [3]:
# Every column name present in the loaded frame, in the frame's own order.
merged_cols = np.array(num_merged.columns)

# Columns used as model features.
# NOTE(review): 'highest_market_value_in_eur' is used as a feature here — confirm this
# is intended, since it may be closely tied to the target.
include = [
    'position', 'sub_position', 'foot', 'height_in_cm', 'age', 'term_days_remaining',
    'yellow_cards_2023', 'red_cards_2023', 'current_club_domestic_competition_id',
    'highest_market_value_in_eur',
]

# Fail loudly when an expected feature is absent: the membership filter below would
# otherwise drop it silently and the models would train on fewer columns.
available_cols = set(num_merged.columns)
missing_cols = [name for name in include if name not in available_cols]
if missing_cols:
    raise KeyError(f"Feature columns missing from dataset: {missing_cols}")

# Filtering `merged_cols` keeps the frame's column order rather than the order of `include`.
X_col = [name for name in merged_cols if name in include]
y_col = 'market_value_in_eur'


In [4]:
# Display the selected feature column names (in the frame's column order).
X_col

['position',
 'sub_position',
 'foot',
 'height_in_cm',
 'age',
 'term_days_remaining',
 'highest_market_value_in_eur',
 'yellow_cards_2023',
 'red_cards_2023',
 'current_club_domestic_competition_id']

In [5]:
# Feature matrix and target series, both sliced from the loaded frame.
X, y = num_merged[X_col], num_merged[y_col]

In [6]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=19,
)

In [7]:
# Collects each fitted model so they can all be scored in one loop later on.
model_list = []

In [8]:
# Linear-regression baseline; fit() returns the estimator, so construction and
# training can be chained.
linreg = LinearRegression().fit(X_train, y_train)
model_list.append(linreg)

In [9]:
# Random forest: 100 trees, depth capped at 5, seeded for reproducibility.
forest1 = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=19,
)
# Kept as the cell's last expression so the fitted estimator is still displayed.
forest1.fit(X_train, y_train)

In [10]:
# Register the random forest for the scoring loop.
# Re-running this cell on its own appends the same model a second time.
model_list.append(forest1)

In [11]:
# Single decision tree, depth capped at 6, seeded for reproducibility.
dtr1 = DecisionTreeRegressor(
    max_depth=6,
    random_state=19,
).fit(X_train, y_train)
model_list.append(dtr1)

In [12]:
# Gradient-boosted trees: 200 boosting stages of depth-5 trees at learning rate 0.1.
gbc1 = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=19,
).fit(X_train, y_train)
model_list.append(gbc1)

In [13]:
# XGBoost regressor with a squared-error objective and an L1 penalty.
# `reg_alpha` is the documented scikit-learn-API name for the L1 term (`alpha` is its
# native-API alias); `random_state` uses the same seed as the other models.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,
    random_state=19,
)
xg_reg.fit(X_train, y_train)
model_list.append(xg_reg)

In [14]:
# Score every collected model on the held-out split.
# NOTE: for scikit-learn style regressors, `score` returns the coefficient of
# determination (R^2), not a classification accuracy; the printed label is kept
# unchanged so it matches previously recorded output.
print("Accuracy on the 2023 data \n")
for model in model_list:
    model_name = type(model).__name__
    try:
        this_acc = model.score(X_test, y_test)
    except Exception as err:
        # Report why scoring failed instead of hiding it. Unlike a bare `except:`,
        # this still lets KeyboardInterrupt stop the cell.
        print(model_name, " can't do this:", repr(err))
    else:
        print(model_name, " accuracy: ", this_acc, "\n")


Accuracy on the 2023 data 

LinearRegression  accuracy:  0.8120771117088027 

RandomForestRegressor  accuracy:  0.8797724146825466 

DecisionTreeRegressor  accuracy:  0.8449129533789984 

GradientBoostingRegressor  accuracy:  0.8736470224988069 

XGBRegressor  accuracy:  0.8488410268661266 

