In [1]:
#Creating Machine Learning Model For Premier League Midfielders

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load data
data = pd.read_csv('midfielders_PL_2425.csv')

# Define selected features
selected_features = [
    'Passes per 90', 'Progressive passes per 90', 'Accurate short / medium passes, %', 
    'Shot assists per 90', 'xA per 90', 'Smart passes per 90', 
    'Successful dribbles, %', 'Accelerations per 90', 
    'Duels won, %', 'Successful defensive actions per 90', 'pAdj Tkl+Int per 90', 
    'Shots per 90', 'npxG per 90', 'Goal conversion, %', 
    'Fouls per 90', 'Aerial duels won, %'
]

# Create a proxy value score
data['Proxy_Value_Score'] = (
    0.4 * data['npxG per 90'] +
    0.3 * data['Duels won, %'] +
    0.3 * data['xA per 90']
)

# Define features and target for the model
features = data[selected_features]
target = data['Proxy_Value_Score']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Predict values for the entire dataset and calculate residuals
data['Predicted_Proxy_Value'] = model.predict(features)
data['Residual'] = data['Proxy_Value_Score'] - data['Predicted_Proxy_Value']

# Identify top undervalued players
undervalued_players = data.sort_values(by='Residual', ascending=False)
#undervalued_players.head()

# Display results
print(undervalued_players[['Player', 'Team', 'Age', 'Residual']].head(10))
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

             Player               Team  Age  Residual
5          Y. Ayari           Brighton   21  0.159157
19  A. Mac Allister          Liverpool   25  0.106333
26        V. Janelt          Brentford   26  0.085296
8         C. Palmer            Chelsea   22  0.078269
12       M. Caicedo            Chelsea   23  0.069639
50         Casemiro  Manchester United   32  0.057537
48      A. Doucouré            Everton   31  0.054852
32  Andreas Pereira             Fulham   28  0.047676
35        D. Kamada     Crystal Palace   28  0.046981
31     Y. Tielemans        Aston Villa   27  0.042861
Mean Absolute Error: 0.0739873737373738
R-squared: 0.11653929565233667


In [3]:
undervalued_players.head()

Unnamed: 0.1,Unnamed: 0,Player,Age,League,Position,Team,Birth country,Non-penalty goals per 90,npxG per 90,"Successful dribbles, %",...,Smart passes per 90,xA per Shot Assist,Accelerations per 90,Aerial duels won per 90,Fouls suffered per 90,npxG per shot,Crosses per 90,Proxy_Value_Score,Predicted_Proxy_Value,Residual
5,5,Y. Ayari,21,Premier League 24-25,"LCMF, RDMF, LW",Brighton,Sweden,0.268519,0.574074,0.574074,...,0.361111,0.5,0.12037,0.444444,0.888889,0.37037,0.740741,0.685185,0.526028,0.159157
19,19,A. Mac Allister,25,Premier League 24-25,LDMF,Liverpool,Argentina,0.268519,0.388889,0.611111,...,0.564815,1.0,0.796296,0.425926,0.666667,0.648148,0.37037,0.525,0.418667,0.106333
26,26,V. Janelt,26,Premier League 24-25,"LDMF, DMF, LCMF",Brentford,Germany,0.62963,0.240741,0.972222,...,0.444444,0.685185,0.12037,0.537037,0.175926,0.388889,0.685185,0.557407,0.472111,0.085296
8,8,C. Palmer,22,Premier League 24-25,"AMF, CF, RAMF",Chelsea,England,1.0,0.981481,0.759259,...,0.796296,0.851852,0.898148,0.018519,0.925926,0.759259,0.814815,0.909259,0.830991,0.078269
12,12,M. Caicedo,23,Premier League 24-25,RDMF,Chelsea,Ecuador,0.555556,0.148148,0.555556,...,0.851852,0.787037,0.398148,0.37037,0.685185,0.185185,0.425926,0.478704,0.409065,0.069639


In [5]:
# Load the data (replace with your dataset's filename)
data = pd.read_csv('attackers_eliteserien.csv')

In [7]:
data.columns

Index(['Unnamed: 0', 'Player', 'Age', 'League', 'Position', 'Team',
       'Birth country', 'Non-penalty goals per 90', 'npxG per 90',
       'Successful dribbles, %', 'Goal conversion, %', 'Touches in box per 90',
       'Accurate short / medium passes, %', 'Passes per 90',
       'Shot assists per 90', 'xA per 90', 'Assists per 90',
       'Second assists per 90', 'Third assists per 90',
       'Progressive passes per 90', 'Progressive runs per 90', 'Duels won, %',
       'pAdj Tkl+Int per 90', 'Successful defensive actions per 90',
       'PAdj Sliding tackles', 'Defensive duels won, %', 'Fouls per 90',
       'Cards per 90', 'Shots blocked per 90', 'PAdj Interceptions',
       'Aerial duels won, %', 'Accurate long passes, %',
       '1st, 2nd, 3rd assists', 'Conceded goals per 90',
       'Prevented goals per 90', 'Shots against per 90', 'Save rate, %',
       'Exits per 90', 'Passes', 'Long Pass\nCmp %',
       '% of Passes\nBeing Short', '% of Passes\nBeing Lateral',
       'Rece

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load the data (replace with your dataset's filename)
data = pd.read_csv('attackers_eliteserien.csv')

# Define selected features for attackers
selected_features = [
    'npxG per 90', 'Shots per 90', 'Touches in box per 90', 
    'xA per 90', 'Goal conversion, %', 'Shot assists per 90', 
    'Accelerations per 90', 
    'Successful dribbles, %', 'Progressive runs per 90'
]

# Create a proxy value score for attackers
data['Proxy_Value_Score'] = (
    0.4 * data['npxG per 90'] +
    0.3 * data['Shots per 90'] +
    0.3 * data['Touches in box per 90']
)

# Define features and target for the model
features = data[selected_features]
target = data['Proxy_Value_Score']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train the Random Forest Regressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Predict values for the entire dataset and calculate residuals
data['Predicted_Proxy_Value'] = model.predict(features)
data['Residual'] = data['Proxy_Value_Score'] - data['Predicted_Proxy_Value']

# Identify top undervalued players
undervalued_players = data.sort_values(by='Residual', ascending=False)

# Display results
print(undervalued_players[['Player', 'Team', 'Age', 'Residual']].head(10))
print("Mean Absolute Error:", mae)
print("R-squared:", r2)


             Player          Team  Age  Residual
49        N. Castro         Brann   28  0.141203
32         J. Hauge  Bodø / Glimt   25  0.051254
5        H. Meister  Sarpsborg 08   20  0.049636
13          P. Dahl          KFUM   20  0.048195
25  T. Berg Haltvik          KFUM   24  0.046915
26     A. Mikkelsen  Bodø / Glimt   24  0.043839
3        M. Broholm     Rosenborg   19  0.037042
27  P. Christiansen        Viking   24  0.036178
52         B. Finne         Brann   29  0.033305
22          K. Høgh  Bodø / Glimt   23  0.031975
Mean Absolute Error: 0.05052824858757046
R-squared: 0.9340551915664271


In [19]:
data.head()

Unnamed: 0.1,Unnamed: 0,Player,Age,League,Position,Team,Birth country,Non-penalty goals per 90,npxG per 90,"Successful dribbles, %",...,Smart passes per 90,xA per Shot Assist,Accelerations per 90,Aerial duels won per 90,Fouls suffered per 90,npxG per shot,Crosses per 90,Proxy_Value_Score,Predicted_Proxy_Value,Residual
0,0,J. Reitan-Sunde,18,Eliteserien 2024,"LWF, RWF",Rosenborg,Norway,0.237288,0.186441,0.135593,...,0.262712,0.288136,0.847458,0.177966,0.847458,0.711864,0.779661,0.237288,0.303653,-0.066364
1,1,F. Tewelde,18,Eliteserien 2024,"CF, LW, LWF",Odds,Norway,0.70339,0.576271,0.389831,...,0.559322,0.372881,0.983051,0.59322,0.661017,0.372881,0.559322,0.520339,0.515458,0.004881
2,2,B. Bang-Kittilsen,19,Eliteserien 2024,"RWF, RW, LAMF",Odds,Norway,0.661017,0.440678,0.016949,...,0.127119,0.983051,0.245763,0.610169,0.864407,0.79661,0.271186,0.255085,0.288381,-0.033297
3,3,M. Broholm,19,Eliteserien 2024,"RWF, CF",Rosenborg,Norway,0.601695,0.20339,0.644068,...,0.813559,0.610169,0.79661,0.050847,0.661017,0.067797,0.864407,0.544068,0.507025,0.037042
4,4,H. Mikaelsson,20,Eliteserien 2024,CF,Kristiansund,Iceland,0.084746,0.338983,0.864407,...,0.127119,0.338983,0.423729,0.881356,0.432203,0.237288,0.245763,0.397458,0.410051,-0.012593


In [2]:
import pandas as pd
data = pd.read_csv('attackers_eliteserien.csv')

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Player,Age,League,Position,Team,Birth country,Non-penalty goals per 90,npxG per 90,"Successful dribbles, %",...,"Accurate passes, %",Shots per 90,"Accurate crosses, %",Smart passes per 90,xA per Shot Assist,Accelerations per 90,Aerial duels won per 90,Fouls suffered per 90,npxG per shot,Crosses per 90
0,0,J. Reitan-Sunde,18,Eliteserien 2024,"LWF, RWF",Rosenborg,Norway,0.237288,0.186441,0.135593,...,0.542373,0.050847,0.508475,0.262712,0.288136,0.847458,0.177966,0.847458,0.711864,0.779661
1,1,F. Tewelde,18,Eliteserien 2024,"CF, LW, LWF",Odds,Norway,0.70339,0.576271,0.389831,...,0.169492,0.677966,0.864407,0.559322,0.372881,0.983051,0.59322,0.661017,0.372881,0.559322
2,2,B. Bang-Kittilsen,19,Eliteserien 2024,"RWF, RW, LAMF",Odds,Norway,0.661017,0.440678,0.016949,...,0.20339,0.161017,0.067797,0.127119,0.983051,0.245763,0.610169,0.864407,0.79661,0.271186
3,3,M. Broholm,19,Eliteserien 2024,"RWF, CF",Rosenborg,Norway,0.601695,0.20339,0.644068,...,0.847458,0.79661,0.610169,0.813559,0.610169,0.79661,0.050847,0.661017,0.067797,0.864407
4,4,H. Mikaelsson,20,Eliteserien 2024,CF,Kristiansund,Iceland,0.084746,0.338983,0.864407,...,0.474576,0.508475,0.372881,0.127119,0.338983,0.423729,0.881356,0.432203,0.237288,0.245763


In [7]:
data.columns

Index(['Unnamed: 0', 'Player', 'Age', 'League', 'Position', 'Team',
       'Birth country', 'Non-penalty goals per 90', 'npxG per 90',
       'Successful dribbles, %', 'Goal conversion, %', 'Touches in box per 90',
       'Accurate short / medium passes, %', 'Passes per 90',
       'Shot assists per 90', 'xA per 90', 'Assists per 90',
       'Second assists per 90', 'Third assists per 90',
       'Progressive passes per 90', 'Progressive runs per 90', 'Duels won, %',
       'pAdj Tkl+Int per 90', 'Successful defensive actions per 90',
       'PAdj Sliding tackles', 'Defensive duels won, %', 'Fouls per 90',
       'Cards per 90', 'Shots blocked per 90', 'PAdj Interceptions',
       'Aerial duels won, %', 'Accurate long passes, %',
       '1st, 2nd, 3rd assists', 'Conceded goals per 90',
       'Prevented goals per 90', 'Shots against per 90', 'Save rate, %',
       'Exits per 90', 'Passes', 'Long Pass\nCmp %',
       '% of Passes\nBeing Short', '% of Passes\nBeing Lateral',
       'Rece

In [28]:
# Filter players under 25 years old
filtered_data = data[data['Age'] < 23]

# Isolate the 'Player' and '1st, 2nd, 3rd assists' columns and sort in ascending order
ass = filtered_data[['Player', '1st, 2nd, 3rd assists']].sort_values(by='1st, 2nd, 3rd assists', ascending=False)

# Display the result
ass.head()


Unnamed: 0,Player,"1st, 2nd, 3rd assists"
11,V. Halvorsen,0.932203
7,S. Ørjasæter,0.847458
6,O. Sivertsen,0.822034
3,M. Broholm,0.720339
14,J. Romsaas,0.635593
