In [384]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# import dataset

In [385]:
# Read the player statistics CSV (path is relative to the notebook's working directory).
csv_path = "players_season2022-2023.csv"
data = pd.read_csv(csv_path)
# Preview the first 10 rows.
data.head(10)

Unnamed: 0.1,Unnamed: 0,Player,shirt_number,Nation,Pos,Age,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,CrdR,Touches,Tkl,Int,Blocks
0,0,Odsonne Édouard,22,FRA,FW,24-201,57,0,0,0,0,3,1,0,0,20,0,0.0,0.0
1,1,Jean-Philippe Mateta,14,FRA,FW,25-038,33,0,0,0,0,1,0,0,0,13,0,0.0,0.0
2,2,Wilfried Zaha,11,CIV,LW,29-268,90,0,0,0,0,1,0,0,0,50,3,0.0,0.0
3,3,Jordan Ayew,9,GHA,"RW,AM",30-328,90,0,0,0,0,1,0,0,0,43,2,0.0,0.0
4,4,Eberechi Eze,10,ENG,AM,24-037,85,0,0,0,0,1,1,0,0,55,1,0.0,1.0
5,5,Malcolm Ebiowei,23,ENG,RW,18-335,5,0,0,0,0,0,0,0,0,5,0,0.0,0.0
6,6,Jeffrey Schlupp,15,GHA,DM,29-225,85,0,0,0,0,3,0,0,0,53,4,1.0,2.0
7,7,Will Hughes,19,ENG,DM,27-110,5,0,0,0,0,0,0,0,0,5,0,0.0,0.0
8,8,Cheick Doucouré,28,MLI,DM,22-209,74,0,0,0,0,0,0,0,0,39,1,1.0,1.0
9,9,Luka Milivojević,4,SRB,DM,31-120,16,0,0,0,0,0,0,0,0,17,0,0.0,0.0


# data preprocessing and cleaning

In [386]:
# convert features to numeric

# Mapping from each nationality value seen so far to its integer code, plus the
# next code to hand out. Both are module-level so codes stay stable across calls.
dectNationl = {}
idxNational = 0
def convertNationalityToInt(nation):
    """Return the integer code for ``nation``.

    The first time a value is seen it is assigned the next unused code
    (0, 1, 2, ... in order of first appearance) and recorded in
    ``dectNationl``; later calls with the same value return the same code.
    """
    global idxNational
    code = dectNationl.get(nation)
    if code is None:
        # Unseen value: claim the next free code and advance the counter.
        code = idxNational
        dectNationl[nation] = code
        idxNational = code + 1
    return code


# Mapping from each position value seen so far to its integer code, plus the
# next code to hand out. Both are module-level so codes stay stable across calls.
dectPostion = {}
idxPostion = 0
def convertPositionToInt(position):
    """Return the integer code for ``position``.

    The first time a value is seen it is assigned the next unused code
    (0, 1, 2, ... in order of first appearance) and recorded in
    ``dectPostion``; later calls with the same value return the same code.
    """
    global idxPostion
    code = dectPostion.get(position)
    if code is None:
        # Unseen value: claim the next free code and advance the counter.
        code = idxPostion
        dectPostion[position] = code
        idxPostion = code + 1
    return code


In [387]:
# Clean the raw frame in a single chain so every step works on a fresh copy.
# NOTE(review): the positional `.iloc[:, 1:]` presumably removes the index
# column that was saved into the CSV — confirm against the file header.
data = (
    data
    # remove irrelevant variables
    .drop(columns=["Player", "shirt_number"])
    # remove first column
    .iloc[:, 1:]
    # remove any rows with missing data; this must happen before Age is cast,
    # because astype(int) raises on missing values
    .dropna()
    # convert Age to numeric: values look like "24-201", keep the part before
    # the dash instead of assuming it is always exactly two characters wide
    .assign(Age=lambda frame: frame["Age"].str.split("-").str[0].astype(int))
)

# show first 10 rows
data.head(10)

Unnamed: 0,Nation,Pos,Age,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,CrdR,Touches,Tkl,Int,Blocks
0,FRA,FW,24,57,0,0,0,0,3,1,0,0,20,0,0.0,0.0
1,FRA,FW,25,33,0,0,0,0,1,0,0,0,13,0,0.0,0.0
2,CIV,LW,29,90,0,0,0,0,1,0,0,0,50,3,0.0,0.0
3,GHA,"RW,AM",30,90,0,0,0,0,1,0,0,0,43,2,0.0,0.0
4,ENG,AM,24,85,0,0,0,0,1,1,0,0,55,1,0.0,1.0
5,ENG,RW,18,5,0,0,0,0,0,0,0,0,5,0,0.0,0.0
6,GHA,DM,29,85,0,0,0,0,3,0,0,0,53,4,1.0,2.0
7,ENG,DM,27,5,0,0,0,0,0,0,0,0,5,0,0.0,0.0
8,MLI,DM,22,74,0,0,0,0,0,0,0,0,39,1,1.0,1.0
9,SRB,DM,31,16,0,0,0,0,0,0,0,0,17,0,0.0,0.0


# data visualization

In [388]:
# import matplotlib.pyplot as plt

# # Compute descriptive statistics
# print(data.describe())

# # Plot histograms
# data.hist(bins=20, figsize=(20,15))
# plt.show()

# # Plot scatter plots
# pd.plotting.scatter_matrix(data, figsize=(20,20))
# plt.show()

# # Compute correlation coefficients
# print(data.corr())

# build model

In [389]:
# Encode the categorical columns as integer codes with the helper functions
# defined earlier in the notebook. The dtype guard keeps this cell safe to
# re-run: without it, a second execution would feed the already-encoded
# integers back through the encoders and silently remap every value.
# NOTE(review): integer codes impose an arbitrary ordering on nominal
# categories, which a linear model treats as meaningful — consider one-hot
# encoding (pd.get_dummies) instead.
for column, encoder in (("Nation", convertNationalityToInt), ("Pos", convertPositionToInt)):
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].apply(encoder)
data = data.dropna()

# Input features: everything that is not a match statistic. Listed explicitly
# rather than as "all columns except ..." so the model inputs are visible here.
feature_columns = ["Nation", "Pos", "Age", "Min"]
# Output variables the model predicts jointly.
target_columns = ["Gls", "Ast", "Touches", "Tkl", "Sh"]

# convert to numpy arrays
X = data[feature_columns].to_numpy()
y = data[target_columns].to_numpy()
print(y[0])

# split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
data.head(10)


[ 0  0 20  0  3]
(7616, 4) (7616, 5)
(1905, 4) (1905, 5)


Unnamed: 0,Nation,Pos,Age,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,CrdR,Touches,Tkl,Int,Blocks
0,0,0,24,57,0,0,0,0,3,1,0,0,20,0,0.0,0.0
1,0,0,25,33,0,0,0,0,1,0,0,0,13,0,0.0,0.0
2,1,1,29,90,0,0,0,0,1,0,0,0,50,3,0.0,0.0
3,2,2,30,90,0,0,0,0,1,0,0,0,43,2,0.0,0.0
4,3,3,24,85,0,0,0,0,1,1,0,0,55,1,0.0,1.0
5,3,4,18,5,0,0,0,0,0,0,0,0,5,0,0.0,0.0
6,2,5,29,85,0,0,0,0,3,0,0,0,53,4,1.0,2.0
7,3,5,27,5,0,0,0,0,0,0,0,0,5,0,0.0,0.0
8,4,5,22,74,0,0,0,0,0,0,0,0,39,1,1.0,1.0
9,5,5,31,16,0,0,0,0,0,0,0,0,17,0,0.0,0.0


In [390]:
# Create a linear regression model and train it on the training data. `fit`
# returns the estimator itself, so the trailing expression shows the fitted
# model as the cell output.
model = LinearRegression().fit(X_train, y_train)
model


In [391]:
# Evaluate the fitted model on the held-out test split.
y_pred = model.predict(X_test)  # predictions for every target column

# Pooled metrics: scikit-learn averages these uniformly over the target columns.
mse = mean_squared_error(y_test, y_pred)  # mean squared error
rmse = np.sqrt(mse)  # root mean squared error
r2 = r2_score(y_test, y_pred)  # R-squared

# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so it must not be reported as a percentage
# of correct predictions. The variable name is kept so that any later cell
# reading `accuracy` keeps working.
accuracy = model.score(X_test, y_test)
print("R-squared (model.score): %.4f" % accuracy)
print("Mean squared error: %.2f" % mse)
print("Root mean squared error: %.2f" % rmse)
print("R-squared: %.2f" % r2)

# Per-target breakdown, in the column order of y. The pooled numbers above
# mix targets measured on different scales, so they say little about how well
# any single target is predicted.
per_target_rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput="raw_values"))
per_target_r2 = r2_score(y_test, y_pred, multioutput="raw_values")
print("Per-target RMSE:", np.round(per_target_rmse, 2))
print("Per-target R-squared:", np.round(per_target_r2, 2))

Accuracy: 14.14250403558101 %
Mean squared error: 65.23
Root mean squared error: 8.08
R-squared: 0.14
