In [8]:
import pandas as pd

# Read the three CSV inputs from the working directory into DataFrames.
# NOTE(review): the recorded preview of shots_all.csv shows an 'Unnamed: 0' column, which
# looks like a row index that was written to the file — consider index_col=0 if confirmed.
players_df = pd.read_csv('euro2024_players.csv')
shots_df = pd.read_csv('shots_all.csv')
summary_df = pd.read_csv('summary.csv')

# Preview the first five rows of every frame. Each call prints the heading and the
# head() table separated by a newline, i.e. the same text as two consecutive prints.
print("Players Data:", players_df.head(), sep="\n")
print("\nShots Data:", shots_df.head(), sep="\n")
print("\nSummary Data:", summary_df.head(), sep="\n")

Players Data:
                    Name     Position  Age                 Club  Height  \
0  Marc-André ter Stegen   Goalkeeper   32         FC Barcelona     187   
1           Manuel Neuer   Goalkeeper   38        Bayern Munich     193   
2         Oliver Baumann   Goalkeeper   34  TSG 1899 Hoffenheim     187   
3     Nico Schlotterbeck  Centre-Back   24    Borussia Dortmund     191   
4           Jonathan Tah  Centre-Back   28  Bayer 04 Leverkusen     195   

    Foot  Caps  Goals  MarketValue  Country  
0  right    40      0     28000000  Germany  
1  right   119      0      4000000  Germany  
2  right     0      0      3000000  Germany  
3   left    12      0     40000000  Germany  
4  right    25      0     30000000  Germany  

Shots Data:
   Unnamed: 0             player        team  xg_shot  psxg_shot outcome  \
0           0      Florian Wirtz  de Germany     0.10       0.37    Goal   
1           1      Jamal Musiala  de Germany     0.18       0.34    Goal   
2           2     

In [9]:
# Display basic statistics and info about the data.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called bare here: wrapping it in print() appends a stray "None" line to every report.
# describe() returns a DataFrame, so it still needs print() to show up mid-cell.
print("Players Data Info:")
players_df.info()
print(players_df.describe())

print("\nShots Data Info:")
shots_df.info()
print(shots_df.describe())

print("\nSummary Data Info:")
summary_df.info()
print(summary_df.describe())

Players Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         623 non-null    object
 1   Position     623 non-null    object
 2   Age          623 non-null    int64 
 3   Club         623 non-null    object
 4   Height       623 non-null    int64 
 5   Foot         620 non-null    object
 6   Caps         623 non-null    int64 
 7   Goals        623 non-null    int64 
 8   MarketValue  623 non-null    int64 
 9   Country      623 non-null    object
dtypes: int64(5), object(5)
memory usage: 48.8+ KB
None
              Age      Height        Caps       Goals   MarketValue
count  623.000000  623.000000  623.000000  623.000000  6.230000e+02
mean    27.040128  184.181380   30.338684    4.152488  1.840903e+07
std      4.124275    6.569258   30.987902   10.086803  2.426195e+07
min     16.000000  167.000000    0.000000    0.000000  5

In [14]:
# Example of feature engineering: market value per goal.
# Goals is 0 for many players (the describe() minimum is 0), and dividing by zero would
# produce inf, which dropna()/isna() do not treat as missing and which estimators reject.
# Masking non-positive goal counts turns those ratios into NaN, i.e. "undefined".
players_df['value_per_goal'] = (
    players_df['MarketValue'] / players_df['Goals'].where(players_df['Goals'] > 0)
)

# Additional feature engineering steps here...

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define the features (X) and target (y).
# players_df has no 'assists', 'saves' or 'market_value' columns (its schema is Name,
# Position, Age, Club, Height, Foot, Caps, Goals, MarketValue, Country), so selecting them
# raises KeyError. The model is therefore built from numeric columns that do exist.
# 'value_per_goal' is deliberately left out: it is computed from MarketValue itself, so
# using it to predict MarketValue would leak the target into the features.
feature_cols = ['Age', 'Height', 'Caps', 'Goals']
X = players_df[feature_cols]
y = players_df['MarketValue']

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Perform 5-fold cross-validation of the regression model.
# Passing a plain integer as cv gives a regressor consecutive, unshuffled folds. The
# player preview shows rows grouped by country, so unshuffled folds would each hold out
# whole squads rather than a random sample. Shuffling with a fixed seed keeps the folds
# representative and reproducible, matching the seeded train/test split.
cv_scores = cross_val_score(
    model, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=42)
)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

In [None]:
# Build a modelling table by joining the summary stats to the player attributes.

# players_df['Name'] is an object (string) column; pandas refuses to merge a numeric key
# against an object key, so the summary-side key is cast to string first.
summary_df['shirtnumber'] = summary_df['shirtnumber'].astype(str)

# Merge summary data with players data to get market values
# NOTE(review): this matches summary_df['shirtnumber'] against players_df['Name']; a shirt
# number is unlikely to ever equal a player name — confirm which summary column actually
# identifies the player and use that as left_on.
merged_df = pd.merge(summary_df, players_df, left_on='shirtnumber', right_on='Name', how='left')

# A left join keeps unmatched summary rows with NaN in every player column. If nothing
# matched at all the target is entirely NaN, so stop here instead of modelling on it.
if not merged_df['MarketValue'].notna().any():
    raise ValueError("No summary rows matched a player; check the merge keys.")

# Calculate goals per minute; rows with zero minutes become NaN rather than inf,
# so the dropna() below removes them
merged_df['goals_per_minute'] = merged_df['goals'] / merged_df['minutes'].where(merged_df['minutes'] > 0)

# Example of feature engineering: Include assists, saves, shots, and other relevant features
features = ['goals_per_minute', 'assists', 'saves', 'shots', 'xg', 'npxg', 'key_passes', 'passes_completed']
# Drop rows with missing feature values, and rows with no market value (unmatched players)
merged_df = merged_df.dropna(subset=features + ['MarketValue'])

# Define the features (X) and target (y)
X = merged_df[features]
y = merged_df['MarketValue']

# Display the first few rows of the feature set
X.head(), y.head()

In [None]:
# Convert the 'shirtnumber' column to string so its dtype matches players_df['Name']
summary_df['shirtnumber'] = summary_df['shirtnumber'].astype(str)

# Merge summary data with players data to get market values
# NOTE(review): this matches summary_df['shirtnumber'] against players_df['Name']; a shirt
# number is unlikely to ever equal a player name — confirm which summary column actually
# identifies the player and use that as left_on.
merged_df = pd.merge(summary_df, players_df, left_on='shirtnumber', right_on='Name', how='left')

# A left join keeps unmatched summary rows with NaN in every player column. If nothing
# matched at all the target is entirely NaN, so stop here instead of modelling on it.
if not merged_df['MarketValue'].notna().any():
    raise ValueError("No summary rows matched a player; check the merge keys.")

# Calculate goals per minute; rows with zero minutes become NaN rather than inf,
# so the dropna() below removes them
# NOTE(review): 'goals_x' is not produced by this merge (the players column is 'Goals',
# so no suffix is applied) — it only exists if summary.csv already carries it; confirm.
merged_df['goals_per_minute'] = merged_df['goals_x'] / merged_df['minutes'].where(merged_df['minutes'] > 0)

# Example of feature engineering: Include assists, saves, shots, and other relevant features
features = ['goals_per_minute', 'assists', 'saves', 'shots', 'xg', 'npxg', 'key_passes', 'passes_completed']
# Drop rows with missing feature values, and rows with no market value (unmatched players)
merged_df = merged_df.dropna(subset=features + ['MarketValue'])

# Define the features (X) and target (y)
X = merged_df[features]
y = merged_df['MarketValue']

# Display the first few rows of the feature set
X.head(), y.head()

In [None]:
# Inspect columns in merged_df. As the cell's last expression, the column Index is
# shown as the cell output; useful for checking which names the merge produced.
merged_df.columns

In [None]:
# Print merged_df's column Index to stdout as plain text.
print(merged_df.columns)