In [2]:
%pip install pandas matplotlib




In [3]:
import json
import textwrap
import pandas as pd
import matplotlib.pyplot as plt
import os

DATA_DIR = '..\processed_data\\'
with open('Appendix.json', 'r') as f:
    football_stats_dict = json.load(f)

def plot_scatter_for_columns(file_path: str) -> str:
    '''
    Plots scatter plots for all numeric columns in a CSV file.
    The scatter plots show the values of the columns and the mean and standard deviation.
    The plots are saved to a file with the same name as the input file, but with '_scatter_plots.png' appended.
    Plots are saved in the 'scatter_plots' folder. 
    
    Parameters:
        file_path (str): The path to the CSV file.

    Returns:
        str: The path to the output file with the scatter plots.
    '''
    df = pd.read_csv(file_path)

    num_columns = len(df.columns)
    num_rows = (num_columns + 1) // 2  # Adjust the number of rows
    plt.figure(figsize=(12, num_rows * 4))
    
    for i, column in enumerate(df.columns, 1):
        if pd.api.types.is_numeric_dtype(df[column]):
            plt.subplot(num_rows, 2, i)
            plt.scatter(range(len(df[column])), df[column], label=column)
            wrapped_title = "\n".join(textwrap.wrap(football_stats_dict[column], width=40))  
            plt.title(wrapped_title+ ' in ' + file_path.split('\\')[-1])
            plt.xlabel('Index')
            plt.ylabel(column)
            plt.axhline(df[column].mean(), color='r', linestyle='--', label='Mean')
            plt.axhline(df[column].mean() + df[column].std(), color='g', linestyle='--', label='Mean + 1 STD')
            plt.axhline(df[column].mean() - df[column].std(), color='g', linestyle='--', label='Mean - 1 STD')
            plt.legend()

    plt.tight_layout()
    output_dir = os.path.join(DATA_DIR, 'scatter_plots')
    os.makedirs(output_dir, exist_ok=True)  
    output_path = os.path.join(output_dir, os.path.basename(file_path).replace('.csv', '_scatter_plots.png'))
    plt.savefig(output_path)
    # plt.show() # Uncomment to show the plots 
    plt.close()
    return os.path.basename(file_path).replace( '.csv',''), df


# Apply the function to each file
files = os.listdir(DATA_DIR)
df_dict = {} 
for file in files:
    if file.endswith('.csv'):
        df_name, df = plot_scatter_for_columns(os.path.join(DATA_DIR, file))
        df_dict[df_name] = df

df_dict.keys() 

# plots can be found in the processed_data/scatter_plots folder
# df_dict contains the dataframes for each file


dict_keys(['Attacking Midfield_players', 'Back_players', 'Centre-Back_players', 'Centre-Forward_players', 'combined', 'Defensive Midfield_players', 'Goalkeeper_players', 'Winger_players'])

In [4]:
# data cleaning

# PasCrs (Crosses) seemed to be very different in each season, so we decided to remove it
for key in df_dict:
    df_dict[key].drop(columns=['PasCrs'], inplace=True)

# In Back_players the ScaDrib (Successful dribbles that lead to a shot attempt) has a outlier
df_meam = df_dict['Back_players']['ScaDrib'].mean()
df_dict['Back_players'].loc[df_dict['Back_players']['ScaDrib'] > 9, 'ScaDrib'] = df_meam

# In Ceneter_forwards_players the Err (Mistakes leading to an opponent's shot) has a outlier
df_meam = df_dict['Centre-Forward_players']['Err'].mean()
df_dict['Centre-Forward_players'].loc[df_dict['Centre-Forward_players']['Err'] > 4, 'Err'] = df_meam

# we also found that Goals was collected differently in each season, so we fixed it in the extract_data.py file


In [5]:
# from sklearn.linear_model import Ridge
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

# # Separate features and target variable
# X = df.drop(['Valuation', 'Date', 'Position'], axis=1)
# y = df['Valuation']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Fit the Ridge regression model
# ridge_model = Ridge(alpha=1.0)
# ridge_model.fit(X_train_scaled, y_train)

# # Predict the valuation for the 9th row
# predicted_value = ridge_model.predict(scaler.transform(X.iloc[[9]]))[0]

# print(f"actual value: {df.loc[9, 'Valuation']}")
# print(f"predicted value: {predicted_value}")

In [6]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import patsy
import pandas as pd

X = df_dict['Centre-Forward_players'].drop([ 'Date', 'Position'], axis=1)
columns = X.columns

# Create the formula dynamically
formula = 'Q("Valuation") ~ ' + ' + '.join([f'Q("{col}")' for col in columns if col != 'Valuation'])

linearRegressionModel = ols(formula=formula, data=X).fit()


In [7]:

# Test the model, let's predict the valuation for the 9th row
test_row = 123
print("Testing the model:\n")
actual_value = X.loc[test_row, 'Valuation']
predicted_value = linearRegressionModel.predict(X.drop(['Valuation'], axis=1).iloc[[test_row]]).values[0]

print(f"actual value: {actual_value}")
print(f"predic value: {predicted_value}")
print("\nLooks like the model is doing a good job predicting the valuation, but let's check some more analysis\n")

Testing the model:

actual value: 1200000.0
predic value: 149717.25597965717

Looks like the model is doing a good job predicting the valuation, but let's check some more analysis



In [8]:
%pip install tensorflow 





In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Separate features and target variable
X = df.drop(['Valuation', 'Date', 'Position'], axis=1)
y = df['Valuation']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the neural network model
model = Sequential()

# Add input layer and first hidden layer
model.add(Dense(units=64, activation='relu', input_shape=(X_train_scaled.shape[1],)))

# Add second hidden layer
model.add(Dense(units=128, activation='relu'))

# Add second hidden layer
model.add(Dense(units=32, activation='relu'))

# Add output layer
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x1b3a0ebda20>

In [10]:

# Predict the valuation for the 9th row
predicted_value = model.predict(scaler.transform(X.iloc[[123]]))[0][0]

print(f"actual value: {df.loc[123, 'Valuation']}")
print(f"predic value: {predicted_value}")


actual value: 450000.0
predic value: 12254929.0


In [12]:
'''PCA'''
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.decomposition import PCA

# Separate features and target variable
X = df.drop(['Valuation', 'Date', 'Position'], axis=1)
y = df['Valuation']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#PCA
pca = PCA(.95) # 95% of the variance.
pca.fit(X_train_scaled)
print(pca.n_components_, 'components')
X_train_reduced_dim = pca.transform(X_train_scaled)
X_test_reduced_dim = pca.transform(X_test_scaled)

# Build the neural network model
model = Sequential()

# Add input layer and first hidden layer
model.add(Dense(units=64, activation='relu', input_shape=(X_train_reduced_dim.shape[1],)))

# Add second hidden layer
model.add(Dense(units=128, activation='relu'))

# Add second hidden layer
model.add(Dense(units=32, activation='relu'))

# Add output layer
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_reduced_dim, y_train, epochs=100, batch_size=10, validation_split=0.2, verbose=1)

52 components
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 7

<keras.src.callbacks.History at 0x1b3a2761db0>

In [29]:
X_test_reduced_dim.shape


(108, 52)

In [24]:

# Predict the valuation for the 9th row
predicted_value = model.predict(scaler.transform(X_test_reduced_dim[44].reshape(1, -1)))[0][0]

print(f"actual value: {df.loc[123, 'Valuation']}")
print(f"predic value: {predicted_value}")




ValueError: X has 52 features, but StandardScaler is expecting 111 features as input.