## <center>SOCCER PLAYER GOALS CONSIDERING ASSISTING PLAYER PREDICTION REGRESSION ANALYSIS<center>

In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [32]:

# Load dataset
df_raw = pd.read_csv('501_PROJECT_DATASET.csv')

# Remove rows where 'GOAL_SCORER' or 'GOAL_SCORER_TEAM_NAME' is null
df_raw = df_raw.dropna(subset=['GOAL_SCORER', 'GOAL_SCORER_TEAM_NAME'])

# Drop unnecessary columns
df_cleaned = df_raw.drop(columns=['DURATION', 'SEASON', 'GOAL_DESC', 'DATE_TIME', 'FOOT', 'POSITION', 
                                  'HEIGHT', 'WEIGHT', 'ASSISTS'])

df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2049 entries, 0 to 2337
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   HOME_TEAM              2049 non-null   object
 1   AWAY_TEAM              2049 non-null   object
 2   STADIUM                2049 non-null   object
 3   GOAL_SCORER            2049 non-null   object
 4   GOAL_SCORER_TEAM_NAME  2049 non-null   object
 5   ASSIST_PLAYER          1466 non-null   object
 6   GOALS                  2049 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 128.1+ KB


In [33]:
# Categorical columns for one-hot encoding
categorical_cols = ['HOME_TEAM', 'AWAY_TEAM', 'STADIUM', 'GOAL_SCORER', 'GOAL_SCORER_TEAM_NAME', 'ASSIST_PLAYER']

# Load dataset and preprocessing as before...

# Separate features and target variables for both 'GOALS' and 'ASSISTS'
X = df_cleaned.drop(columns=['GOALS'])  # Features
y = df_cleaned[['GOALS']]  # Target variables for both 'GOALS' and 'ASSISTS'

# Split the data into training and testing sets for both features and targets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define a preprocessing pipeline for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

In [34]:
# Create a dictionary of regressors for different algorithms
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Random Forest Regression': RandomForestRegressor(),
    'SVR': SVR(),
    'XGBoost Regression': XGBRegressor()
}

# Fit and evaluate each regressor with MultiOutputRegressor
for name, regressor in regressors.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    mse_goals = mean_squared_error(y_test, predictions)
    print(f"{name} MSE for GOALS: {mse_goals}")


Linear Regression MSE for GOALS: 0.7933908076231897
Ridge Regression MSE for GOALS: 0.4377225044956613
Lasso Regression MSE for GOALS: 0.7304365574384571
ElasticNet Regression MSE for GOALS: 0.7304365574384571


  return fit_method(estimator, *args, **kwargs)


Random Forest Regression MSE for GOALS: 0.24492926856368563


  y = column_or_1d(y, warn=True)


SVR MSE for GOALS: 0.4175880936174297
XGBoost Regression MSE for GOALS: 0.25569687086514586


In [37]:


# Define the parameter grid to search
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__max_depth': [5, 9, 10],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

# Create a pipeline with preprocessing and XGBoost Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

# Apply RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Predict on the test set with the best model
best_model = random_search.best_estimator_
predictions = best_model.predict(X_test)

# Calculate Mean Squared Error (MSE) for both goals and assists
mse_goals = mean_squared_error(y_test, predictions)
print(f"Tuned XGBoost Regression MSE for GOALS: {mse_goals}")

Best Parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 10, 'regressor__learning_rate': 0.2}
Tuned XGBoost Regression MSE for GOALS: 0.23793436798774734


In [50]:

# Use the best parameters obtained from the hyperparameter tuning
best_params = {
    'n_estimators': 245,
    'max_depth': 10,
    'learning_rate': 0.2
}

# Create an XGBoost regressor with the best parameters
best_model = XGBRegressor(**best_params)

# Create a pipeline with preprocessing and XGBoost Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', best_model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test set
predictions = pipeline.predict(X_test)

# Calculate Mean Squared Error (MSE) for both goals and assists
mse_goals = mean_squared_error(y_test, predictions)
print(f"Tuned XGBoost Regression MSE for GOALS: {mse_goals}")


Tuned XGBoost Regression MSE for GOALS: 0.23514275600220003


In [53]:
# xgboost is much faster than the random forest 
# hence we are finalising our model and choosing xgboost for better mse and faster prediction

# Sample data for a single row (adjust according to your actual data)
input_data = {
    'HOME_TEAM': 'Real Madrid',
    'AWAY_TEAM': 'SSC Napoli',
    'STADIUM': 'Santiago Bernabéu',
    'GOAL_SCORER': 'Jude Bellingham',
    'GOAL_SCORER_TEAM_NAME': 'Real Madrid',
    'ASSIST_PLAYER': 'David Alaba'
}

# Create a DataFrame from the single row data
input_df = pd.DataFrame([input_data])

# Use the trained pipeline to preprocess the single row data and make predictions
predictions = pipeline.predict(input_df)

print(f"Predicted Goals : {predictions}")



Predicted Goals : [1.0277627]
