# 06 - Construction of Pipeline with everything

In [325]:
import pandas as pd
import numpy as np
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin

In [326]:
# Read the concatenated match data, then discard every row that has a missing value.
df = pd.read_csv(os.path.join('..', 'data', 'processed', 'all_concat_football_data.csv'))
df = df.dropna()

# Preview the first rows.
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Season
0,13/08/05,Aston Villa,Bolton,2.0,2.0,D,2.0,2.0,D,M Riley,...,7.0,8.0,0.0,2.0,0.0,0.0,2.3,3.25,3.0,2005
1,13/08/05,Everton,Man United,0.0,2.0,A,0.0,1.0,A,G Poll,...,8.0,6.0,3.0,1.0,0.0,0.0,5.0,3.4,1.72,2005
2,13/08/05,Fulham,Birmingham,0.0,0.0,D,0.0,0.0,D,R Styles,...,6.0,6.0,1.0,2.0,0.0,0.0,2.37,3.25,2.87,2005
3,13/08/05,Man City,West Brom,0.0,0.0,D,0.0,0.0,D,C Foy,...,3.0,6.0,2.0,3.0,0.0,0.0,1.72,3.4,5.0,2005
4,13/08/05,Middlesbrough,Liverpool,0.0,0.0,D,0.0,0.0,D,M Halsey,...,5.0,0.0,2.0,3.0,1.0,0.0,2.87,3.2,2.4,2005


In [327]:
# Reload the same CSV so this cell can be run on its own; rows with any
# missing value are discarded.
df = pd.read_csv(os.path.join('..', 'data', 'processed', 'all_concat_football_data.csv'))
df = df.dropna()

# Preview call kept from the original cell.
df.head()

class TargetEncoder(BaseEstimator, TransformerMixin):
    """Translate the result labels held in one column into integer codes.

    Parameters
    ----------
    target_column : str, default 'FTR'
        Name of the column that holds the labels.
    mapping : dict or None, default None
        Label -> integer lookup. When None, ``{'H': 0, 'D': 1, 'A': 2}``
        is used.
    """

    def __init__(self, target_column='FTR', mapping=None):
        self.target_column = target_column
        if mapping is None:
            mapping = {'H': 0, 'D': 1, 'A': 2}
        self.mapping = mapping

    def fit(self, X, y=None):
        """Learn nothing; return the encoder itself."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame with the mapped target column.

        Works on a copy, so the caller's frame is left untouched. The
        lookup goes through ``Series.map`` with ``self.mapping``.
        """
        encoded = X.copy()
        column = self.target_column
        encoded[column] = encoded[column].map(self.mapping)
        return encoded[[column]]

class FixColumnsTeamsReferees(BaseEstimator, TransformerMixin):
    """Normalise column names and the team / referee text columns.

    ``transform`` works on a copy of the input and:

    1. lower-cases every column name and replaces spaces with underscores;
    2. renames the short statistic codes (``fthg``, ``hs``, ``b365h`` ...)
       to the descriptive names listed in ``_COLUMN_RENAMES``;
    3. lower-cases team names and strips apostrophes from them;
    4. lower-cases referee names and replaces spaces with underscores.

    The team and referee columns are only touched when present.
    """

    # Lower-cased source column name -> descriptive column name.
    _COLUMN_RENAMES = {
        'hometeam': 'home_team',
        'awayteam': 'away_team',
        'fthg': 'home_total_goals',
        'ftag': 'away_total_goals',
        'hthg': 'home_half_goals',
        'htag': 'away_half_goals',
        'htr': 'half_time_result',
        'hs': 'home_total_shots',
        'as': 'away_total_shots',
        'hst': 'home_shots_on_target',
        'ast': 'away_shots_on_target',
        'hf': 'home_fouls',
        'af': 'away_fouls',
        'hc': 'home_corners',
        'ac': 'away_corners',
        'hy': 'home_yellow_cards',
        'ay': 'away_yellow_cards',
        'hr': 'home_red_cards',
        'ar': 'away_red_cards',
        'b365h': 'market_home_odds',
        'b365d': 'market_draw_odds',
        'b365a': 'market_away_odds',
    }

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X`` (the input frame is not modified)."""
        df = X.copy()

        # Lowercase all columns and replace spaces with underscores
        df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

        # Rename the short statistic codes to descriptive names
        df = df.rename(columns=self._COLUMN_RENAMES)

        # Team names: lowercase and drop apostrophes only (spaces are kept)
        for col in ('home_team', 'away_team'):
            if col in df.columns:
                df[col] = df[col].str.lower().str.replace("'", "", regex=False)

        # Referee names: lowercase and turn spaces into underscores.
        # This must go through the ``.str`` accessor: ``Series.replace(' ', '_')``
        # only replaces cells whose whole value is ' ', so it never changed
        # the spaces inside a name.
        if 'referee' in df.columns:
            df['referee'] = (
                df['referee'].str.lower().str.replace(' ', '_', regex=False)
            )

        return df

class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Derive the modelling features from cleaned match rows.

    The input is expected to carry the descriptive column names used below
    (``home_team``, ``home_total_shots``, ``market_home_odds`` ...) together
    with ``date`` and ``season``.

    ``fit`` only stores ``y``; ``transform`` uses it as the full-time result
    to derive per-match points. The output is a DataFrame restricted to the
    two team columns, the two cumulative-points columns and nine rolling
    ratio averages (see ``columns_to_keep`` in ``transform``).

    Parameters
    ----------
    target_column : str, default 'FTR'
        Stored on the instance; not read by ``fit`` or ``transform``.

    Notes
    -----
    NOTE(review): points are derived by comparing the stored results with the
    strings 'H', 'D' and 'A'. If ``fit`` receives numerically encoded labels,
    every match scores 0 points and both cumulative-points columns are 0.

    NOTE(review): the stored results are aligned to the rows of ``X`` by
    index when they are a Series/DataFrame, so rows whose index was not seen
    in ``fit`` get no result and contribute 0 points.
    """

    # Window sizes (in matches) of the rolling averages.
    _ROLLING_WINDOWS = (3, 5)
    # ``dayofweek`` takes 7 distinct values (0-6).
    _DAYS_IN_WEEK = 7.0
    _MONTHS_IN_YEAR = 12.0

    def __init__(self, target_column='FTR'):
        self.target_column = target_column

    def fit(self, X, y=None):
        """Remember ``y`` so that ``transform`` can derive match points."""
        self.y = y
        return self

    def transform(self, X):
        """Return the engineered feature frame for ``X`` (``X`` is not modified)."""
        df = X.copy()

        # Attach the full-time result remembered by fit(). A one-column
        # DataFrame is reduced to its Series so the assignment aligns on index.
        results = getattr(self, 'y', None)
        if results is not None:
            if isinstance(results, pd.DataFrame):
                results = results.iloc[:, 0]
            df['full_time_result'] = results
        elif 'full_time_result' not in df.columns:
            # No result available: every match then scores 0 points.
            df['full_time_result'] = np.nan

        # Points per match: 3 for a win, 1 for a draw, 0 otherwise.
        outcome = df['full_time_result']
        is_draw = outcome == 'D'
        df['home_points'] = np.where(outcome == 'H', 3, np.where(is_draw, 1, 0))
        df['away_points'] = np.where(outcome == 'A', 3, np.where(is_draw, 1, 0))

        # Goal Difference
        df['goal_difference'] = df['home_total_goals'] - df['away_total_goals']

        # Aggregated Match Statistics
        df['total_shots'] = df['home_total_shots'] + df['away_total_shots']
        df['total_shots_on_target'] = df['home_shots_on_target'] + df['away_shots_on_target']
        df['total_fouls'] = df['home_fouls'] + df['away_fouls']
        df['total_corners'] = df['home_corners'] + df['away_corners']
        # A zero denominator is replaced by 1 to avoid dividing by zero.
        df['home_shot_accuracy'] = df['home_shots_on_target'] / df['home_total_shots'].replace(0, 1)
        # Fixed: accuracy is shots on target per shot taken; the denominator
        # used to be away_total_goals.
        df['away_shot_accuracy'] = df['away_shots_on_target'] / df['away_total_shots'].replace(0, 1)

        # Time-Based Features: dates come with two-digit or four-digit years;
        # try the two-digit format first and fall back to the four-digit one.
        raw_dates = df['date']
        short_year = pd.to_datetime(raw_dates, format='%d/%m/%y', errors='coerce')
        long_year = pd.to_datetime(raw_dates, format='%d/%m/%Y', errors='coerce')
        df['date'] = short_year.combine_first(long_year)
        df['day_of_week'] = df['date'].dt.dayofweek
        df['month'] = df['date'].dt.month
        # Cyclical encoding. Fixed: the weekday period is 7, not 6; with 6,
        # day 0 and day 6 were mapped to the same point.
        weekday_angle = 2 * np.pi * df['day_of_week'] / self._DAYS_IN_WEEK
        month_angle = 2 * np.pi * df['month'] / self._MONTHS_IN_YEAR
        df['day_of_week_sin'] = np.sin(weekday_angle)
        df['day_of_week_cos'] = np.cos(weekday_angle)
        df['month_sin'] = np.sin(month_angle)
        df['month_cos'] = np.cos(month_angle)

        # Team-Based Features
        df['ratio_h_a_shots'] = df['home_total_shots'] / df['away_total_shots'].replace(0, 1)
        df['ratio_h_a_fouls'] = df['home_fouls'] / df['away_fouls'].replace(0, 1)
        df['ratio_a_h_shots'] = df['away_total_shots'] / df['home_total_shots'].replace(0, 1)
        df['ratio_a_h_fouls'] = df['away_fouls'] / df['home_fouls'].replace(0, 1)

        # Betting Odds-Based Features: inverse odds, normalised to sum to 1.
        df['implied_home_win_prob'] = 1 / df['market_home_odds']
        df['implied_draw_prob'] = 1 / df['market_draw_odds']
        df['implied_away_win_prob'] = 1 / df['market_away_odds']
        total_prob = df['implied_home_win_prob'] + df['implied_draw_prob'] + df['implied_away_win_prob']
        df['implied_home_win_prob'] /= total_prob
        df['implied_draw_prob'] /= total_prob
        df['implied_away_win_prob'] /= total_prob

        # Chronological per-team views, built once and reused for every
        # rolling average and for the cumulative points.
        home_keys = ['season', 'home_team']
        away_keys = ['season', 'away_team']
        home_sorted = df.sort_values(home_keys + ['date'])
        away_sorted = df.sort_values(away_keys + ['date'])
        home_groups = home_sorted.groupby(home_keys)
        away_groups = away_sorted.groupby(away_keys)

        # Rolling Averages over the previous `window` matches of the same team
        # in the same season; shift(1) keeps the current match out of its own
        # average, and matches without a full window get 0.
        features = ['home_total_goals', 'away_total_goals', 'home_total_shots', 'away_total_shots',
                    'home_shots_on_target', 'away_shots_on_target', 'home_fouls', 'away_fouls',
                    'home_corners', 'away_corners', 'home_yellow_cards', 'away_yellow_cards',
                    'home_red_cards', 'away_red_cards', 'home_shot_accuracy', 'away_shot_accuracy',
                    'ratio_h_a_shots', 'ratio_h_a_fouls', 'ratio_a_h_shots',
                    'ratio_a_h_fouls', 'goal_difference']
        new_columns = []
        for window in self._ROLLING_WINDOWS:
            def prior_mean(values, window=window):
                return values.shift(1).rolling(window=window).mean()

            for feature in features:
                home_rolling = home_groups[feature].transform(prior_mean).fillna(0)
                away_rolling = away_groups[feature].transform(prior_mean).fillna(0)
                new_columns.append(home_rolling.rename(f'home_roll_{window}_avg_{feature}'))
                new_columns.append(away_rolling.rename(f'away_roll_{window}_avg_{feature}'))

        # Cumulative points collected before each match, in date order.
        # Fixed: the running total used to follow the frame's row order and
        # to include the current match, i.e. the result being predicted.
        home_points_before = home_groups['home_points'].cumsum() - home_sorted['home_points']
        away_points_before = away_groups['away_points'].cumsum() - away_sorted['away_points']
        new_columns.append(home_points_before.rename('home_cumulative_points'))
        new_columns.append(away_points_before.rename('away_cumulative_points'))

        # All new columns are joined back on the row index.
        df = pd.concat([df] + new_columns, axis=1)

        # Columns to keep for modeling
        columns_to_keep = ['home_team', 'away_team', 'away_cumulative_points', 'home_cumulative_points',
                           'home_roll_5_avg_ratio_h_a_shots', 'home_roll_3_avg_ratio_a_h_shots',
                           'away_roll_3_avg_ratio_h_a_shots', 'away_roll_3_avg_ratio_a_h_shots',
                           'home_roll_3_avg_ratio_h_a_shots', 'home_roll_3_avg_ratio_h_a_fouls',
                           'away_roll_3_avg_ratio_h_a_fouls', 'away_roll_3_avg_ratio_a_h_fouls',
                           'home_roll_3_avg_ratio_a_h_fouls']
        return df[columns_to_keep]


# Column groups for the preprocessor: team names are one-hot encoded,
# everything else is standardised.
categorical_features = ['home_team', 'away_team']

numerical_features = [
    'away_cumulative_points',
    'home_cumulative_points',
    'home_roll_5_avg_ratio_h_a_shots',
    'home_roll_3_avg_ratio_a_h_shots',
    'away_roll_3_avg_ratio_h_a_shots',
    'away_roll_3_avg_ratio_a_h_shots',
    'home_roll_3_avg_ratio_h_a_shots',
    'home_roll_3_avg_ratio_h_a_fouls',
    'away_roll_3_avg_ratio_h_a_fouls',
    'away_roll_3_avg_ratio_a_h_fouls',
    'home_roll_3_avg_ratio_a_h_fouls',
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Cleaning followed by feature engineering (FeatureEngineering keeps its
# default target_column).
full_pipeline = Pipeline(steps=[
    ('fix_columns', FixColumnsTeamsReferees()),
    ('feature_engineering', FeatureEngineering()),
])

# Hyper-parameters for the XGBoost classifier
xgboost_params = {
    'alpha': 0.6947849330397046,
    'colsample_bytree': 0.9641403517045772,
    'gamma': 0.12487080962675866,
    'lambda': 0.295633685837714,
    'learning_rate': 0.06582413897454059,
    'max_depth': 6,
    'n_estimators': 127,
    'subsample': 0.76553213116505,
}
best_model = XGBClassifier(**xgboost_params)

# Label encoder for the FTR column; it is applied to y outside the pipeline.
target_encoder = TargetEncoder(target_column='FTR')

# End-to-end estimator: cleaning + features -> scaling / encoding -> classifier
pipeline = Pipeline(steps=[
    ('full_pipeline', full_pipeline),
    ('preprocessor', preprocessor),
    ('model', best_model),
])

# Separate the target from the remaining columns
y = df['FTR']
X = df.drop(columns=['FTR'])

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Encode both target splits; the encoder takes a DataFrame, so each Series
# is converted to a one-column frame first.
y_train_encoded = target_encoder.fit_transform(y_train.to_frame(), y_train)
y_test_encoded = target_encoder.transform(y_test.to_frame())

In [328]:
# Evaluate the pipeline object as the cell's expression; nothing is trained
# here — the `pipeline.fit` call further down does the training.
pipeline

In [329]:
# Fit all pipeline steps on the training split. The labels passed in are the
# targets that were already encoded outside the pipeline.
pipeline.fit(X_train, y_train_encoded)

In [331]:
# Predict class labels for the held-out rows with the fitted pipeline.
predictions = pipeline.predict(X_test)

In [332]:
# Evaluate the predicted labels as the cell's expression.
predictions

array([0, 0, 0, ..., 0, 2, 0])

In [333]:
# Evaluate the model: per-class precision, recall and F1 of the predictions
# against the encoded test labels.
from sklearn.metrics import classification_report

print(classification_report(y_test_encoded, predictions))


              precision    recall  f1-score   support

           0       0.55      0.82      0.66       673
           1       0.30      0.06      0.10       354
           2       0.50      0.47      0.49       437

    accuracy                           0.53      1464
   macro avg       0.45      0.45      0.41      1464
weighted avg       0.48      0.53      0.47      1464

