# Data Engineering

## OneHotEncoder
We use a one-hot encoder to transform the 'position' column, which is not usable in its raw form, into numerical data that can be used in our model.
We achieve that by splitting the column into several new columns (one for each possible position value): for a given player, the column matching that player's position takes the value 1, and all other position columns take the value 0.

In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# Load the workbook at `file_path` into the working DataFrame `df`.
file_path = '../data/cleaned.xlsx'
df = pd.read_excel(io=file_path)

## Create Composite Features

In [17]:
# Normalise the column labels: every run of whitespace collapses to a single
# space, then leading/trailing whitespace is trimmed.
normalised_labels = df.columns.str.replace(r'\s+', ' ', regex=True)
df.columns = normalised_labels.str.strip()

# Offensive_Index = shots + dribbles + key passes (per-game columns),
# rounded to one decimal place.
offensive_inputs = ['Shots per game', 'Dribbles per game', 'Key passes per game']
shots, dribbles, key_passes = (df[name] for name in offensive_inputs)
df['Offensive_Index'] = shots.add(dribbles).add(key_passes).round(1)

# The source columns are removed in place once folded into the index.
df.drop(columns=offensive_inputs, inplace=True)

In [18]:
# Passing_Index = passes per game scaled by the pass success percentage
# (divided by 100), rounded to one decimal place.
# Presumably this approximates completed passes per game; confirm the
# percentage column is on a 0-100 scale.
passing_inputs = ['Passes per game', 'Pass success percentage']
passes, success_pct = (df[name] for name in passing_inputs)
df['Passing_Index'] = passes.mul(success_pct).div(100).round(1)

# Remove the source columns in place now that they are combined.
df.drop(columns=passing_inputs, inplace=True)

In [19]:
# Possession_Risk = dispossessions + bad controls (per-game columns),
# rounded to one decimal place.
risk_inputs = ['Dispossessed per game', 'Bad control per game']
dispossessed, bad_control = (df[name] for name in risk_inputs)
df['Possession_Risk'] = dispossessed.add(bad_control).round(1)

# Remove the source columns in place now that they are combined.
df.drop(columns=risk_inputs, inplace=True)

In [21]:
# Defensive_Index = tackles + interceptions + clearances + aerials won,
# rounded to one decimal place.
defensive_inputs = [
    'Tackles',
    'Interceptions per game',
    'Clearances per game',
    'Aerials Won per game',
]
tackles, interceptions, clearances, aerials_won = (
    df[name] for name in defensive_inputs
)
df['Defensive_Index'] = (
    tackles.add(interceptions).add(clearances).add(aerials_won).round(1)
)

# Remove the source columns in place now that they are combined.
df.drop(columns=defensive_inputs, inplace=True)

In [22]:
# Creativity_Index = assists + through balls per game + crosses,
# rounded to one decimal place.
creativity_inputs = ['Assists', 'Through balls per game', 'Crosses']
assists, through_balls, crosses = (df[name] for name in creativity_inputs)
df['Creativity_Index'] = assists.add(through_balls).add(crosses).round(1)

# Remove the source columns in place now that they are combined.
df.drop(columns=creativity_inputs, inplace=True)

In [23]:
# Attacking_Efficiency = goals / (appearances + 1), rounded to one decimal.
# The +1 in the denominator presumably guards against zero appearances.
# NOTE(review): one decimal place is coarse for a ratio like this -
# confirm the precision is intentional.
appearances_plus_one = df['Apps'].add(1)
df['Attacking_Efficiency'] = df['Goals'].div(appearances_plus_one).round(1)

# Remove the source columns in place now that they are combined.
efficiency_inputs = ['Goals', 'Apps']
df.drop(columns=efficiency_inputs, inplace=True)

In [24]:
# Discipline_Score = fouls + yellow cards + 2 * red cards, rounded to one
# decimal place. Red cards carry double weight, so a larger score reflects
# more infractions.
red_weighted = 2 * df['Red']
df['Discipline_Score'] = df['Fouls'].add(df['Yel']).add(red_weighted).round(1)

# Remove the source columns in place now that they are combined.
discipline_inputs = ['Fouls', 'Yel', 'Red']
df.drop(columns=discipline_inputs, inplace=True)

In [25]:
# Write the engineered feature table to a new workbook, omitting the
# DataFrame index from the output.
featured_path = '../data/featured.xlsx'
df.to_excel(featured_path, index=False)