#### Imports & Constants

In [None]:
import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(pathlib.Path("..").resolve().as_posix())

from src.coder import TargetCoder

# Locations of the two input datasets, relative to the notebook's working directory.
CLEANED_DATA_PATH = pathlib.Path("..", "data", "cleaned", "cleaned_data.csv")
PROCESS_DATA_PATH = pathlib.Path("..", "data", "processed", "processed_data.csv")

# Load both datasets once; every analysis cell below reads these module-level frames.
cleaned_df = pd.read_csv(CLEANED_DATA_PATH)
processed_df = pd.read_csv(PROCESS_DATA_PATH)

##### Sample

In [None]:
# Report the shape of the cleaned data, then preview its first rows
# (the bare `head()` expression is the cell's displayed value).
print(f"Cleaned data shape: {cleaned_df.shape}")
cleaned_df.head()

In [None]:
# Report the shape of the processed data, then preview its first rows
# (the bare `head()` expression is the cell's displayed value).
print(f"Processed data shape: {processed_df.shape}")
processed_df.head()

##### Height & Weight (Analysis & Visualization)

In [None]:
def get_height_weight_statistics_df() -> pd.DataFrame:
    """Summarize height and weight per position.

    Groups ``cleaned_df`` by ``position`` and computes the mean, standard
    deviation, minimum and maximum of ``height_cm`` and ``weight_kg``.

    Returns:
        A DataFrame indexed by position, with rows ordered like the values
        of ``TargetCoder._TARGET_DECODER``. A decoder value with no matching
        group yields a row of NaN (standard ``reindex`` behavior).
    """
    # The intermediate groupby objects are GroupBy instances, not DataFrames,
    # so they are chained here instead of being annotated as pd.DataFrame.
    stats_df: pd.DataFrame = (
        cleaned_df.groupby('position')[["height_cm", "weight_kg"]]
        .agg(["mean", "std", "min", "max"])
    )
    # Reorder the rows to follow the order of TargetCoder._TARGET_DECODER's values.
    return stats_df.reindex(list(TargetCoder._TARGET_DECODER.values()))
    
    
def scatterplot_height_weight_by_position() -> None:
    """Scatter height against weight, colored by position.

    Point coordinates come from ``cleaned_df``; point colors come from
    ``processed_df["position"]``.

    NOTE(review): this assumes ``cleaned_df`` and ``processed_df`` are
    row-aligned and that ``processed_df["position"]`` holds the values
    produced by ``TargetCoder.encode`` -- confirm against the preprocessing
    step.
    """
    plt.figure(figsize=(12, 8))

    plt.scatter(
        cleaned_df["height_cm"],
        cleaned_df["weight_kg"],
        c=processed_df["position"],
        alpha=0.5,
        cmap="viridis",
    )

    plt.xlabel("Height (cm)")
    plt.ylabel("Weight (kg)")
    plt.title("Height vs Weight by Position")

    plt.grid(alpha=0.5)

    # Put each tick at the position's own encoded value and label it with
    # that position. Tick labels are assigned to ticks in order, so pairing
    # range(n) ticks with a reverse-sorted label list (as before) attached
    # every label to the wrong color.
    positions: list = sorted(
        cleaned_df["position"].unique().tolist(), key=TargetCoder.encode
    )
    tick_values = [TargetCoder.encode(position) for position in positions]

    cbar = plt.colorbar(label="Position", ticks=tick_values)
    cbar.set_ticklabels(positions)

    plt.show()

# Draw the scatter plot first; the statistics table is the cell's final
# expression, so it is the value the notebook displays after the figure.
scatterplot_height_weight_by_position()
get_height_weight_statistics_df()

##### Preferred Foot (Analysis & Visualization)

In [None]:
def get_foot_crosstab() -> pd.DataFrame:
    """Return the preferred-foot split within each position, in percent.

    Rows are ``preferred_foot`` values and columns are positions taken from
    ``cleaned_df``. Each column sums to roughly 100 (values are rounded to
    two decimals). Columns are sorted by ``TargetCoder.encode`` of the
    position label.
    """
    counts = pd.crosstab(cleaned_df['preferred_foot'], cleaned_df['position'])
    # Normalize every position column by its own total to get percentages.
    per_position_totals = counts.sum(axis=0)
    percentages = (counts.div(per_position_totals, axis=1) * 100).round(2)
    # Order the position columns by their encoded value.
    return percentages.sort_index(
        axis=1,
        key=lambda labels: labels.map(TargetCoder.encode),
    )

def visualize_foot_crosstab(foot_crosstab: pd.DataFrame) -> None:
    """Draw a grouped bar chart of left/right foot percentages per position.

    Args:
        foot_crosstab: Table whose columns are positions and whose index
            contains the rows ``'Left'`` and ``'Right'``.
    """
    positions = foot_crosstab.columns.tolist()
    x = np.arange(len(positions))
    _, ax = plt.subplots(figsize=(12, 8))

    # One bar pair per position: the left-foot bar sits 0.2 to the left of
    # the tick and the right-foot bar 0.2 to the right, each 0.4 wide.
    bar_specs = (
        (-0.2, 'Left', 'Left foot'),
        (0.2, 'Right', 'Right foot'),
    )
    for shift, row_label, legend_label in bar_specs:
        ax.bar(x + shift, foot_crosstab.loc[row_label], 0.4, label=legend_label)

    ax.set_xticks(x)
    ax.set_xticklabels(positions)
    ax.set_ylabel('Percentage')
    ax.set_title('Preferred Foot by Position')
    ax.legend()

    plt.show()

# Build the percentage table once, plot it, then leave it as the cell's
# final expression so the notebook displays the table after the chart.
foot_crosstab = get_foot_crosstab()
visualize_foot_crosstab(foot_crosstab)
foot_crosstab

##### Weak Foot (Analysis & Visualization)

In [None]:
def weak_foot_by_position_boxplot() -> None:
    """Show a boxplot of ``weak_foot`` for each position in ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='weak_foot')
    ax.set_xlabel('Position')
    ax.set_ylabel('Weak Foot')
    ax.set_title('Weak Foot by Position')
    plt.show()

def get_weak_foot_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``weak_foot`` per position.

    Rows follow the order of ``TargetCoder._TARGET_DECODER``'s values.
    """
    position_order = TargetCoder._TARGET_DECODER.values()
    summary = cleaned_df.groupby('position')[["weak_foot"]].agg(["describe"])
    # Reorder the rows to follow the decoder's position order.
    return summary.reindex(position_order)
    
    
# Plot first, then compute the summary table and leave it as the cell's
# final expression so the notebook displays it after the figure.
weak_foot_by_position_boxplot()
analysis = get_weak_foot_by_position_analysis()
analysis

##### Skill Moves (Analysis & Visualization)

In [None]:
def skill_moves_by_position_boxplot() -> None:
    """Show a boxplot of ``skill_moves`` for each position in ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='skill_moves')
    ax.set_xlabel('Position')
    ax.set_ylabel('Skill Moves')
    ax.set_title('Skill Moves by Position')
    plt.show()

def skill_moves_by_position_distplot() -> None:
    """Show a stacked histogram (with KDE) of ``skill_moves``, split by position."""
    displot_options = {
        'x': 'skill_moves',
        'hue': 'position',
        'data': cleaned_df,
        'kde': True,
        'height': 8,
        'aspect': 1,
        'multiple': 'stack',
        'palette': 'viridis',
    }
    sns.displot(**displot_options)

    # Label the axes that displot just created, then render.
    plt.xlabel('Skill Moves')
    plt.ylabel('Count')
    plt.title('Skill Moves by Position')
    plt.show()

def get_skill_moves_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``skill_moves`` per position.

    Rows follow the order of ``TargetCoder._TARGET_DECODER``'s values.
    """
    position_order = TargetCoder._TARGET_DECODER.values()
    summary = cleaned_df.groupby('position')[["skill_moves"]].agg(["describe"])
    # Reorder the rows to follow the decoder's position order.
    return summary.reindex(position_order)

# Render both plots, then compute the summary table and leave it as the
# cell's final expression so the notebook displays it after the figures.
skill_moves_by_position_boxplot()
skill_moves_by_position_distplot()
analysis = get_skill_moves_by_position_analysis()
analysis

##### Pace (Analysis & Visualization)

In [None]:
def visualize_pace_by_position_boxplot() -> None:
    """Show a boxplot of ``pace`` for each position in ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='pace')
    ax.set_title('Player Pace by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Pace')
    plt.show()

def visualize_pace_by_position_distplot() -> None:
    """Show a stacked histogram (with KDE) of ``pace``, split by position."""
    displot_options = {
        'x': 'pace',
        'hue': 'position',
        'data': cleaned_df,
        'kde': True,
        'height': 8,
        'aspect': 1,
        'multiple': 'stack',
        'palette': 'viridis',
    }
    sns.displot(**displot_options)

    # Label the axes that displot just created, then render.
    plt.title('Player Pace by Position')
    plt.xlabel('Pace')
    plt.ylabel('Count')
    plt.show()

def get_pace_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``pace`` per position.

    Rows follow the order of ``TargetCoder._TARGET_DECODER``'s values.
    """
    position_order = TargetCoder._TARGET_DECODER.values()
    summary = cleaned_df.groupby('position')[["pace"]].agg(["describe"])
    # Reorder the rows to follow the decoder's position order.
    return summary.reindex(position_order)

# Render both plots, then compute the summary table and leave it as the
# cell's final expression so the notebook displays it after the figures.
visualize_pace_by_position_boxplot()
visualize_pace_by_position_distplot()
analysis = get_pace_by_position_analysis()
analysis

##### Sample Data

In [None]:
# Draw a reproducible, balanced sample: up to 1000 rows from each position.
# Iterating over the groups (instead of GroupBy.apply) guarantees that every
# sampled frame still carries its 'position' column, which the pairplots
# below need for `hue`. Capping n at the group size avoids the ValueError
# that DataFrame.sample raises when a position has fewer than 1000 rows.
sample_df = pd.concat(
    [
        group.sample(n=min(len(group), 1000), random_state=42)
        for _, group in cleaned_df.groupby('position')
    ],
    ignore_index=True,
)
# Shuffle the rows so positions are interleaved rather than stored in blocks.
sample_df = sample_df.sample(frac=1, random_state=42).reset_index(drop=True)

##### Attacking Attributes (Analysis & Visualization)

In [None]:
# Columns that make up the "attacking" attribute group.
attacking_attr = [
    'attacking_crossing',
    'attacking_finishing',
    'attacking_heading_accuracy',
    'attacking_short_passing',
    'attacking_volleys',
]

def visualize_attacking_attributes_by_pairplot() -> None:
    """Show a pairplot of the attacking attributes, colored by position.

    Uses ``sample_df`` (the per-position sample) restricted to the columns
    in ``attacking_attr`` plus ``position``.
    """
    # sns.pairplot is a figure-level function that creates its own figure.
    # A preceding plt.figure(...) call only opens a second, empty figure
    # that gets shown as well, so none is created here; the panel size is
    # controlled through `height` and `aspect`.
    sns.pairplot(
        sample_df[attacking_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis'
    )
    plt.show()

def get_attacking_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the attacking attributes.

    The means are computed over ``cleaned_df`` (the full dataset, not
    ``sample_df``), giving one row per position. The returned matrix is the
    pairwise correlation between attribute columns of that table of means.
    """
    position_means = cleaned_df.groupby('position')[attacking_attr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then leave the matrix
# as the cell's final expression so the notebook displays it after the figure.
corr_df = get_attacking_attributes_by_positions_corr()
    
visualize_attacking_attributes_by_pairplot()
corr_df

##### Defending Attributes (Analysis & Visualization)

In [None]:
# Columns that make up the "defending" attribute group.
# NOTE(review): the variable name is misspelled ("defenging"); it is kept
# as-is because other cells reference it by this name.
defenging_attr = [
    'defending_marking_awareness',
    'defending_standing_tackle',
    'defending_sliding_tackle',
]

def visualize_defending_attributes_by_pairplot() -> None:
    """Show a pairplot of the defending attributes, colored by position.

    Uses ``sample_df`` (the per-position sample) restricted to the columns
    in ``defenging_attr`` plus ``position``.
    """
    # sns.pairplot is a figure-level function that creates its own figure.
    # A preceding plt.figure(...) call only opens a second, empty figure
    # that gets shown as well, so none is created here; the panel size is
    # controlled through `height` and `aspect`.
    sns.pairplot(
        sample_df[defenging_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis'
    )
    plt.show()
    
def get_defending_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the defending attributes.

    The means are computed over ``cleaned_df`` (the full dataset, not
    ``sample_df``), giving one row per position. The returned matrix is the
    pairwise correlation between attribute columns of that table of means.
    """
    position_means = cleaned_df.groupby('position')[defenging_attr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then leave the matrix
# as the cell's final expression so the notebook displays it after the figure.
corr_df = get_defending_attributes_by_positions_corr()
visualize_defending_attributes_by_pairplot()
corr_df

##### Power Attributes (Analysis & Visualization)

In [None]:
# Columns that make up the "power" attribute group.
power_atr = [
    'power_shot_power',
    'power_jumping',
    'power_stamina',
    'power_strength',
    'power_long_shots',
]

def visualize_power_attributes_by_pairplot() -> None:
    """Show a pairplot of the power attributes, colored by position.

    Uses ``sample_df`` (the per-position sample) restricted to the columns
    in ``power_atr`` plus ``position``.
    """
    # sns.pairplot is a figure-level function that creates its own figure.
    # A preceding plt.figure(...) call only opens a second, empty figure
    # that gets shown as well, so none is created here; the panel size is
    # controlled through `height` and `aspect`.
    sns.pairplot(
        sample_df[power_atr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis'
    )
    plt.show()
    
def get_power_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the power attributes.

    The means are computed over ``cleaned_df`` (the full dataset, not
    ``sample_df``), giving one row per position. The returned matrix is the
    pairwise correlation between attribute columns of that table of means.
    """
    position_means = cleaned_df.groupby('position')[power_atr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then leave the matrix
# as the cell's final expression so the notebook displays it after the figure.
corr_df = get_power_attributes_by_positions_corr()
visualize_power_attributes_by_pairplot()
corr_df