#### Imports & Constants

In [None]:
import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append(pathlib.Path("..").resolve().as_posix())

from src.coder import TargetCoder

# Input CSVs are addressed relative to the working directory (one level up).
CLEANED_DATA_PATH = pathlib.Path("..", "data", "cleaned", "cleaned_data.csv")
PROCESS_DATA_PATH = pathlib.Path("..", "data", "processed", "processed_data.csv")

# Load both datasets once; the cells below read these module-level frames.
cleaned_df, processed_df = (
    pd.read_csv(csv_path) for csv_path in (CLEANED_DATA_PATH, PROCESS_DATA_PATH)
)

##### Sample

In [None]:
# Report the shape of the cleaned dataset.
print(f"Cleaned data shape: {cleaned_df.shape}")
# Preview the first rows (cell output); values not normalized and positions not mapped
cleaned_df.head()

In [None]:
# Report the shape of the processed dataset.
print(f"Processed data shape: {processed_df.shape}")
# Preview the first rows (cell output); values normalized and positions mapped
processed_df.head()

##### Height & Weight (Analysis & Visualization)

In [None]:
def get_height_weight_statistics_df() -> pd.DataFrame:
    """Summarise height and weight per position.

    Groups ``cleaned_df`` by ``position`` and computes the mean, standard
    deviation, minimum and maximum of ``height_cm`` and ``weight_kg``.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order);
        a position absent from the data yields a row of NaN.
    """
    statistics = ["mean", "std", "min", "max"]
    per_position = cleaned_df.groupby('position')[["height_cm", "weight_kg"]].agg(statistics)
    # Row order follows the decoder's values rather than the groupby key order.
    position_order = TargetCoder._TARGET_DECODER.values()
    return per_position.reindex(position_order)
    
    
def scatterplot_height_weight_by_position() -> None:
    """Scatter height against weight with one colour per position.

    Point coordinates come from ``cleaned_df``; point colours come from the
    ``position`` column of ``processed_df``. A colorbar labelled with the
    position names is attached and the figure is shown.
    """
    # Tick labels: the distinct positions, ordered by descending encoded value.
    # NOTE(review): ticks 0..n-1 are labelled in *descending* encode order; if
    # processed_df["position"] holds ascending codes starting at 0 the labels
    # would be flipped — confirm against TargetCoder.
    labels = sorted(
        cleaned_df["position"].unique().tolist(),
        key=TargetCoder.encode,
        reverse=True,
    )

    plt.figure(figsize=(12, 8))
    # NOTE(review): assumes cleaned_df and processed_df hold the same rows in
    # the same order, since x/y and colours are read from different frames.
    points = plt.scatter(
        cleaned_df["height_cm"],
        cleaned_df["weight_kg"],
        c=processed_df["position"],
        cmap="viridis",
        alpha=0.5,
    )

    plt.title("Height vs Weight by Position")
    plt.xlabel("Height (cm)")
    plt.ylabel("Weight (kg)")
    plt.grid(alpha=0.5)

    colorbar = plt.colorbar(points, label="Position", ticks=range(len(labels)))
    colorbar.set_ticklabels(labels)

    plt.show()

# Show the scatter plot first, then leave the statistics table as the cell output.
scatterplot_height_weight_by_position()
get_height_weight_statistics_df()

##### Preferred Foot (Analysis & Visualization)

In [None]:
def get_foot_crosstab() -> pd.DataFrame:
    """Tabulate, per position, the percentage of players with each preferred foot.

    Returns:
        Rows are ``preferred_foot`` values, columns are positions; each column
        is expressed as percentages of that column's total, rounded to two
        decimals. Columns are ordered by ``TargetCoder.encode`` of their label.
    """
    counts = pd.crosstab(cleaned_df['preferred_foot'], cleaned_df['position'])
    # Normalise every position column by its own total.
    shares = (counts / counts.sum(axis=0) * 100).round(2)
    return shares.sort_index(axis=1, key=lambda labels: labels.map(TargetCoder.encode))

def visualize_foot_crosstab(foot_crosstab: pd.DataFrame) -> None:
    """Draw grouped bars of left/right preferred-foot percentages per position.

    Args:
        foot_crosstab: Table with rows labelled ``'Left'`` and ``'Right'`` and
            one column per position.
    """
    positions = foot_crosstab.columns.tolist()
    x = np.arange(len(positions))
    bar_width = 0.4
    half_width = bar_width / 2

    _, ax = plt.subplots(figsize=(12, 8))

    # Two bars per position: left foot shifted left of the tick, right foot right.
    bars = (
        (-half_width, 'Left', 'Left foot'),
        (half_width, 'Right', 'Right foot'),
    )
    for shift, row_label, legend_label in bars:
        ax.bar(x + shift, foot_crosstab.loc[row_label], bar_width, label=legend_label)

    ax.set_xticks(x)
    ax.set_xticklabels(positions)
    ax.set_ylabel('Percentage')
    ax.set_title('Preferred Foot by Position')
    ax.legend()

    plt.show()

# Build the preferred-foot table, plot it, then display it as the cell output.
foot_crosstab = get_foot_crosstab()
visualize_foot_crosstab(foot_crosstab)
foot_crosstab

##### Weak Foot (Analysis & Visualization)

In [None]:
def get_weak_foot_crossstab() -> pd.DataFrame:
    """Tabulate, per position, the percentage of players with each weak-foot value.

    Returns:
        Rows are ``weak_foot`` values, columns are positions; each column is
        expressed as percentages of that column's total, rounded to two
        decimals. Columns are ordered by ``TargetCoder.encode`` of their label.
    """
    counts = pd.crosstab(cleaned_df['weak_foot'], cleaned_df['position'])
    # Normalise every position column by its own total.
    shares = (counts / counts.sum(axis=0) * 100).round(2)
    return shares.sort_index(axis=1, key=lambda labels: labels.map(TargetCoder.encode))

def position_weak_foot_heatmap(weak_foot_crosstab: pd.DataFrame) -> None:
    """Render the weak-foot-by-position table as an annotated heatmap.

    Args:
        weak_foot_crosstab: Table with one column per position; cell values
            are printed with two decimals.
    """
    _, ax = plt.subplots(figsize=(12, 8))

    # Draw on the axes created above (it is also the current axes).
    sns.heatmap(weak_foot_crosstab, annot=True, fmt=".2f", cmap="viridis", ax=ax)

    ax.set_xticklabels(weak_foot_crosstab.columns.tolist())
    ax.set(ylabel='Weak Foot', title='Weak Foot by Position')

    plt.show()

# Build the weak-foot percentage table and plot it. The variable is named for
# its contents so it is not confused with the work-rate table built later.
weak_foot_crosstab = get_weak_foot_crossstab()
position_weak_foot_heatmap(weak_foot_crosstab)

##### Skill Moves (Analysis & Visualization)

In [None]:
def get_skill_moves_crossstab() -> pd.DataFrame:
    """Tabulate, per position, the percentage of players with each skill-moves value.

    Returns:
        Rows are ``skill_moves`` values, columns are positions; each column is
        expressed as percentages of that column's total, rounded to two
        decimals. Columns are ordered by ``TargetCoder.encode`` of their label.
    """
    counts = pd.crosstab(cleaned_df['skill_moves'], cleaned_df['position'])
    # Normalise every position column by its own total.
    shares = (counts / counts.sum(axis=0) * 100).round(2)
    return shares.sort_index(axis=1, key=lambda labels: labels.map(TargetCoder.encode))

def position_skill_moves_heatmap(skill_moves_crosstab: pd.DataFrame) -> None:
    """Render the skill-moves-by-position table as an annotated heatmap.

    Args:
        skill_moves_crosstab: Table with one column per position; cell values
            are printed with two decimals.
    """
    _, ax = plt.subplots(figsize=(12, 8))

    # Draw on the axes created above (it is also the current axes).
    sns.heatmap(skill_moves_crosstab, annot=True, fmt=".2f", cmap="viridis", ax=ax)

    ax.set_xticklabels(skill_moves_crosstab.columns.tolist())
    ax.set(ylabel='Skill Moves', title='Skill Moves by Position')

    plt.show()

# Build the skill-moves percentage table and plot it as a heatmap.
skill_moves_crosstab = get_skill_moves_crossstab()
position_skill_moves_heatmap(skill_moves_crosstab)

##### Pace (Analysis & Visualization)

In [None]:
def visualize_pace_by_position_boxplot() -> None:
    """Draw one box of ``pace`` values per position from ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='pace')
    ax.set_title('Player Pace by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Pace')
    plt.show()

def visualize_pace_by_position_distplot() -> None:
    """Show the ``pace`` distribution of ``cleaned_df`` as a stacked histogram.

    Bars are split and coloured by position, with a KDE curve overlaid.
    """
    grid = sns.displot(
        data=cleaned_df,
        x='pace',
        hue='position',
        multiple='stack',
        kde=True,
        palette='muted',
        height=8,
        aspect=1,
    )
    # displot builds its own figure; label the single facet it created.
    grid.ax.set_title('Player Pace by Position')
    grid.ax.set_xlabel('Pace')
    grid.ax.set_ylabel('Count')
    plt.show()

def get_pace_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``pace`` for every position.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order),
        holding the ``describe`` output for the ``pace`` column of
        ``cleaned_df``.
    """
    pace_by_position = cleaned_df.groupby('position')[["pace"]]
    summary = pace_by_position.agg(["describe"])
    # Row order follows the decoder's values rather than the groupby key order.
    return summary.reindex(TargetCoder._TARGET_DECODER.values())

# Draw both pace plots, then compute the per-position summary table.
visualize_pace_by_position_boxplot()
visualize_pace_by_position_distplot()
analysis = get_pace_by_position_analysis()
# Bare trailing expression so the table is shown as the cell output.
analysis

##### Shooting (Analysis & Visualization)

In [None]:
def visualize_shooting_by_position_boxplot() -> None:
    """Draw one box of ``shooting`` values per position from ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='shooting')
    ax.set_title('Player Shooting by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Shooting')
    plt.show()

def visualize_shooting_by_position_distplot() -> None:
    """Show the ``shooting`` distribution of ``cleaned_df`` as a stacked histogram.

    Bars are split and coloured by position, with a KDE curve overlaid.
    """
    grid = sns.displot(
        data=cleaned_df,
        x='shooting',
        hue='position',
        multiple='stack',
        kde=True,
        palette='muted',
        height=8,
        aspect=1,
    )
    # displot builds its own figure; label the single facet it created.
    grid.ax.set_title('Player Shooting by Position')
    grid.ax.set_xlabel('Shooting')
    grid.ax.set_ylabel('Count')
    plt.show()

def get_shooting_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``shooting`` for every position.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order),
        holding the ``describe`` output for the ``shooting`` column of
        ``cleaned_df``.
    """
    shooting_by_position = cleaned_df.groupby('position')[["shooting"]]
    summary = shooting_by_position.agg(["describe"])
    # Row order follows the decoder's values rather than the groupby key order.
    return summary.reindex(TargetCoder._TARGET_DECODER.values())

# Draw both shooting plots, then compute the per-position summary table.
visualize_shooting_by_position_boxplot()
visualize_shooting_by_position_distplot()
analysis = get_shooting_by_position_analysis()
# Bare trailing expression so the table is shown as the cell output.
analysis

##### Passing (Analysis & Visualization)

In [None]:
def visualize_passing_by_position_boxplot() -> None:
    """Draw one box of ``passing`` values per position from ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='passing')
    ax.set_title('Player passing by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Passing')
    plt.show()

def visualize_passing_by_position_distplot() -> None:
    """Show the ``passing`` distribution of ``cleaned_df`` as a stacked histogram.

    Bars are split and coloured by position, with a KDE curve overlaid.
    """
    grid = sns.displot(
        data=cleaned_df,
        x='passing',
        hue='position',
        multiple='stack',
        kde=True,
        palette='muted',
        height=8,
        aspect=1,
    )
    # displot builds its own figure; label the single facet it created.
    grid.ax.set_title('Player Passing by Position')
    grid.ax.set_xlabel('Passing')
    grid.ax.set_ylabel('Count')
    plt.show()

def get_passing_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``passing`` for every position.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order),
        holding the ``describe`` output for the ``passing`` column of
        ``cleaned_df``.
    """
    passing_by_position = cleaned_df.groupby('position')[["passing"]]
    summary = passing_by_position.agg(["describe"])
    # Row order follows the decoder's values rather than the groupby key order.
    return summary.reindex(TargetCoder._TARGET_DECODER.values())

# Draw both passing plots, then compute the per-position summary table.
visualize_passing_by_position_boxplot()
visualize_passing_by_position_distplot()
analysis = get_passing_by_position_analysis()
# Bare trailing expression so the table is shown as the cell output.
analysis

##### Dribbling (Analysis & Visualization)

In [None]:
def visualize_dribbling_by_position_boxplot() -> None:
    """Draw one box of ``dribbling`` values per position from ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='dribbling')
    ax.set_title('Player dribbling by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Dribbling')
    plt.show()
    
def visualize_dribbling_by_position_distplot() -> None:
    """Show the ``dribbling`` distribution of ``cleaned_df`` as a stacked histogram.

    Bars are split and coloured by position, with a KDE curve overlaid.
    """
    grid = sns.displot(
        data=cleaned_df,
        x='dribbling',
        hue='position',
        multiple='stack',
        kde=True,
        palette='muted',
        height=8,
        aspect=1,
    )
    # displot builds its own figure; label the single facet it created.
    grid.ax.set_title('Player Dribbling by Position')
    grid.ax.set_xlabel('Dribbling')
    grid.ax.set_ylabel('Count')
    plt.show()

def get_dribbling_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``dribbling`` for every position.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order),
        holding the ``describe`` output for the ``dribbling`` column of
        ``cleaned_df``.
    """
    dribbling_by_position = cleaned_df.groupby('position')[["dribbling"]]
    summary = dribbling_by_position.agg(["describe"])
    # Row order follows the decoder's values rather than the groupby key order.
    return summary.reindex(TargetCoder._TARGET_DECODER.values())
# Draw both dribbling plots, then compute the per-position summary table.
visualize_dribbling_by_position_boxplot()
visualize_dribbling_by_position_distplot()
analysis = get_dribbling_by_position_analysis()
# Bare trailing expression so the table is shown as the cell output.
analysis

##### Defending (Analysis & Visualization)

In [None]:
def visualize_defending_by_position_boxplot() -> None:
    """Draw one box of ``defending`` values per position from ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='defending')
    ax.set_title('Player defending by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Defending')
    plt.show()
    
def visualize_defending_by_position_distplot() -> None:
    """Show the ``defending`` distribution of ``cleaned_df`` split by position.

    Histogram bars are coloured by position with a KDE curve overlaid; no
    ``multiple`` or ``palette`` argument is passed, so seaborn's defaults apply.
    """
    grid = sns.displot(
        data=cleaned_df,
        x='defending',
        hue='position',
        kde=True,
        height=8,
        aspect=1,
    )
    # displot builds its own figure; label the single facet it created.
    grid.ax.set_title('Player Defending by Position')
    grid.ax.set_xlabel('Defending')
    grid.ax.set_ylabel('Count')
    plt.show()
    
def get_defending_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``defending`` for every position.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order),
        holding the ``describe`` output for the ``defending`` column of
        ``cleaned_df``.
    """
    defending_by_position = cleaned_df.groupby('position')[["defending"]]
    summary = defending_by_position.agg(["describe"])
    # Row order follows the decoder's values rather than the groupby key order.
    return summary.reindex(TargetCoder._TARGET_DECODER.values())

# Draw both defending plots, then compute the per-position summary table.
visualize_defending_by_position_boxplot()
visualize_defending_by_position_distplot()
analysis = get_defending_by_position_analysis()
# Bare trailing expression so the table is shown as the cell output.
analysis

##### Physic (Analysis & Visualization)

In [None]:
def visualize_physic_by_position_boxplot() -> None:
    """Draw one box of ``physic`` values per position from ``cleaned_df``."""
    ax = sns.boxplot(data=cleaned_df, x='position', y='physic')
    ax.set_title('Player physic by Position')
    ax.set_xlabel('Position')
    ax.set_ylabel('Physic')
    plt.show()

def visualize_physic_by_position_distplot() -> None:
    """Show the ``physic`` distribution of ``cleaned_df`` as a stacked histogram.

    Bars are split and coloured by position, with a KDE curve overlaid.
    """
    grid = sns.displot(
        data=cleaned_df,
        x='physic',
        hue='position',
        multiple='stack',
        kde=True,
        palette='muted',
        height=8,
        aspect=1,
    )
    # displot builds its own figure; label the single facet it created.
    grid.ax.set_title('Player Physic by Position')
    grid.ax.set_xlabel('Physic')
    grid.ax.set_ylabel('Count')
    plt.show()

def get_physic_by_position_analysis() -> pd.DataFrame:
    """Return ``describe`` statistics of ``physic`` for every position.

    Returns:
        One row per value of ``TargetCoder._TARGET_DECODER`` (in that order),
        holding the ``describe`` output for the ``physic`` column of
        ``cleaned_df``.
    """
    physic_by_position = cleaned_df.groupby('position')[["physic"]]
    summary = physic_by_position.agg(["describe"])
    # Row order follows the decoder's values rather than the groupby key order.
    return summary.reindex(TargetCoder._TARGET_DECODER.values())

# Draw both physic plots, then compute the per-position summary table.
visualize_physic_by_position_boxplot()
visualize_physic_by_position_distplot()
analysis = get_physic_by_position_analysis()
# Bare trailing expression so the table is shown as the cell output.
analysis

##### Sample Data

In [None]:
# Sample up to 1000 rows from each position. Sampling is capped at the group
# size so a position with fewer than 1000 rows contributes all of its rows
# instead of raising. Groups are iterated explicitly (rather than through
# groupby.apply) so the 'position' column is always kept in the result.
sample_df = pd.concat(
    [
        group.sample(n=min(len(group), 1000), random_state=42)
        for _, group in cleaned_df.groupby('position')
    ],
    ignore_index=True,
)
# shuffle the rows so positions are interleaved
sample_df = sample_df.sample(frac=1, random_state=42).reset_index(drop=True)

##### Attacking Attributes (Analysis & Visualization)

In [None]:
# Columns used by the attacking-attributes pairplot and correlation matrix below.
attacking_attr = ['shooting', 'passing', 'attacking_finishing', 'attacking_volleys', 'attacking_heading_accuracy', 'attacking_crossing']

def visualize_attacking_attributes_by_pairplot() -> None:
    """Plot pairwise relationships of the attacking attributes, coloured by position.

    Reads the ``attacking_attr`` columns plus ``position`` from ``sample_df``.
    """
    # pairplot is figure-level: it creates its own figure sized by height and
    # aspect, so no plt.figure() call precedes it (that would only leave an
    # extra empty figure in the output).
    sns.pairplot(
        sample_df[attacking_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis',
    )
    plt.show()

def get_attacking_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the attacking attributes.

    Returns:
        Correlation matrix between the ``attacking_attr`` columns, computed
        over their mean values per position in ``cleaned_df``.
    """
    position_means = cleaned_df.groupby('position')[attacking_attr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then show the matrix as
# the cell output.
corr_df = get_attacking_attributes_by_positions_corr()
visualize_attacking_attributes_by_pairplot()
corr_df

##### Skill Attributes (Analysis & Visualization)

In [None]:
# Columns used by the skill-attributes pairplot and correlation matrix below.
skill_attr = ['dribbling', 'passing', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_ball_control']

def visualize_skill_attributes_by_pairplot() -> None:
    """Plot pairwise relationships of the skill attributes, coloured by position.

    Reads the ``skill_attr`` columns plus ``position`` from ``sample_df``.
    """
    # pairplot is figure-level: it creates its own figure sized by height and
    # aspect, so no plt.figure() call precedes it (that would only leave an
    # extra empty figure in the output).
    sns.pairplot(
        sample_df[skill_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis',
    )
    plt.show()

def get_skill_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the skill attributes.

    Returns:
        Correlation matrix between the ``skill_attr`` columns, computed over
        their mean values per position in ``cleaned_df``.
    """
    position_means = cleaned_df.groupby('position')[skill_attr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then show the matrix as
# the cell output.
corr_df = get_skill_attributes_by_positions_corr()
visualize_skill_attributes_by_pairplot()
corr_df 

##### Movement Attributes (Analysis & Visualization)

In [None]:
# Columns used by the movement-attributes pairplot and correlation matrix below.
movement_attr = ['dribbling',  'physic', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance']

def visualize_movement_attributes_by_pairplot() -> None:
    """Plot pairwise relationships of the movement attributes, coloured by position.

    Reads the ``movement_attr`` columns plus ``position`` from ``sample_df``.
    """
    # pairplot is figure-level: it creates its own figure sized by height and
    # aspect, so no plt.figure() call precedes it (that would only leave an
    # extra empty figure in the output).
    sns.pairplot(
        sample_df[movement_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis',
    )
    plt.show()

def get_movement_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the movement attributes.

    Returns:
        Correlation matrix between the ``movement_attr`` columns, computed
        over their mean values per position in ``cleaned_df``.
    """
    position_means = cleaned_df.groupby('position')[movement_attr].mean()
    return position_means.corr()
# Compute the correlation matrix, draw the pairplot, then show the matrix as
# the cell output.
corr_df = get_movement_attributes_by_positions_corr()
visualize_movement_attributes_by_pairplot()
corr_df

##### Mentality Attributes (Analysis & Visualization)

In [None]:
# Columns used by the mentality-attributes pairplot and correlation matrix below.
mentality_attr = ['dribbling', 'passing', 'shooting', 'physic', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties']

def visualize_mentality_attributes_by_pairplot() -> None:
    """Plot pairwise relationships of the mentality attributes, coloured by position.

    Reads the ``mentality_attr`` columns plus ``position`` from ``sample_df``.
    """
    # pairplot is figure-level: it creates its own figure sized by height and
    # aspect, so no plt.figure() call precedes it (that would only leave an
    # extra empty figure in the output).
    sns.pairplot(
        sample_df[mentality_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis',
    )
    plt.show()

def get_mentality_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the mentality attributes.

    Returns:
        Correlation matrix between the ``mentality_attr`` columns, computed
        over their mean values per position in ``cleaned_df``.
    """
    position_means = cleaned_df.groupby('position')[mentality_attr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then show the matrix as
# the cell output.
corr_df = get_mentality_attributes_by_positions_corr()
visualize_mentality_attributes_by_pairplot()
corr_df

##### Defending Attributes (Analysis & Visualization)

In [None]:
# Columns used by the defending-attributes pairplot and correlation matrix below.
# NOTE(review): the name is misspelled ("defenging"); it is read by both
# functions in this cell, so any rename must touch all three places.
defenging_attr = ['defending', 'defending_marking_awareness', 'defending_standing_tackle', 'defending_sliding_tackle']

def visualize_defending_attributes_by_pairplot() -> None:
    """Plot pairwise relationships of the defending attributes, coloured by position.

    Reads the ``defenging_attr`` columns plus ``position`` from ``sample_df``.
    """
    # pairplot is figure-level: it creates its own figure sized by height and
    # aspect, so no plt.figure() call precedes it (that would only leave an
    # extra empty figure in the output).
    sns.pairplot(
        sample_df[defenging_attr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis',
    )
    plt.show()
    
def get_defending_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the defending attributes.

    Returns:
        Correlation matrix between the ``defenging_attr`` columns, computed
        over their mean values per position in ``cleaned_df``.
    """
    position_means = cleaned_df.groupby('position')[defenging_attr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then show the matrix as
# the cell output.
corr_df = get_defending_attributes_by_positions_corr()
visualize_defending_attributes_by_pairplot()
corr_df

##### Power Attributes (Analysis & Visualization)

In [None]:
# Columns used by the power-attributes pairplot and correlation matrix below.
# NOTE(review): the name is spelled "power_atr" (single t), unlike the other
# *_attr lists; it is read by both functions in this cell.
power_atr = ['physic', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots']

def visualize_power_attributes_by_pairplot() -> None:
    """Plot pairwise relationships of the power attributes, coloured by position.

    Reads the ``power_atr`` columns plus ``position`` from ``sample_df``.
    """
    # pairplot is figure-level: it creates its own figure sized by height and
    # aspect, so no plt.figure() call precedes it (that would only leave an
    # extra empty figure in the output).
    sns.pairplot(
        sample_df[power_atr + ['position']],
        hue='position',
        height=3,
        aspect=1,
        palette='viridis',
    )
    plt.show()
    
def get_power_attributes_by_positions_corr() -> pd.DataFrame:
    """Correlate the per-position means of the power attributes.

    Returns:
        Correlation matrix between the ``power_atr`` columns, computed over
        their mean values per position in ``cleaned_df``.
    """
    position_means = cleaned_df.groupby('position')[power_atr].mean()
    return position_means.corr()

# Compute the correlation matrix, draw the pairplot, then show the matrix as
# the cell output.
corr_df = get_power_attributes_by_positions_corr()
visualize_power_attributes_by_pairplot()
corr_df

##### Work Rate Attributes (Analysis & Visualization)

In [None]:
def get_work_rate_crossstab() -> pd.DataFrame:
    """Tabulate, per position, the percentage of players with each work rate.

    Returns:
        Rows are the nine ``work_rate`` combinations in a fixed High-to-Low
        order, columns are positions; each column is expressed as percentages
        of that column's total, rounded to two decimals. Columns are ordered
        by ``TargetCoder.encode`` of their label. A combination absent from
        the data yields a row of NaN.
    """
    counts = pd.crosstab(cleaned_df['work_rate'], cleaned_df['position'])
    # Normalise every position column by its own total.
    shares = (counts / counts.sum(axis=0) * 100).round(2)
    shares = shares.sort_index(axis=1, key=lambda labels: labels.map(TargetCoder.encode))

    # Fixed presentation order for the row labels.
    row_order = [
        'High/High', 'High/Medium', 'High/Low',
        'Medium/High', 'Medium/Medium', 'Medium/Low',
        'Low/High', 'Low/Medium', 'Low/Low',
    ]
    return shares.reindex(index=row_order)

def position_work_rate_heatmap(work_rate_crosstab: pd.DataFrame) -> None:
    """Render the work-rate-by-position table as an annotated heatmap.

    Args:
        work_rate_crosstab: Table with one column per position; cell values
            are printed with two decimals.
    """
    _, ax = plt.subplots(figsize=(12, 8))

    # Draw on the axes created above (it is also the current axes).
    sns.heatmap(work_rate_crosstab, annot=True, fmt=".2f", cmap="viridis", ax=ax)

    ax.set_xticklabels(work_rate_crosstab.columns.tolist())
    ax.set(ylabel='Work Rate', title='Work Rate by Position')

    plt.show()

# Build the work-rate percentage table and plot it as a heatmap.
work_rate_crosstab = get_work_rate_crossstab()
position_work_rate_heatmap(work_rate_crosstab)