In [55]:
import pandas as pd
import numpy as np

In [56]:
# Raise pandas' display limits (rows, columns, total width) for notebook output.
DISPLAY_OPTIONS = {
    'display.max_rows': 200,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [57]:
# Load the curated players-profile Parquet data from this S3 prefix.
df_players_profile = pd.read_parquet(
    path="s3://tech-challenge-3-curated/PlayersProfile/"
)

In [58]:
def _has_brazilian_citizenship(citizenship) -> bool:
    """Return True when "Brazil" is contained in a player's citizenship value.

    Missing values (None or a float NaN) are treated as "not Brazilian"
    instead of raising TypeError from the `in` operator.
    """
    if citizenship is None or isinstance(citizenship, float):
        return False
    return "Brazil" in citizenship


# Keep players that (1) have Brazilian citizenship, (2) are not retired and
# (3) have a current club, then keep only the columns used in the cells below.
# The three conditions are combined into one mask so the frame is sliced once.
is_brazilian = (
    df_players_profile["citizenship"].apply(_has_brazilian_citizenship).astype(bool)
)
is_active = df_players_profile["is_retired"].eq(False)  # same comparison as `== False`
has_club = df_players_profile["actual_club_id"].notna()
df_players_profile = df_players_profile.loc[
    is_brazilian & is_active & has_club,
    ["player_id", "age", "main_position", "foot", "actual_club_id", "second_position"],
]

In [59]:
from sklearn.preprocessing import OneHotEncoder

In [60]:
# Discard players whose `foot` value is missing.
df_players_profile = df_players_profile.dropna(subset=["foot"])

In [61]:
# Main positions that are kept; players with any other main position are dropped.
ALLOWED_MAIN_POSITIONS = [
    "Centre-Forward",
    "Attacking Midfield",
    "Central Midfield",
    "Second Striker",
    "Right Winger",
    "Left Winger",
]
main_position_allowed = df_players_profile["main_position"].isin(ALLOWED_MAIN_POSITIONS)
df_players_profile = df_players_profile.loc[main_position_allowed]

In [62]:
# Rebuild a 0..n-1 index after the row filters above, discarding the old labels,
# so the frame can be column-concatenated with the one-hot encoded frame further down.
df_players_profile = df_players_profile.reset_index(drop=True)

In [63]:
# Single encoder instance; it is re-fitted (fit_transform) for each column
# encoded in the cells below, so it only ever holds the most recent fit.
encoder = OneHotEncoder()

In [64]:
# Fit the encoder on the `foot` column, then convert the result to a dense array.
foot_one_hot = encoder.fit_transform(df_players_profile[["foot"]])
encoded_data = foot_one_hot.toarray()

In [65]:
# Wrap the encoded `foot` matrix in a DataFrame whose columns are the encoder's
# generated feature names. The index is taken from the source frame explicitly
# so that the column-wise concat in the next cell lines rows up by label rather
# than depending on an earlier reset_index having produced a default index.
df_players_profile_encode = pd.DataFrame(
    encoded_data,
    index=df_players_profile.index,
    columns=encoder.get_feature_names_out(["foot"]),
)

In [66]:
# Swap the raw `foot` column for its one-hot columns (appended on the right).
profile_without_foot = df_players_profile.drop(columns=['foot'])
df_players_profile = pd.concat(
    [profile_without_foot, df_players_profile_encode],
    axis=1,
)

In [67]:
def _empty_list_if_none(positions):
    """Return `positions` unchanged, or an empty list when it is None."""
    if positions is None:
        return []
    return positions


# Start `player_position` from the secondary positions, with None replaced by [].
df_players_profile['player_position'] = df_players_profile['second_position'].apply(
    _empty_list_if_none
)

In [68]:
def _main_then_secondary(row):
    """Return one array holding the row's main position followed by its `player_position` values."""
    return np.append([row['main_position']], row["player_position"])


# Put the main position in front of the secondary positions, row by row.
df_players_profile['player_position'] = df_players_profile.apply(
    _main_then_secondary, axis=1
)

In [69]:
# Both source columns are now folded into `player_position`; remove them.
df_players_profile = df_players_profile.drop(
    ["main_position", "second_position"],
    axis="columns",
)

In [70]:
# Turn each element of the `player_position` array into its own row; the other
# columns (and the index label) are repeated for every position of a player.
df_players_profile = df_players_profile.explode("player_position")

In [71]:
# explode() repeats index labels, so rebuild a unique 0..n-1 index before the
# encoded position columns are concatenated column-wise below.
df_players_profile = df_players_profile.reset_index(drop=True)

In [72]:
# Re-fit the shared encoder on `player_position`, then convert the result to a dense array.
position_one_hot = encoder.fit_transform(df_players_profile[["player_position"]])
encoded_data = position_one_hot.toarray()

In [73]:
# Wrap the encoded `player_position` matrix in a DataFrame whose columns are
# the encoder's generated feature names. The index is copied from the source
# frame explicitly so the following column-wise concat matches rows by label
# instead of relying implicitly on a default index.
df_players_profile_encode = pd.DataFrame(
    encoded_data,
    index=df_players_profile.index,
    columns=encoder.get_feature_names_out(["player_position"]),
)

In [74]:
# Swap the raw `player_position` column for its one-hot columns (appended on the right).
profile_without_position = df_players_profile.drop(columns=['player_position'])
df_players_profile = pd.concat(
    [profile_without_position, df_players_profile_encode],
    axis=1,
)

In [75]:
# Collapse the exploded rows back to one row per player: the per-player
# attributes are the group keys and every remaining (one-hot position) column
# is summed within the group.
# The foot columns are discovered from the frame instead of being hard-coded,
# so this cell no longer raises KeyError when a foot category (e.g. "both")
# does not occur in the data. Column order is preserved, so the key order is
# the same as before whenever all three categories are present.
# NOTE(review): groupby drops rows whose key is NaN by default — confirm that
# players with a missing `age` are meant to be excluded here.
foot_columns = [col for col in df_players_profile.columns if col.startswith("foot_")]
df_players_profile = (
    df_players_profile.groupby(
        ["player_id", "age", "actual_club_id", *foot_columns]
    )
    .sum()
    .reset_index()
)

In [76]:
# Normalise column names: spaces and hyphens become underscores, then lowercase.
df_players_profile.columns = (
    df_players_profile.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.lower()
)

In [77]:
# Store `age` as an integer column.
df_players_profile = df_players_profile.astype({"age": int})

In [78]:
# Display the final feature frame as the cell output (pandas truncates it
# according to the display options set at the top of the notebook).
df_players_profile

Unnamed: 0,player_id,age,actual_club_id,foot_both,foot_left,foot_right,player_position_attacking_midfield,player_position_central_midfield,player_position_centre_back,player_position_centre_forward,player_position_defensive_midfield,player_position_left_midfield,player_position_left_winger,player_position_left_back,player_position_right_midfield,player_position_right_winger,player_position_right_back,player_position_second_striker
0,100091,34,7178,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1003451,22,967,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1003457,20,15063,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1004064,21,35499,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1004065,24,3876,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1601,997491,21,3197,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1602,997718,22,11086,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1603,997868,22,3348,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1604,99900,34,515,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [79]:
# Persist the pre-processed feature frame as a single Parquet file on S3.
df_players_profile.to_parquet(
    path="s3://tech-challenge-3-models/pre-processing/PlayersProfile/players_profile.parquet"
)