#### Imports & Constants

In [341]:
import pathlib
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Shared min-max scaler instance; normalize_df() fits it on the feature columns.
scaler = MinMaxScaler()
# NOTE(review): `le` is not referenced in the visible part of the notebook — confirm it is still needed.
le = LabelEncoder()

# Widen pandas' display limits so wide/long frames are not truncated when shown.
pd.set_option('display.max_columns', 110) # 110 is the number of columns in the dataset
pd.set_option('display.max_rows', 1000)

# Location of the raw players CSV, relative to the notebook's working directory.
DATASET_PATH = pathlib.Path("..").joinpath("data", "raw", "players.csv")

# Integer (positional) indices of the CSV columns that are loaded.
MAIN_COLUMNS = [7, 14, 15, 31, 32, 33, 35, *range(41, 47)]
DETAILED_COLUMNS = [*range(47, 76)]
USED_COLUMNS = [*MAIN_COLUMNS, *DETAILED_COLUMNS]

# Column positions (in the processed dataframe) that normalize_df() scales.
COLS_TO_NORMALIZE = range(38)

class TargetCoder():
    """
    Converts player position codes to integer class labels and back.

    A detailed position code (e.g. "CF", "LWB") is first collapsed into one of
    seven main positions, which is then mapped to an integer in the range 0-6.
    "GK" has no entry in the mapping, so goalkeeper rows must be removed before
    encoding (otherwise a KeyError is raised).
    """

    # Detailed position code -> main position it is grouped under.
    _POSITIONS_MAPPER = {
        "ST": "ST", "CF": "ST",
        "LW": "LW", "LF": "LW", "LM": "LW",
        "RW": "RW", "RM": "RW", "RF": "RW",
        "CM": "CM", "CAM": "CM", "CDM": "CM",
        "RB": "RB", "RWB": "RB",
        "LB": "LB", "LWB": "LB",
        "CB": "CB"
    }

    # Main position -> integer class label.
    _TARGET_ENCODER = {
        "ST": 0, "LW": 1, "RW": 2,
        "CM": 3, "RB": 4, "LB": 5, "CB": 6
    }

    # Integer class label -> main position. Derived from the encoder so the two
    # tables cannot drift apart when a position is added or renumbered.
    _TARGET_DECODER = {code: name for name, code in _TARGET_ENCODER.items()}

    @staticmethod
    def encode(target: str) -> int:
        """
        Return the integer label of the main position that `target` belongs to.

        Raises:
            KeyError: if `target` is not a known position code.
        """
        return TargetCoder._TARGET_ENCODER[TargetCoder.map(target)]

    @staticmethod
    def decode(target: int) -> str:
        """
        Return the main position name for the integer label `target`.

        Raises:
            KeyError: if `target` is not a known label.
        """
        return TargetCoder._TARGET_DECODER[target]

    @staticmethod
    def map(target: str) -> str:
        """
        Return the main position that the detailed position code `target` is grouped under.

        Surrounding whitespace is ignored, so tokens produced by splitting a
        string such as "RW, ST" on "," can be passed directly.

        Raises:
            KeyError: if `target` is not a known position code.
        """
        # Non-string input (e.g. NaN) is looked up unchanged and raises KeyError as before.
        key = target.strip() if isinstance(target, str) else target
        return TargetCoder._POSITIONS_MAPPER[key]
    

# Sanity check: show the combined list of column indices that will be loaded.
print(USED_COLUMNS)

[7, 14, 15, 31, 32, 33, 35, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75]


#### Load Data 

In [342]:
# Load only the columns listed in USED_COLUMNS (integer entries select columns by position).
df = pd.read_csv(DATASET_PATH, usecols=USED_COLUMNS)

##### Drop rows and columns

In [343]:
def drop_goalkeepers(df: pd.DataFrame) -> None:
    """
    Remove every goalkeeper row from the dataframe (modifies it inplace).

    A row is treated as a goalkeeper when its "player_positions" text contains "GK".
    """
    is_keeper = df["player_positions"].str.contains("GK")
    keeper_labels = df[is_keeper].index
    df.drop(keeper_labels, inplace=True)
    
def drop_mentality_composure(df: pd.DataFrame) -> None:
    """
    Remove the "mentality_composure" column from the dataframe (modifies it inplace).
    Reason (as noted by the author):
        - The column has 20k+ NaNs
    """
    df.drop(labels="mentality_composure", axis="columns", inplace=True)

### Process positions

In [344]:
def process_positions(df: pd.DataFrame) -> None:
    """
    Turns the "player_positions" column into an integer "position" column inplace
    Effect:
        - Keeps only the first position listed in the comma-separated string
        - Encodes it to its integer label with TargetCoder.encode
        - Removes "player_positions" and re-adds the result under the name
          "position", which places it at the end of the dataframe
    """
    # first entry of the comma-separated list is taken as the main position
    primary = df["player_positions"].str.split(",").str[0]
    df["player_positions"] = primary.map(TargetCoder.encode)
    # pop + assign under the new name moves the column to the end
    df["position"] = df.pop("player_positions")

##### One hot encode work rate & preferred_foot

In [345]:
# RUN ONCE PER LOADED FRAME: THE RESULT NO LONGER HAS THE work_rate / preferred_foot
# COLUMNS, SO ENCODING IT A SECOND TIME FAILS
def get_encoded_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a new dataframe with the categorical columns one hot encoded
    (the dataframe passed in is not modified)
    Effect:
        - "work_rate" is replaced by one indicator column per work rate value
        - "preferred_foot" is replaced by one indicator column per foot
    """
    categorical_columns = ['work_rate', 'preferred_foot']
    return pd.get_dummies(df, columns=categorical_columns)


##### Normalize Columns

In [346]:
def normalize_df(df: pd.DataFrame) -> None:
    """
    Min-max scales the feature columns of the dataframe inplace
    Effect:
        - The columns at the positions given by COLS_TO_NORMALIZE are replaced
          by their scaled (float) values
        - The module-level scaler is fitted on those columns
    """
    cols = df.columns[list(COLS_TO_NORMALIZE)]
    # Assign by column label so each column is replaced as a whole. Writing the
    # float result through `df.iloc[:, ...] = ...` sets values into the existing
    # (possibly integer) columns, which pandas >= 2.1 flags as a deprecated
    # incompatible-dtype assignment.
    df[cols] = scaler.fit_transform(df[cols])

#### Process Data

In [347]:
""" 
KEEP ORDER
"""

# Why the order matters:
#   - get_encoded_df returns a new frame, so df is re-bound first and the
#     inplace steps below operate on the encoded frame.
#   - drop_goalkeepers has to run before process_positions: TargetCoder has no
#     mapping for "GK", so encoding a goalkeeper row would raise KeyError.
#   - normalize_df selects columns by position (COLS_TO_NORMALIZE), so the column
#     layout must already be final: mentality_composure dropped and the position
#     column moved to the end.
# NOTE(review): this cell rebinds and mutates df, so re-running it without
# reloading the data fails (work_rate / preferred_foot are already encoded).
df = get_encoded_df(df)

drop_goalkeepers(df)
drop_mentality_composure(df)
process_positions(df)
normalize_df(df)

df.head()

Unnamed: 0,height_cm,weight_kg,weak_foot,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,work_rate_High/High,work_rate_High/Low,work_rate_High/Medium,work_rate_Low/High,work_rate_Low/Low,work_rate_Low/Medium,work_rate_Medium/High,work_rate_Medium/Low,work_rate_Medium/Medium,preferred_foot_Left,preferred_foot_Right,position
0,0.288462,0.295082,0.5,0.75,0.947368,0.9375,0.90411,1.0,0.168831,0.553846,0.879518,0.988235,0.710843,0.92,0.903614,0.988235,0.939759,0.941176,0.7625,1.0,0.987013,0.907895,0.973333,0.972222,0.975,0.811765,0.688312,0.74026,0.518987,0.927711,0.44186,0.144578,0.952941,0.929412,0.764706,0.178571,0.130952,0.117647,0,0,0,0,0,0,0,1,0,1,0,0
1,0.596154,0.508197,0.75,1.0,0.947368,0.9875,0.835616,0.932432,0.233766,0.8,0.86747,1.0,0.891566,0.826667,0.927711,0.952941,0.927711,0.811765,0.7125,0.95,0.922078,0.960526,0.96,0.916667,0.575,0.976471,0.961039,0.896104,0.759494,0.987952,0.616279,0.168675,0.941176,0.823529,0.870588,0.142857,0.25,0.152941,0,1,0,0,0,0,0,0,0,0,1,1
2,0.5,0.508197,0.25,0.75,0.947368,0.9,0.863014,0.945946,0.233766,0.569231,0.831325,0.882353,0.457831,0.88,0.915663,0.952941,0.891566,0.858824,0.7625,0.925,0.948052,0.947368,0.96,0.902778,0.925,0.882353,0.532468,0.753247,0.582278,0.951807,0.430233,0.349398,0.917647,0.858824,0.811765,0.22619,0.190476,0.188235,0,1,0,0,0,0,0,0,0,1,0,2
3,0.788462,0.754098,0.75,0.75,0.723684,0.9625,0.835616,0.864865,0.25974,0.907692,0.783133,0.952941,0.771084,0.853333,0.987952,0.894118,0.831325,0.823529,0.7625,0.925,0.701299,0.736842,0.866667,0.847222,0.3,0.964706,0.675325,0.753247,0.936709,0.927711,0.860465,0.120482,0.882353,0.847059,0.941176,0.178571,0.369048,0.2,0,0,0,0,0,0,0,1,0,0,1,0
5,0.307692,0.262295,0.75,0.75,0.710526,0.725,0.945205,0.932432,0.584416,0.553846,0.891566,0.741176,0.506024,0.973333,0.771084,0.941176,0.831325,0.705882,0.925,0.975,0.727273,0.710526,0.826667,0.916667,0.8625,0.635294,0.441558,0.753247,0.506329,0.771084,0.55814,0.698795,0.894118,0.964706,0.705882,0.559524,0.559524,0.541176,0,0,1,0,0,0,0,0,0,0,1,3


#### Save Data

In [348]:

# The feature columns were already min-max scaled by normalize_df(). Fitting the
# shared scaler a second time on the scaled values would overwrite the statistics
# it learned from the raw data (leaving it unusable for transforming new raw
# rows), so the scaled values are only read here.
# `new_df` is kept as a plain array of the scaled feature columns.
new_df = df.iloc[:, COLS_TO_NORMALIZE].to_numpy()

df.head()

   height_cm  weight_kg  weak_foot  skill_moves      pace  shooting   passing  \
0   0.288462   0.295082       0.50         0.75  0.947368    0.9375  0.904110   
1   0.596154   0.508197       0.75         1.00  0.947368    0.9875  0.835616   
2   0.500000   0.508197       0.25         0.75  0.947368    0.9000  0.863014   
3   0.788462   0.754098       0.75         0.75  0.723684    0.9625  0.835616   
5   0.307692   0.262295       0.75         0.75  0.710526    0.7250  0.945205   

   dribbling  defending    physic  attacking_crossing  attacking_finishing  \
0   1.000000   0.168831  0.553846            0.879518             0.988235   
1   0.932432   0.233766  0.800000            0.867470             1.000000   
2   0.945946   0.233766  0.569231            0.831325             0.882353   
3   0.864865   0.259740  0.907692            0.783133             0.952941   
5   0.932432   0.584416  0.553846            0.891566             0.741176   

   attacking_heading_accuracy  attacking_sho