#### Imports & Constants

In [None]:
import pathlib
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Notebook display limits: 110 columns (the column count of the raw dataset,
# per the original author's note) and up to 1000 rows.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 110)

# Shared preprocessing objects used by the cells below.
le = LabelEncoder()
scaler = MinMaxScaler()

# Location of the raw CSV, relative to the current working directory.
DATASET_PATH = pathlib.Path("..", "data", "raw", "players.csv")

# Integer column positions handed to ``pd.read_csv(usecols=...)``.
MAIN_COLUMNS = [7, 14, 15, 31, 32, 33, 35, 41, 42, 43, 44, 45, 46]
DETAILED_COLUMNS = [*range(47, 76)]
USED_COLUMNS = [*MAIN_COLUMNS, *DETAILED_COLUMNS]

# Positions, in the processed frame, of the columns that get min-max scaled.
COLS_TO_NORMALIZE = range(38)

class TargetCoder():
    """Translate raw player-position codes into integer class labels.

    Detailed position codes (for example ``"CF"`` or ``"CAM"``) are first
    collapsed into one of seven main positions, and each main position is
    then assigned an integer code from 0 to 6.

    All methods are static; the class only serves as a namespace.
    """

    # Detailed position code -> main position it is grouped under.
    _POSITIONS_MAPPER = {
        "ST": "ST", "CF": "ST",
        "LW": "LW", "LF": "LW", "LM": "LW",
        "RW": "RW", "RM": "RW", "RF": "RW",
        "CM": "CM", "CAM": "CM", "CDM": "CM",
        "RB": "RB", "RWB": "RB",
        "LB": "LB", "LWB": "LB",
        "CB": "CB"
    }

    # Main position -> integer label.
    _TARGET_ENCODER = {
        "ST": 0, "LW": 1, "RW": 2,
        "CM": 3, "RB": 4, "LB": 5, "CB": 6
    }

    # Integer label -> main position. Derived from the encoder so the two
    # tables cannot drift apart when a position is added or renumbered.
    _TARGET_DECODER = {code: name for name, code in _TARGET_ENCODER.items()}

    @staticmethod
    def encode(target: str) -> int:
        """Return the integer label for a detailed position code.

        Args:
            target: A position code such as ``"CF"``; surrounding whitespace
                is ignored.

        Returns:
            The integer label (0-6) of the main position ``target`` maps to.

        Raises:
            KeyError: If ``target`` is not a known position code.
        """
        return TargetCoder._TARGET_ENCODER[TargetCoder.map(target)]

    @staticmethod
    def decode(target: int) -> str:
        """Return the main position name for an integer label.

        Args:
            target: An integer label produced by :meth:`encode`.

        Returns:
            The main position name, e.g. ``"ST"``.

        Raises:
            KeyError: If ``target`` is not a known label.
        """
        return TargetCoder._TARGET_DECODER[target]

    @staticmethod
    def map(target: str) -> str:
        """Collapse a detailed position code into its main position.

        Args:
            target: A position code such as ``"LWB"``; surrounding whitespace
                is ignored so tokens obtained by splitting a comma-separated
                list (``"ST, CF"`` -> ``" CF"``) resolve correctly.

        Returns:
            The main position name the code belongs to.

        Raises:
            KeyError: If ``target`` is not a known position code.
        """
        # Only strings are stripped; any other value (e.g. NaN) falls through
        # to the lookup and raises KeyError exactly as before.
        key = target.strip() if isinstance(target, str) else target
        return TargetCoder._POSITIONS_MAPPER[key]
    

# Echo the selected CSV column positions as a quick sanity check.
print(USED_COLUMNS)

#### Load Data 

In [None]:
# Load only the columns whose positions are listed in USED_COLUMNS.
df = pd.read_csv(DATASET_PATH, usecols=USED_COLUMNS)

##### Drop rows and columns

In [None]:
def drop_goalkeepers(df: pd.DataFrame) -> None:
    """Remove every goalkeeper row from ``df`` in place.

    A row counts as a goalkeeper when the text ``"GK"`` appears anywhere in
    its ``player_positions`` value.

    Args:
        df: Frame with a string ``player_positions`` column; modified in place.
    """
    is_goalkeeper = df["player_positions"].str.contains("GK")
    goalkeeper_labels = df.loc[is_goalkeeper].index
    df.drop(index=goalkeeper_labels, inplace=True)
    
def drop_mentality_composure(df: pd.DataFrame) -> None:
    """Remove the ``mentality_composure`` column from ``df`` in place.

    Rationale (from the original author's note): the column contains more
    than 20k NaN values.

    Args:
        df: Frame containing a ``mentality_composure`` column; modified in
            place.
    """
    df.drop("mentality_composure", axis="columns", inplace=True)

### Process positions

In [None]:
def process_positions(df: pd.DataFrame) -> None:
    """Turn ``player_positions`` into an integer ``position`` column in place.

    Steps:
        - Keep only the first comma-separated entry of ``player_positions``.
        - Convert that entry to an integer code with ``TargetCoder.encode``.
        - Remove ``player_positions`` and store the codes under ``position``
          (a newly added column is placed at the end of the frame).

    Args:
        df: Frame with a string ``player_positions`` column; modified in place.
    """
    # Compute the codes first so the frame is left untouched if encoding fails.
    primary_position = df["player_positions"].str.split(",").str[0]
    position_codes = primary_position.map(TargetCoder.encode)

    # Swap the raw column for the encoded one.
    del df["player_positions"]
    df["position"] = position_codes

##### One hot encode work rate & preferred_foot

In [None]:
# RUN ONCE BECAUSE THE WORK_RATE COLUMNS WOULD HAVE BEEN DROPPED
def get_encoded_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with its categorical columns one-hot encoded.

    The ``work_rate`` and ``preferred_foot`` columns are expanded with
    ``pd.get_dummies``. The input frame is not modified; the encoded frame is
    returned.

    Args:
        df: Frame containing ``work_rate`` and ``preferred_foot`` columns.

    Returns:
        A new frame in which those two columns are replaced by indicator
        columns.
    """
    categorical_columns = ["work_rate", "preferred_foot"]
    return pd.get_dummies(data=df, columns=categorical_columns)


##### Normalize Columns

In [None]:
def normalize_df(df: pd.DataFrame) -> None:
    """Min-max scale the columns at positions ``COLS_TO_NORMALIZE`` in place.

    The module-level ``scaler`` is fitted on the selected columns and the
    scaled values are written back into ``df``.

    Args:
        df: Frame whose columns at the positions in ``COLS_TO_NORMALIZE`` are
            numeric; modified in place. Column labels are assumed to be
            unique.
    """
    # Resolve positions to labels and assign with ``df[labels] = ...`` so the
    # columns are replaced outright. ``df.iloc[:, cols] = float_array`` tries
    # to write into the existing arrays on recent pandas versions, which
    # triggers the "incompatible dtype" deprecation when the columns hold
    # integers.
    selected = df.columns[list(COLS_TO_NORMALIZE)]
    df[selected] = scaler.fit_transform(df[selected])

#### Process Data

In [None]:
""" 
KEEP ORDER
"""

# Preprocessing pipeline. The steps below depend on each other's effects, so
# their order must not change.

# One-hot encode first; this returns a new frame, hence the reassignment.
df = get_encoded_df(df)

# Both drops mutate df in place. Goalkeepers must go before positions are
# encoded because TargetCoder has no mapping for "GK".
drop_goalkeepers(df)
drop_mentality_composure(df)

# Replaces player_positions with the integer "position" column.
process_positions(df)

# Scales by column position (COLS_TO_NORMALIZE), so it relies on the column
# layout produced by the steps above.
normalize_df(df)

# Preview the first rows of the processed frame.
df.head()

#### Save Data