#### Imports & Constants

In [None]:
import pathlib
import pandas as pd

# Display settings: the dataset has 110 columns, so allow all of them to be
# shown, and allow up to 1000 rows per displayed frame.
pd.set_option('display.max_columns', 110)
pd.set_option('display.max_rows', 1000)

# Location of the raw data, relative to the notebook's working directory.
DATASET_PATH = pathlib.Path("..", "data", "raw", "players.csv")

# Positional (0-based) indices of the CSV columns that are loaded.
MAIN_COLUMNS = [7, 14, 15, 31, 32, 33, 35, 40, 41, 42, 43, 44, 45, 46]
DETAILED_COLUMNS = [*range(47, 76)]
USED_COLUMNS = [*MAIN_COLUMNS, *DETAILED_COLUMNS]
print(USED_COLUMNS)

#### Load Data 

In [None]:
# Load only the selected column positions from the raw CSV file.
df = pd.read_csv(
    DATASET_PATH,
    usecols=USED_COLUMNS,
)

##### Trimming Columns

In [None]:
# Relocate the target: take 'player_positions' out of the frame and re-attach
# it as the last column under the name 'labels'.
labels = df.pop('player_positions')
df.insert(loc=df.shape[1], column='labels', value=labels)
print(df['labels'].head())

##### Dropping all GKs

In [None]:
# Goalkeepers are not a class we predict, so remove every row whose label is
# exactly 'GK' (in place).
is_goalkeeper = df['labels'] == 'GK'
df.drop(df[is_goalkeeper].index, inplace=True)

### Processed Positions

In [None]:
# dict holds all the unique positions and their corresponding custom positions
# the returned value is the custom position that will be used for training and testing
positions = {'lift wing':['LW', 'LM', 'LS', 'LAM', 'LF'],
                 'right wing':['RW', 'RM', 'RS', 'RAM', 'RF'], 
                 'striker':['CF', 'ST', 'CAM', 'SS', 'LS', 'RS'], 
                 'center midfield':['CM', 'CDM', 'CAM'], 
                 'left back':['LB', 'LWB'], 
                 'right back':['RB', 'RWB'], 
                 'center back':['CB', 'LCB', 'RCB']}
def _get_player_positions(pos: list) -> str:
    pass

def proccess_positions(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the raw values in the 'labels' column with custom positions.

    Each value of ``df['labels']`` is passed through
    ``_get_player_positions`` and the column is overwritten with the results.

    Args:
        df: Frame that contains a 'labels' column. It is modified in place.

    Returns:
        The same frame object that was passed in, for convenience.
    """
    df['labels'] = df['labels'].apply(_get_player_positions)
    return df

# applying the function to the dataframe: `proccess_positions` takes the whole
# frame (it reads and rewrites the 'labels' column) and returns it
df = proccess_positions(df)


In [None]:
""" 
encode:
    nation_position with custom positions (target)
    work rate with custom work rate
    player traits with custom traits
remove:
    player_positions
    body type
    goalkeepers
    goalkeeping attributes
"""
# for each trait, split by comma and get unique values
def get_unique_values(df, column):
    """Collect the distinct values found in ``df[column]``, in first-seen order.

    String cells are split on commas and each piece is stripped of
    surrounding whitespace, so ``"a, b"`` contributes ``"a"`` and ``"b"``.
    Non-string cells (numbers, missing values, ...) are kept as single values.

    Args:
        df: Any object where ``df[column]`` yields an iterable of cells
            (typically a pandas DataFrame).
        column: Key of the column to scan.

    Returns:
        A list of the unique values. A missing value (NaN) appears at most
        once, even though NaN never compares equal to itself.
    """
    unique_values = []
    nan_recorded = False
    for row in df[column]:
        if isinstance(row, str):
            parts = [piece.strip() for piece in row.split(",")]
        else:
            parts = [row]
        for value in parts:
            if isinstance(value, float) and value != value:
                # NaN != NaN, so a plain membership test would re-add it.
                if nan_recorded:
                    continue
                nan_recorded = True
            elif value in unique_values:
                continue
            unique_values.append(value)
    return unique_values
# Distinct values of the frame's last column.
last_column = df.iloc[:, -1]
last_column.unique()

#### Save Data