#### Imports & Constants

In [27]:
import sys
import pathlib
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
sys.path.append(pathlib.Path("..").resolve().as_posix())

from src.coder import TargetCoder
from src.io_utils import write_stdout_to_file, write_df_to_csv

# Shared min-max scaler; normalize_df fits it on, and applies it to, the
# columns selected by COLS_TO_NORMALIZE.
scaler = MinMaxScaler()
# Label encoder instance; it is not referenced by the code visible in this
# section of the notebook.
le = LabelEncoder()

# The raw dataset has 110 columns and 160k+ rows, so raise the pandas display
# limits far enough to inspect frames without truncation.
pd.options.display.max_columns = 110
pd.options.display.max_rows = 1000

DATASET_PATH = pathlib.Path("..", "data", "raw", "players.csv")
# Integer column indices handed to read_csv(usecols=...) when loading the data.
MAIN_COLUMNS = [7, 14, 15, 31, 32, 33, 35, 41, 42, 43, 44, 45, 46]
DETAILED_COLUMNS = [*range(47, 76)]
USED_COLUMNS = [*MAIN_COLUMNS, *DETAILED_COLUMNS]

# Column positions (in the encoded dataframe) that normalize_df rescales.
COLS_TO_NORMALIZE = range(38)

print(USED_COLUMNS)

[7, 14, 15, 31, 32, 33, 35, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75]


#### Load Data 

In [28]:
# Load only the columns listed in USED_COLUMNS from the raw players CSV.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH, usecols=USED_COLUMNS)

##### Drop rows & columns

In [29]:
def drop_goalkeepers(df: pd.DataFrame) -> None:
    """
    Drops all goalkeepers from the dataset inplace
    Effect:
        - Removes every row whose "player_positions" string contains "GK"
        - Rows with a missing "player_positions" value are kept
    """
    # na=False: a missing position yields False instead of NaN, so the mask
    # stays strictly boolean and can be used to select rows.
    # regex=False: "GK" is a literal substring, not a pattern.
    is_goalkeeper = df["player_positions"].str.contains("GK", na=False, regex=False)
    goalkeeper_labels = df.index[is_goalkeeper.to_numpy(dtype=bool)]
    df.drop(index=goalkeeper_labels, inplace=True)
    
def drop_mentality_composure(df: pd.DataFrame) -> None:
    """
    Removes the "mentality_composure" column from the dataframe inplace
    Reason:
        - The column has 20k+ Nans
    """
    sparse_columns = ["mentality_composure"]
    df.drop(labels=sparse_columns, axis="columns", inplace=True)

### Process positions

In [30]:
def map_positions(df: pd.DataFrame) -> None:
    """
    Turns the player_positions column into a "position" column inplace
    Effect:
        - Keeps only the first entry of the comma separated positions string
        - Maps that entry through TargetCoder.map
        - Removes "player_positions" and appends the result as the last
          column, named "position"
    """
    # first listed position of every player
    first_listed = df["player_positions"].str.split(",").str[0]
    mapped = first_listed.map(TargetCoder.map)
    # swap the old column for the new one, which lands at the end
    df.pop("player_positions")
    df["position"] = mapped
    
def encode_positions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replaces every value of the "position" column with its TargetCoder.encode mapping
    Effect:
        - Overwrites the "position" column of the given dataframe
        - Returns that same dataframe
    """
    encoded_positions = df["position"].map(TargetCoder.encode)
    df["position"] = encoded_positions
    return df

##### One hot encode work rate & preferred_foot

In [31]:
# RUN ONCE BECAUSE THE WORK_RATE COLUMNS WOULD HAVE BEEN DROPPED
def get_encoded_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds and returns an encoded version of the dataframe
    Effect:
        - One hot encodes the work_rate
        - One hot encodes the preferred_foot
        - Custom encode positions
        - Keeps "position" as the last column
    """
    categorical_columns = ["work_rate", "preferred_foot"]
    encoded = encode_positions(pd.get_dummies(df, columns=categorical_columns))
    # the dummy columns are appended after "position"; move it back to the end
    target = encoded.pop("position")
    encoded["position"] = target
    return encoded


##### Normalize Columns

In [32]:
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a normalized copy of the dataframe (the input is not modified)
    Effect:
        - Rescales the columns at positions COLS_TO_NORMALIZE with the
          module level scaler, which is fit on those same columns
        - Every other column is copied unchanged
    """
    normalized_df = df.copy()
    # Replace the columns by label instead of writing through .iloc: the scaled
    # values are floats, and storing floats into integer columns with an
    # .iloc assignment is a dtype-upcasting setitem that pandas has deprecated.
    target_columns = df.columns[list(COLS_TO_NORMALIZE)]
    normalized_df[target_columns] = scaler.fit_transform(df[target_columns])
    return normalized_df

#### Process Data

In [33]:
""" 
KEEP ORDER
"""
# Why the order matters:
#   - drop_goalkeepers and map_positions both read "player_positions";
#     map_positions removes that column and adds "position" instead, so the
#     goalkeepers have to be dropped first.
#   - get_encoded_df encodes the "position" column created by map_positions.
#   - normalize_df picks its columns by position (COLS_TO_NORMALIZE), so it
#     relies on the column layout produced by the steps before it.

# drop goalkeeper rows and the mentality_composure column
drop_goalkeepers(df)
drop_mentality_composure(df)
# map positions
map_positions(df)

# cleaned data (snapshot taken before encoding / normalization)
cleaned_df = df.copy()
# encode & normalize data
encoded_df = get_encoded_df(cleaned_df)
normalized_df = normalize_df(encoded_df)
# last expression of the cell: displays the (rows, columns) of the encoded frame
encoded_df.shape


(143613, 50)

#### Save Data

In [34]:

# Output locations, one CSV plus one info text file per pipeline stage.
cleaned_data_path = pathlib.Path("..", "data", "cleaned", "cleaned_data.csv")
cleaned_data_info_path = pathlib.Path("..", "data", "cleaned", "cleaned_data_info.txt")

processed_data_path = pathlib.Path("..", "data", "processed", "processed_data.csv")
processed_data_info_path = pathlib.Path("..", "data", "processed", "processed_data_info.txt")

encoded_data_path = pathlib.Path("..", "data", "encoded", "encoded_data.csv")
encoded_data_info_path = pathlib.Path("..", "data", "encoded", "encoded_data_info.txt")

def create_cleaned_data_files(df: pd.DataFrame) -> None:
    """
    Saves the cleaned dataframe and its summary
    Effect:
        - Passes the dataframe to write_df_to_csv with cleaned_data_path
        - Passes a callable running df.info(verbose=True, show_counts=True)
          to write_stdout_to_file with cleaned_data_info_path
    """
    def show_info():
        return df.info(verbose=True, show_counts=True)

    write_df_to_csv(cleaned_data_path, df)
    write_stdout_to_file(cleaned_data_info_path, show_info)
    
def create_processed_data_files(df: pd.DataFrame) -> None:
    """
    Saves the processed dataframe and its summary
    Effect:
        - Passes the dataframe to write_df_to_csv with processed_data_path
        - Passes a callable running df.info(verbose=True, show_counts=True)
          to write_stdout_to_file with processed_data_info_path
    """
    def show_info():
        return df.info(verbose=True, show_counts=True)

    write_df_to_csv(processed_data_path, df)
    write_stdout_to_file(processed_data_info_path, show_info)

def create_encoded_data_files(df: pd.DataFrame) -> None:
    """
    Saves the encoded dataframe and its summary
    Effect:
        - Passes the dataframe to write_df_to_csv with encoded_data_path
        - Passes a callable running df.info(verbose=True, show_counts=True)
          to write_stdout_to_file with encoded_data_info_path
    """
    def show_info():
        return df.info(verbose=True, show_counts=True)

    write_df_to_csv(encoded_data_path, df)
    write_stdout_to_file(encoded_data_info_path, show_info)
    
# create the data files of every stage, in order: cleaned, encoded, processed
for save_stage, stage_df in (
    (create_cleaned_data_files, cleaned_df),
    (create_encoded_data_files, encoded_df),
    (create_processed_data_files, normalized_df),
):
    save_stage(stage_df)