#### Imports & Constants

In [10]:
import sys
import pathlib
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

sys.path.append(pathlib.Path("..").resolve().as_posix())

from src.coder import TargetCoder
from src.io_utils import write_stdout_to_file, write_df_to_csv

# Shared preprocessing helpers, created once for the whole notebook.
# `le` is not referenced in the cells visible here - presumably kept for later
# use; confirm before removing.
le = LabelEncoder()
# `scaler` is the instance used by normalize_df below.
scaler = MinMaxScaler()

# The raw dataset has 110 columns and 160k+ rows, so raise the display limits
# to keep wide/long frames from being truncated when inspected.
pd.set_option("display.max_columns", 110)
pd.set_option("display.max_rows", 1000)

# Location of the raw players file, relative to the notebook directory.
DATASET_PATH = pathlib.Path("..", "data", "raw", "players.csv")

# Positional indices (within the raw CSV) of the columns that are loaded.
MAIN_COLUMNS = [7, 14, 15, 31, 32, 33, 35, 41, 42, 43, 44, 45, 46]
DETAILED_COLUMNS = [*range(47, 76)]
USED_COLUMNS = [*MAIN_COLUMNS, *DETAILED_COLUMNS]

# Positions of the columns that are handed to the scaler in normalize_df.
COLS_TO_NORMALIZE = range(38)

print(USED_COLUMNS)

[7, 14, 15, 31, 32, 33, 35, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75]


#### Load Data 

In [11]:
# Load only the columns listed in USED_COLUMNS (selected by position) from the raw CSV.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH, usecols=USED_COLUMNS)

##### Drop Rows & Columns

In [12]:
def drop_goalkeepers(df: pd.DataFrame) -> None:
    """
    Removes every goalkeeper row from the dataframe inplace
    Effect:
        - Rows whose "player_positions" string contains "GK" are dropped
        - Nothing is returned, the passed dataframe itself is modified
    """
    is_goalkeeper = df["player_positions"].str.contains("GK")
    goalkeeper_rows = df.loc[is_goalkeeper].index
    df.drop(index=goalkeeper_rows, inplace=True)
    
def drop_mentality_composure(df: pd.DataFrame) -> None:
    """
    Removes the "mentality_composure" column from the dataframe inplace
    Reason:
        - The column has 20k+ Nans
    Effect:
        - Nothing is returned, the passed dataframe itself is modified
    """
    df.drop(labels=["mentality_composure"], axis="columns", inplace=True)

### Process positions

In [13]:
def map_positions(df: pd.DataFrame) -> None:
    """
    Reduces the player_positions column to a single mapped position inplace
    Effect:
        - Keeps only the main position (first entry of the comma separated string)
        - Passes that main position through TargetCoder.map
        - The column keeps its name and its place in the dataframe
    """
    # main position = text before the first comma
    main_position = df["player_positions"].str.split(",").str[0]
    df["player_positions"] = main_position.map(TargetCoder.map)
    
def encode_positions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a copy of the dataframe with the positions encoded
    Effect:
        - The passed dataframe is not modified
        - "player_positions" is removed and re-added as "position"
        - Every value of "position" is passed through TargetCoder.encode
    """
    encoded = df.copy()
    # pop + assign under the new name appends the column at the end
    raw_positions = encoded.pop("player_positions")
    encoded["position"] = raw_positions.map(TargetCoder.encode)
    return encoded

##### One hot encode work rate & preferred_foot

In [14]:
# RUN ONCE BECAUSE THE WORK_RATE COLUMNS WOULD HAVE BEEN DROPPED
def get_encoded_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns an encoded copy of the dataframe (the input is not modified)
    Effect:
        - One hot encodes the work_rate
        - One hot encodes the preferred_foot
        - Custom encode positions (see encode_positions)
    """
    # get_dummies builds a new frame from df; the two source columns are
    # replaced by their indicator columns.
    new_df = pd.get_dummies(df, columns=['work_rate', 'preferred_foot'])
    # Encode the positions of the one-hot result (not of the raw df), otherwise
    # the indicator columns created above would be thrown away.
    new_df = encode_positions(new_df)
    return new_df


##### Normalize Columns

In [15]:
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a normalized copy of the dataframe (the input is not modified)
    Effect:
        - The columns at the positions in COLS_TO_NORMALIZE are replaced by the
          output of the module level scaler's fit_transform
        - All other columns are copied unchanged
    Note:
        - The scaler is re-fitted on every call
        - The selected columns are handed to the scaler as they are, so they
          have to be numeric
    """
    positions = list(COLS_TO_NORMALIZE)
    scaled_values = scaler.fit_transform(df.iloc[:, positions])

    normalized_df = df.copy()
    # Replace the columns as a whole (by label) instead of writing through
    # .iloc: the scaler returns floats, and writing floats into integer columns
    # through .iloc relies on silent upcasting, which pandas has deprecated.
    target_columns = df.columns[positions].tolist()
    normalized_df[target_columns] = scaled_values
    return normalized_df

#### Process Data

In [16]:
""" 
KEEP ORDER
"""

# drop rows & columns (both helpers modify df inplace)
drop_goalkeepers(df)
drop_mentality_composure(df)
# map positions
map_positions(df)

# encode & normalize data
# Encoding has to come first: normalize_df scales the leading
# COLS_TO_NORMALIZE columns by position, and in the un-encoded df those still
# include string columns (preferred_foot, work_rate, player_positions) that
# the scaler cannot handle.
encoded_df = get_encoded_df(df)
normalized_df = normalize_df(encoded_df)
# The encoded frame is exposed under both names used across the notebook.
proccessed_df = encoded_df


df.head()

Unnamed: 0,height_cm,weight_kg,preferred_foot,weak_foot,skill_moves,work_rate,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,position
0,169,67,Left,3,4,Medium/Low,93.0,89.0,86.0,96.0,27.0,63.0,84,94,71,89,85,96,89,90,76,96,96,90,94,94,95,80,73,77,60,88,48,22,92,90,76,25,21,20,ST
1,185,80,Right,4,5,High/Low,93.0,93.0,81.0,91.0,32.0,79.0,83,95,86,82,87,93,88,79,72,92,91,94,93,90,63,94,94,89,79,93,63,24,91,81,85,22,31,23,LW
2,180,80,Left,2,4,High/Low,93.0,86.0,83.0,92.0,32.0,64.0,80,85,50,86,86,93,85,83,76,90,93,93,93,89,91,86,61,78,65,90,47,39,89,84,80,29,26,26,RW
3,195,95,Right,4,4,Medium/Low,76.0,91.0,81.0,86.0,34.0,86.0,76,91,76,84,92,88,80,80,76,90,74,77,86,85,41,93,72,78,93,88,84,20,86,83,91,25,41,27,ST
5,170,65,Right,4,4,High/Medium,75.0,72.0,89.0,91.0,59.0,63.0,85,73,54,93,74,92,80,70,89,94,76,75,83,90,86,65,54,78,59,75,58,68,87,93,71,57,57,56,CM


#### Save Data

In [17]:

# Output locations for the cleaned dataset and its df.info() report.
cleaned_data_path = pathlib.Path("..", "data", "processed", "cleaned_data.csv")
cleaned_data_info_path = pathlib.Path("..", "data", "processed", "cleaned_data_info.txt")

# Output locations for the processed dataset and its df.info() report.
processed_data_path = pathlib.Path("..", "data", "processed", "processed_data.csv")
processed_data_info_path = pathlib.Path("..", "data", "processed", "processed_data_info.txt")

def create_cleaned_data_files(df: pd.DataFrame) -> None:
    """
    Writes the cleaned dataframe and its info report to disk
    Effect:
        - Passes the dataframe to write_df_to_csv with cleaned_data_path
        - Passes a callable printing df.info to write_stdout_to_file with
          cleaned_data_info_path (presumably the printed output ends up in
          that file - see src.io_utils)
    """
    def print_info() -> None:
        df.info(verbose=True, show_counts=True)

    write_df_to_csv(cleaned_data_path, df)
    write_stdout_to_file(cleaned_data_info_path, print_info)
    
def create_processed_data_files(df: pd.DataFrame) -> None:
    """
    Writes the processed dataframe and its info report to disk
    Effect:
        - Passes the dataframe to write_df_to_csv with processed_data_path
        - Passes a callable printing df.info to write_stdout_to_file with
          processed_data_info_path (presumably the printed output ends up in
          that file - see src.io_utils)
    """
    def print_info() -> None:
        df.info(verbose=True, show_counts=True)

    write_df_to_csv(processed_data_path, df)
    write_stdout_to_file(processed_data_info_path, print_info)

# create cleaned data files
create_cleaned_data_files(df)
# create processed data files
# proccessed_df (sic) is the encoded frame built in the "Process Data" cell
create_processed_data_files(proccessed_df)