#### Imports & Constants

In [101]:
import sys
import pathlib
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

sys.path.append(pathlib.Path("..").resolve().as_posix())

from src.io_utils import write_stdout_to_file, write_df_to_csv


# Raise the pandas display limits (up to 110 columns / 1000 rows).
pd.set_option("display.max_columns", 110)
pd.set_option("display.max_rows", 1000)

data_root = pathlib.Path("..") / "data"

# Input of the PCA section.
encoded_df = pd.read_csv(data_root / "encoded" / "encoded_data.csv")

# Input of the manual feature selection section.
processed_df = pd.read_csv(data_root / "processed" / "processed_data.csv")

##### Manual Feature Selection Methods

In [102]:
# Manual feature selection: height and weight are combined into one feature
# (`height_weight`), which is inserted at the first index while the two
# original columns are removed. The attribute groups below are the columns
# that get screened against the six main FIFA card features.
MAIN_FIFA_CARD_FEATURES = [
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
]
ATTACKING_FEATURES = [
    "attacking_finishing",
    "attacking_volleys",
    "attacking_heading_accuracy",
    "attacking_crossing",
]
SKILL_FEATURES = [
    "skill_fk_accuracy",
    "skill_ball_control",
    "skill_dribbling",
    "skill_curve",
    "skill_long_passing",
]
MOVE_FEATURES = [
    "movement_agility",
    "movement_reactions",
    "movement_balance",
    "movement_acceleration",
    "movement_sprint_speed",
]
POWER_FEATURES = [
    "power_shot_power",
    "power_jumping",
    "power_stamina",
    "power_strength",
    "power_long_shots",
]
MENTAL_FEATURES = [
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_positioning",
    "mentality_vision",
    "mentality_penalties",
]
DEFENDING_FEATURES = [
    "defending_standing_tackle",
    "defending_sliding_tackle",
]
# Every detailed attribute, in group order.
EXTRA_FEATURES = (
    ATTACKING_FEATURES
    + SKILL_FEATURES
    + MOVE_FEATURES
    + POWER_FEATURES
    + MENTAL_FEATURES
    + DEFENDING_FEATURES
)

def create_height_weight_feature(df: pd.DataFrame):
    """Replace `height_cm` and `weight_kg` with a single `height_weight` column.

    The frame is modified in place: the combined column is inserted at
    position 0 and the two source columns are dropped afterwards.

    Args:
        df: Frame containing `height_cm` and `weight_kg` columns.

    Returns:
        The same (mutated) `df` object.
    """
    # NOTE(review): operator precedence makes this height + (weight / 2),
    # not the mean of the two columns - confirm this weighting is intended.
    combined = df["height_cm"] + df["weight_kg"] / 2
    df.insert(0, "height_weight", combined)
    df.drop(columns=["height_cm", "weight_kg"], inplace=True)
    return df

def get_redundant_features(
    df: pd.DataFrame,
    main_features=None,
    extra_features=None,
    threshold: float = 0.8,
) -> list:
    """List the detailed features that are strongly correlated with a main feature.

    Every (main feature, extra feature) pair is compared with `Series.corr`;
    an extra feature is reported when the absolute correlation with at least
    one main feature exceeds `threshold`.

    Args:
        df: Frame containing all referenced columns.
        main_features: Reference columns. Defaults to `MAIN_FIFA_CARD_FEATURES`.
        extra_features: Candidate columns. Defaults to `EXTRA_FEATURES`.
        threshold: Absolute correlation above which a candidate is redundant.

    Returns:
        Alphabetically sorted list of redundant column names, without
        duplicates. Sorting keeps the result stable between runs (iterating a
        set of strings has no guaranteed order).
    """
    if main_features is None:
        main_features = MAIN_FIFA_CARD_FEATURES
    if extra_features is None:
        extra_features = EXTRA_FEATURES

    # A NaN correlation (e.g. a constant column) never compares greater than
    # the threshold, so such columns are not reported.
    redundant_features = {
        extra_feature
        for main_feature in main_features
        for extra_feature in extra_features
        if abs(df[extra_feature].corr(df[main_feature])) > threshold
    }
    return sorted(redundant_features)

def remove_features_with_high_correlation(df: pd.DataFrame, redundant_features: list) -> pd.DataFrame:
    """Drop the given redundant columns from `df` in place.

    Args:
        df: Frame to prune; it is mutated.
        redundant_features: Column names to remove. A name that is not a
            column of `df` raises `KeyError` (pandas default behaviour).

    Returns:
        The same (mutated) `df` object, matching the declared return type and
        the behaviour of `create_height_weight_feature`.
    """
    df.drop(columns=redundant_features, inplace=True)
    return df

def remove_features_with_low_corr_to_position(
    df: pd.DataFrame,
    target: str = "position",
    threshold: float = 0.1,
) -> pd.DataFrame:
    """Drop, in place, every column that is only weakly correlated with the target.

    A column is removed when the absolute value of its `Series.corr` with
    `df[target]` is below `threshold`. Columns whose correlation is NaN
    (e.g. constant columns) are kept, because NaN never compares below the
    threshold.

    Args:
        df: Frame to prune; it is mutated. All columns must be numeric.
        target: Name of the label column.
        threshold: Minimum absolute correlation a column needs to be kept.

    Returns:
        The same (mutated) `df` object.
    """
    # NOTE(review): `position` looks like an integer-encoded class label; a
    # linear correlation with such codes depends on the encoding order -
    # confirm this criterion is intended.
    target_values = df[target]
    weak_features = [
        feature
        for feature in df.columns
        if abs(df[feature].corr(target_values)) < threshold
    ]
    # Single drop instead of one in-place drop per weak column.
    df.drop(columns=weak_features, inplace=True)
    return df

##### Removing The Weak Features

In [103]:
# Work on a copy so that `processed_df` keeps its loaded state and this cell
# can be re-run (the helpers below modify the frame they receive in place;
# running them twice on the same frame fails because `height_cm` /
# `weight_kg` are already gone).
final_data_df = processed_df.copy()

create_height_weight_feature(final_data_df)
redundant_features = get_redundant_features(final_data_df)
remove_features_with_high_correlation(final_data_df, redundant_features)
remove_features_with_low_corr_to_position(final_data_df)

In [104]:
# Preview the first rows of the manually selected feature set.
final_data_df.head()

Unnamed: 0,height_weight,weak_foot,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_heading_accuracy,skill_curve,skill_fk_accuracy,movement_agility,movement_balance,power_jumping,mentality_aggression,mentality_penalties,defending_marking_awareness,work_rate_High/Low,work_rate_High/Medium,work_rate_Low/High,work_rate_Low/Medium,work_rate_Medium/High,work_rate_Medium/Low,preferred_foot_Left,preferred_foot_Right,position
0,0.436003,0.5,0.75,0.947368,0.9375,0.90411,1.0,0.168831,0.553846,0.710843,0.939759,0.941176,0.973333,0.975,0.688312,0.44186,0.764706,0.178571,0,0,0,0,0,1,1,0,0
1,0.850252,0.75,1.0,0.947368,0.9875,0.835616,0.932432,0.233766,0.8,0.891566,0.927711,0.811765,0.96,0.575,0.961039,0.616279,0.870588,0.142857,1,0,0,0,0,0,0,1,1
2,0.754098,0.25,0.75,0.947368,0.9,0.863014,0.945946,0.233766,0.569231,0.457831,0.891566,0.858824,0.96,0.925,0.532468,0.430233,0.811765,0.22619,1,0,0,0,0,0,1,0,2
3,1.165511,0.75,0.75,0.723684,0.9625,0.835616,0.864865,0.25974,0.907692,0.771084,0.831325,0.823529,0.866667,0.3,0.675325,0.860465,0.941176,0.178571,0,0,0,0,0,1,0,1,0
4,0.43884,0.75,0.75,0.710526,0.725,0.945205,0.932432,0.584416,0.553846,0.506024,0.831325,0.705882,0.826667,0.8625,0.441558,0.55814,0.705882,0.559524,0,1,0,0,0,0,0,1,3


##### PCA Dimensionality Reduction

In [105]:

def standardize_encoded_X(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize every column of `df` except the last one.

    The last column is left out (presumably the target label - confirm
    against the encoded dataset) and the remaining columns are passed through
    a freshly fitted `StandardScaler`.

    Args:
        df: Encoded frame whose last column is excluded from scaling.

    Returns:
        A new frame with the scaled values and the original feature column
        names. The result gets a default integer index.
    """
    feature_names = df.columns[:-1]
    feature_values = df.iloc[:, :-1]
    scaled_values = StandardScaler().fit_transform(feature_values)
    return pd.DataFrame(scaled_values, columns=feature_names)
    
def apply_pca(
    standardized_df: pd.DataFrame,
    n_components_list,
    min_variance: float = 0.8,
) -> "tuple | None":
    """Project the data with the first candidate PCA that keeps enough variance.

    Candidates are tried in the order given. For each one a PCA is fitted and
    the sum of its `explained_variance_ratio_` is compared with
    `min_variance`; the first candidate that reaches it wins.

    Args:
        standardized_df: Standardized feature matrix.
        n_components_list: Candidate component counts. A single integer is
            accepted as well and treated as a one-element list (previously an
            integer raised `TypeError` when iterated).
        min_variance: Minimum cumulative explained-variance ratio required.

    Returns:
        `(transformed_array, n_components)` for the first candidate that
        reaches `min_variance`, or `None` when no candidate does.
    """
    if isinstance(n_components_list, (int, np.integer)):
        n_components_list = [n_components_list]

    for n_components in n_components_list:
        pca = PCA(n_components=n_components)
        pca.fit(standardized_df)
        explained = sum(pca.explained_variance_ratio_)
        if explained >= min_variance:
            return pca.transform(standardized_df), n_components
    return None

# Candidate component counts; `apply_pca` iterates over this list and returns
# `(transformed_array, n_components)` for the first candidate that keeps at
# least 80% of the variance, or `None` if none does.
n_components_candidates = [12]

standardized_X = standardize_encoded_X(encoded_df)
pca_result = apply_pca(standardized_X, n_components_candidates)
if pca_result is None:
    raise ValueError(
        f"No candidate in {n_components_candidates} keeps at least 80% of the variance"
    )

pca_nd, n_components = pca_result
print(f"To keep 80% of the variance of the dataset we choose best_n_components = {n_components}")

pca_df = pd.DataFrame(pca_nd, columns=[f"PC{i}" for i in range(1, n_components + 1)])

pca_df

To keep 80% of the variance of the dataset we choose best_n_components = 12


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12
0,12.905589,-0.138036,0.198703,-0.275086,2.228092,0.555626,1.728678,-0.550206,-2.367464,1.510994,-2.249212,0.262012
1,11.786134,-2.284943,3.994422,-3.617748,0.099447,-0.784116,1.724072,-0.309832,3.089828,-0.913293,-2.408234,1.771607
2,11.444766,-0.439610,0.521081,-0.585214,2.895368,-0.038967,1.874993,-0.312102,2.605173,-1.239531,-2.373096,2.509522
3,9.345703,-3.700461,6.915738,-1.307354,0.569237,-0.318118,-0.317502,-0.866436,-2.988072,0.763043,-2.287614,1.417917
4,9.592615,-3.446318,-1.812692,1.534449,-1.590814,1.125930,-1.963686,-0.137394,0.387136,0.061721,-0.422368,0.161367
...,...,...,...,...,...,...,...,...,...,...,...,...
143608,-3.274679,6.886251,0.854579,1.775411,-0.583341,-0.349629,-0.557766,0.069710,-0.215159,-0.345815,0.652037,0.017529
143609,-8.988642,3.845792,0.185853,0.799956,2.739477,-0.577891,-0.174643,0.167088,-0.283012,-0.219312,0.794219,-0.085677
143610,-4.392515,3.547324,-0.765833,-0.242141,-0.328417,1.699870,-1.808705,0.103230,0.054980,-0.124111,0.678283,-0.270646
143611,-3.530437,7.257623,1.262372,1.280332,-1.076858,-0.036312,0.802410,0.406380,0.511562,0.345821,1.285298,-1.631352


##### Save Files

In [106]:
# Every output of this notebook is written to ../data/final.
final_dir = pathlib.Path("..") / "data" / "final"

# Manually selected feature set and its `DataFrame.info()` report.
final_data_path = final_dir / "final_data.csv"
final_data_info_path = final_dir / "final_data_info.txt"

# PCA projection and its `DataFrame.info()` report.
pca_data_path = final_dir / "pca_data.csv"
pca_data_info_path = final_dir / "pca_data_info.txt"

def _write_data_files(df: pd.DataFrame, csv_path: pathlib.Path, info_path: pathlib.Path) -> None:
    """Write `df` as CSV to `csv_path` and its `df.info()` report to `info_path`."""
    # Make sure the output directories exist on a fresh checkout.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    info_path.parent.mkdir(parents=True, exist_ok=True)

    write_df_to_csv(csv_path, df)
    # `df.info` prints to stdout; the callable is handed to the project helper
    # (presumably it captures that output into the file - see src.io_utils).
    write_stdout_to_file(info_path, lambda: df.info(verbose=True, show_counts=True))


def create_final_data_files(df: pd.DataFrame) -> None:
    """Save the manually selected feature set (`final_data_path`, `final_data_info_path`)."""
    _write_data_files(df, final_data_path, final_data_info_path)


def create_pca_data_files(df: pd.DataFrame) -> None:
    """Save the PCA projection (`pca_data_path`, `pca_data_info_path`)."""
    _write_data_files(df, pca_data_path, pca_data_info_path)
    
# Persist both feature sets: the manually selected one and the PCA projection.
create_final_data_files(final_data_df)
create_pca_data_files(pca_df)