#### Imports & Constants

In [None]:
import sys
import pathlib
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

sys.path.append(pathlib.Path("..").resolve().as_posix())

from src.io_utils import write_stdout_to_file, write_df_to_csv


# Raise the pandas display limits to 110 columns / 1000 rows for notebook output.
pd.set_option("display.max_columns", 110)
pd.set_option("display.max_rows", 1000)

# Encoded dataset: input of the PCA dimensionality reduction.
encoded_df = pd.read_csv(pathlib.Path("..", "data", "encoded", "encoded_data.csv"))

# Processed dataset: input of the manual feature selection.
processed_df = pd.read_csv(pathlib.Path("..", "data", "processed", "processed_data.csv"))

##### Manual Feature Selection Methods

In [None]:
""" 
    create_height_weight_feature combines height and weight into a single feature
    (height_weight), inserts it as the first column, and removes the two original columns.
"""
# The six main FIFA card features; every detailed attribute below is compared
# against these when looking for redundant columns.
MAIN_FIFA_CARD_FEATURES = [
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
]

# Detailed attributes, grouped by their column-name prefix.
ATTACKING_FEATURES = [
    "attacking_finishing",
    "attacking_volleys",
    "attacking_heading_accuracy",
    "attacking_crossing",
]
SKILL_FEATURES = [
    "skill_fk_accuracy",
    "skill_ball_control",
    "skill_dribbling",
    "skill_curve",
    "skill_long_passing",
]
MOVE_FEATURES = [
    "movement_agility",
    "movement_reactions",
    "movement_balance",
    "movement_acceleration",
    "movement_sprint_speed",
]
POWER_FEATURES = [
    "power_shot_power",
    "power_jumping",
    "power_stamina",
    "power_strength",
    "power_long_shots",
]
MENTAL_FEATURES = [
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_positioning",
    "mentality_vision",
    "mentality_penalties",
]
DEFENDING_FEATURES = [
    "defending_standing_tackle",
    "defending_sliding_tackle",
]

# Every detailed attribute in one flat list (group order preserved).
EXTRA_FEATURES = (
    ATTACKING_FEATURES
    + SKILL_FEATURES
    + MOVE_FEATURES
    + POWER_FEATURES
    + MENTAL_FEATURES
    + DEFENDING_FEATURES
)

def create_height_weight_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Replace ``height_cm`` and ``weight_kg`` with one ``height_weight`` column.

    The combined value is the mean of the two columns,
    ``(height_cm + weight_kg) / 2``. It is inserted as the first column and
    the two source columns are dropped. ``df`` is modified in place.

    Args:
        df: Frame containing ``height_cm`` and ``weight_kg`` columns.

    Returns:
        The same ``df`` object, so the call can be chained.

    Raises:
        KeyError: If ``height_cm`` or ``weight_kg`` is missing.
        ValueError: If a ``height_weight`` column already exists
            (raised by ``DataFrame.insert``).
    """
    # Parenthesize the sum: without the parentheses only weight_kg is halved,
    # because division binds tighter than addition.
    combined = (df["height_cm"] + df["weight_kg"]) / 2
    df.insert(0, "height_weight", combined)
    df.drop(columns=["height_cm", "weight_kg"], inplace=True)
    return df

def get_redundant_features(
    df: pd.DataFrame,
    main_features: list = None,
    extra_features: list = None,
    threshold: float = 0.8,
) -> list:
    """List the extra features that are strongly correlated with a main feature.

    An extra feature is considered redundant when the absolute value of its
    correlation (``Series.corr``) with at least one main feature is strictly
    greater than ``threshold``. A NaN correlation never exceeds the threshold,
    so such a feature is not reported.

    Args:
        df: Frame containing every column named in both feature lists.
        main_features: Reference columns. Defaults to
            ``MAIN_FIFA_CARD_FEATURES``.
        extra_features: Candidate columns to test. Defaults to
            ``EXTRA_FEATURES``.
        threshold: Absolute correlation above which a candidate is redundant.

    Returns:
        The redundant column names, each listed once, in the order they
        appear in ``extra_features`` (deterministic across runs).

    Raises:
        KeyError: If a listed column is missing from ``df``.
    """
    # Resolve the defaults at call time so the module-level lists are only
    # needed when the caller does not supply their own.
    if main_features is None:
        main_features = MAIN_FIFA_CARD_FEATURES
    if extra_features is None:
        extra_features = EXTRA_FEATURES

    redundant_features = []
    for extra_feature in extra_features:
        for main_feature in main_features:
            corr = df[extra_feature].corr(df[main_feature])
            if abs(corr) > threshold:
                redundant_features.append(extra_feature)
                # One strong match is enough; skip the remaining main features.
                break
    return redundant_features

def remove_features_with_high_correlation(df: pd.DataFrame, redundant_features: list) -> pd.DataFrame:
    """Drop the given redundant columns from ``df`` in place.

    Args:
        df: Frame to modify.
        redundant_features: Names of the columns to remove.

    Returns:
        The same ``df`` object, matching the declared return type and the
        behavior of ``create_height_weight_feature``.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    df.drop(columns=redundant_features, inplace=True)
    return df

def remove_features_with_low_corr_to_position(
    df: pd.DataFrame,
    target: str = "position",
    threshold: float = 0.1,
) -> pd.DataFrame:
    """Drop, in place, the columns that barely correlate with the target column.

    A column is removed when the absolute value of its correlation
    (``Series.corr``) with ``df[target]`` is strictly below ``threshold``.
    Columns whose correlation is NaN are kept, because a NaN comparison is
    never true. The target column itself is compared too; its
    self-correlation keeps it in the frame.

    Args:
        df: Frame to modify; must contain the ``target`` column.
        target: Name of the column every other column is compared against.
        threshold: Minimum absolute correlation a column needs to be kept.

    Returns:
        The same ``df`` object, matching the declared return type.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    target_values = df[target]
    # Decide on every column first and drop once, instead of mutating the
    # frame repeatedly while looping over its columns.
    weak_features = [
        feature
        for feature in df.columns
        if abs(df[feature].corr(target_values)) < threshold
    ]
    df.drop(columns=weak_features, inplace=True)
    return df

##### Removing The Weak Features

In [None]:
# Step 1: merge height_cm / weight_kg into one height_weight column
# (processed_df is modified in place).
create_height_weight_feature(processed_df)
# Step 2: collect the detailed attributes whose |correlation| with a main card
# feature exceeds 0.8. NOTE: despite its name, relationship_graph holds that
# list of column names, not a graph.
relationship_graph = get_redundant_features(processed_df)
remove_features_with_high_correlation(processed_df, relationship_graph)
# Step 3: drop the columns whose |correlation| with `position` is below 0.1.
remove_features_with_low_corr_to_position(processed_df)

# Independent copy of the reduced frame; this is what gets saved later.
final_data_df = processed_df.copy()

In [None]:
# Preview the first rows of the frame left after manual feature selection.
final_data_df.head()

##### PCA Dimensionality Reduction

In [None]:

def standardize_encoded_X(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize every column of ``df`` except the last one.

    The last column is left out of the result entirely
    (presumably it is the target/label column; confirm against the encoded data).

    Args:
        df: Encoded frame whose leading columns are numeric features.

    Returns:
        A new frame holding the ``StandardScaler`` output for the leading
        columns, with their original column names and a fresh default index.
    """
    feature_frame = df.iloc[:, :-1]
    standardized_values = StandardScaler().fit_transform(feature_frame)
    return pd.DataFrame(standardized_values, columns=feature_frame.columns)
    
def apply_pca(df: pd.DataFrame, n_components: int = 6) -> np.ndarray:
    """Fit a PCA on ``df`` and project ``df`` onto its components.

    Args:
        df: Feature frame to reduce.
        n_components: Number of principal components to keep.

    Returns:
        The output of ``PCA.transform`` for ``df``.
    """
    # fit() returns the fitted estimator, so fit and transform can be chained.
    fitted_pca = PCA(n_components=n_components).fit(df)
    return fitted_pca.transform(df)

# Reduce the standardized encoded features to 12 principal components.
n_components = 12
standardized_X = standardize_encoded_X(encoded_df)
pca_nd = apply_pca(standardized_X, n_components)
# Name the component columns PC1 ... PC<n_components>.
pca_df = pd.DataFrame(
    pca_nd,
    columns=[f"PC{index + 1}" for index in range(n_components)],
)

pca_df

##### Save Files

In [None]:
# Output locations for the manually selected features (CSV + info dump).
final_data_path = pathlib.Path("..", "data", "final", "final_data.csv")
final_data_info_path = pathlib.Path("..", "data", "final", "final_data_info.txt")

# Output locations for the PCA-reduced features (CSV + info dump).
pca_data_path = pathlib.Path("..", "data", "final", "pca_data.csv")
pca_data_info_path = pathlib.Path("..", "data", "final", "pca_data_info.txt")

def create_final_data_files(df: pd.DataFrame) -> None:
    """Save the manually selected feature frame and its ``info()`` report.

    Passes ``df`` to ``write_df_to_csv`` with ``final_data_path``, then hands
    ``write_stdout_to_file`` a callable that runs ``df.info`` for
    ``final_data_info_path`` (presumably the helper captures what the callable
    prints; see ``src.io_utils``).

    Args:
        df: Frame to save.
    """
    def _print_info():
        return df.info(verbose=True, show_counts=True)

    write_df_to_csv(final_data_path, df)
    write_stdout_to_file(final_data_info_path, _print_info)
    
def create_pca_data_files(df: pd.DataFrame) -> None:
    """Save the PCA-reduced frame and its ``info()`` report.

    Passes ``df`` to ``write_df_to_csv`` with ``pca_data_path``, then hands
    ``write_stdout_to_file`` a callable that runs ``df.info`` for
    ``pca_data_info_path`` (presumably the helper captures what the callable
    prints; see ``src.io_utils``).

    Args:
        df: Frame to save.
    """
    def _print_info():
        return df.info(verbose=True, show_counts=True)

    write_df_to_csv(pca_data_path, df)
    write_stdout_to_file(pca_data_info_path, _print_info)
    
# Write the manually selected features, then the PCA components, to ../data/final.
create_final_data_files(final_data_df)
create_pca_data_files(pca_df)