#### Imports & Constants

In [71]:
import sys
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

sys.path.append(pathlib.Path("..").resolve().as_posix())
pd.set_option('display.max_columns', 110)
pd.set_option('display.max_rows', 1000)

# for pca
encoded_df = pd.read_csv(pathlib.Path("..") / "data" / "encoded" / "encoded_data.csv")

# for manual feature selection
processed_df = pd.read_csv(pathlib.Path("..") / "data" / "processed" / "processed_data.csv")

##### Manual Feature Selection

In [72]:
""" 
    Combine height and weight into 1 feature (Height_Weight) and insert it to the first index
    add column to first index and remove the other 2
"""
MAIN_FIFA_CARD_FEATURES = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
ATTACKING_FEATURES = ['attacking_finishing', 'attacking_volleys', 'attacking_heading_accuracy', 'attacking_crossing']
SKILL_FEATURES = ['skill_fk_accuracy', 'skill_ball_control', 'skill_dribbling', 'skill_curve', 'skill_long_passing']
MOVE_FEATURES = ['movement_agility', 'movement_reactions', 'movement_balance', 'movement_acceleration', 'movement_sprint_speed']
POWER_FEATURES = ['power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots']
MENTAL_FEATURES = ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties']
DEFENDING_FEATURES = ['defending_standing_tackle', 'defending_sliding_tackle']
EXTRA_FEATURES = [*ATTACKING_FEATURES, *SKILL_FEATURES, *MOVE_FEATURES, *POWER_FEATURES, *MENTAL_FEATURES, *DEFENDING_FEATURES]

def create_height_weight_feature(df: pd.DataFrame):
    df.insert(0, "height_weight", df["height_cm"] + df["weight_kg"] / 2)
    df.drop(["height_cm", "weight_kg"], axis=1, inplace=True)
    return df

def get_relashionship_graph(df: pd.DataFrame) -> dict:
    
    relashionship_graph = defaultdict(dict)
    for main_feature in MAIN_FIFA_CARD_FEATURES:
        for extra_feature in EXTRA_FEATURES:
            corr = df[extra_feature].corr(df[main_feature])
            if abs(corr) > 0.7:
                relashionship_graph[main_feature][extra_feature] = corr    
    return relashionship_graph

# remove features that have high correlation with other features
def remove_features_with_high_correlation(df: pd.DataFrame, relashionship_graph: dict) -> pd.DataFrame:
    correlated_features_groups =  [list(correlated.keys()) for correlated in relashionship_graph.values()]
    correlated_features = [item for sublist in correlated_features_groups for item in sublist]
    correlated_features = list(set(correlated_features))
    df.drop(correlated_features, axis=1, inplace=True)

relationship_graph = get_relashionship_graph(encoded_df)
remove_features_with_high_correlation(processed_df, relationship_graph)
create_height_weight_feature(processed_df)
processed_df.head()

Unnamed: 0,height_weight,weak_foot,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_heading_accuracy,attacking_short_passing,movement_reactions,movement_balance,power_jumping,power_stamina,defending_marking_awareness,work_rate_High/High,work_rate_High/Low,work_rate_High/Medium,work_rate_Low/High,work_rate_Low/Low,work_rate_Low/Medium,work_rate_Medium/High,work_rate_Medium/Low,work_rate_Medium/Medium,preferred_foot_Left,preferred_foot_Right,position
0,0.436003,0.5,0.75,0.947368,0.9375,0.90411,1.0,0.168831,0.553846,0.710843,0.92,0.972222,0.975,0.688312,0.74026,0.178571,0,0,0,0,0,0,0,1,0,1,0,0
1,0.850252,0.75,1.0,0.947368,0.9875,0.835616,0.932432,0.233766,0.8,0.891566,0.826667,0.916667,0.575,0.961039,0.896104,0.142857,0,1,0,0,0,0,0,0,0,0,1,1
2,0.754098,0.25,0.75,0.947368,0.9,0.863014,0.945946,0.233766,0.569231,0.457831,0.88,0.902778,0.925,0.532468,0.753247,0.22619,0,1,0,0,0,0,0,0,0,1,0,2
3,1.165511,0.75,0.75,0.723684,0.9625,0.835616,0.864865,0.25974,0.907692,0.771084,0.853333,0.847222,0.3,0.675325,0.753247,0.178571,0,0,0,0,0,0,0,1,0,0,1,0
4,0.43884,0.75,0.75,0.710526,0.725,0.945205,0.932432,0.584416,0.553846,0.506024,0.973333,0.916667,0.8625,0.441558,0.753247,0.559524,0,0,1,0,0,0,0,0,0,0,1,3


##### PCA Dimensionality Reduction

In [73]:

def standardize_encoded_X(df: pd.DataFrame) -> pd.DataFrame:
    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(df.iloc[:, :-1])
    return pd.DataFrame(scaled_df, columns=df.columns[:-1])
    
def apply_pca(df: pd.DataFrame, n_components: int = 6) -> np.ndarray:
    pca = PCA(n_components=n_components)
    pca.fit(df)
    return pca.transform(df)

n_components = 5
standardized_X = standardize_encoded_X(encoded_df)
pca_df = apply_pca(standardized_X, n_components)
pca_df = pd.DataFrame(pca_df, columns=[f"PC{i}" for i in range(1, n_components+1)])

pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,12.905590,-0.138007,0.198935,-0.273083,2.274317
1,11.786134,-2.284934,3.994539,-3.616807,0.112636
2,11.444766,-0.439604,0.521169,-0.584885,2.909004
3,9.345703,-3.700452,6.915818,-1.306405,0.587441
4,9.592615,-3.446307,-1.812715,1.535232,-1.583321
...,...,...,...,...,...
143608,-3.274679,6.886240,0.854611,1.774255,-0.596945
143609,-8.988643,3.845777,0.185723,0.798543,2.723119
143610,-4.392515,3.547318,-0.765882,-0.242576,-0.337837
143611,-3.530437,7.257620,1.262333,1.280247,-1.081642


In [74]:
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# # use random forest regressor
# rf = RandomForestRegressor(n_estimators=70, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(pca_df, encoded_df['position'], test_size=0.2, random_state=42, shuffle=True)

# rf.fit(X_train, y_train)
# score = rf.score(X_test, y_test)
# print(f'Random Forest Regressor score: {score}')

In [75]:
# use rf classification
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=70, random_state=42)
# rf.fit(X_train, y_train)
# score = rf.score(X_test, y_test)
# print(f'Random Forest Classifier score: {score}')