#### Imports & Constants

In [91]:
import sys
import pathlib
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Put the resolved parent directory on the import path.
sys.path.append(pathlib.Path("..").resolve().as_posix())
pd.set_option('display.max_columns', 110)
pd.set_option('display.max_rows', 1000)

# for pca
# Kept under its own name: previously this frame was assigned to `processed_df`
# and immediately overwritten by the read below, so it was never usable.
encoded_df = pd.read_csv(pathlib.Path("..") / "data" / "encoded" / "encoded_data.csv")

# for manual feature selection
processed_df = pd.read_csv(pathlib.Path("..") / "data" / "processed" / "processed_data.csv")

##### Manual Feature Selection

In [92]:
""" 
    Combine height and weight into 1 feature (Height_Weight) and insert it to the first index
    add column to first index and remove the other 2
"""
# Main features: every extra feature below is correlated against each of these.
MAIN_FIFA_CARD_FEATURES = [
    'pace',
    'shooting',
    'passing',
    'dribbling',
    'defending',
    'physic',
]

# Extra features, grouped by their column-name prefix.
ATTACKING_FEATURES = [
    'attacking_finishing',
    'attacking_volleys',
    'attacking_heading_accuracy',
    'attacking_crossing',
]
SKILL_FEATURES = [
    'skill_fk_accuracy',
    'skill_ball_control',
    'skill_dribbling',
    'skill_curve',
    'skill_long_passing',
]
MOVE_FEATURES = [
    'movement_agility',
    'movement_reactions',
    'movement_balance',
    'movement_acceleration',
    'movement_sprint_speed',
]
POWER_FEATURES = [
    'power_shot_power',
    'power_jumping',
    'power_stamina',
    'power_strength',
    'power_long_shots',
]
MENTAL_FEATURES = [
    'mentality_aggression',
    'mentality_interceptions',
    'mentality_positioning',
    'mentality_vision',
    'mentality_penalties',
]
DEFENDING_FEATURES = [
    'defending_standing_tackle',
    'defending_sliding_tackle',
]

# Flat list of every extra feature, in group order.
EXTRA_FEATURES = (
    ATTACKING_FEATURES
    + SKILL_FEATURES
    + MOVE_FEATURES
    + POWER_FEATURES
    + MENTAL_FEATURES
    + DEFENDING_FEATURES
)

def create_height_weight_feature(df: pd.DataFrame):
    """Replace `height_cm` and `weight_kg` with one `height_weight` column.

    The new column is inserted at position 0 and the two source columns are
    dropped. The frame is modified in place and also returned.

    NOTE(review): the value is `height_cm + (weight_kg / 2)` because `/` binds
    tighter than `+`. If the mean `(height_cm + weight_kg) / 2` was intended,
    this needs parentheses -- confirm with the author.
    """
    combined = df["height_cm"] + df["weight_kg"] / 2
    df.insert(0, "height_weight", combined)
    df.drop(columns=["height_cm", "weight_kg"], inplace=True)
    return df

def get_relashionship_graph(
    df: pd.DataFrame,
    threshold: float = 0.80,
    main_features=None,
    extra_features=None,
) -> dict:
    """Map each main feature to the extra features strongly correlated with it.

    Args:
        df: frame containing every column named in both feature lists.
        threshold: an extra feature is recorded only when the absolute value
            of its correlation with the main feature is strictly greater
            than this value.
        main_features: columns to correlate against; defaults to
            MAIN_FIFA_CARD_FEATURES.
        extra_features: candidate columns; defaults to EXTRA_FEATURES.

    Returns:
        A defaultdict of the form {main_feature: {extra_feature: corr}}.
        Main features with no match above the threshold have no entry.

    Raises:
        KeyError: if a listed feature is not a column of `df`.
    """
    if main_features is None:
        main_features = MAIN_FIFA_CARD_FEATURES
    if extra_features is None:
        extra_features = EXTRA_FEATURES

    relashionship_graph = defaultdict(dict)
    for main_feature in main_features:
        # Look the main column up once per outer iteration, not once per pair.
        main_column = df[main_feature]
        for extra_feature in extra_features:
            corr = df[extra_feature].corr(main_column)
            # A NaN correlation compares False here and is therefore skipped.
            if abs(corr) > threshold:
                relashionship_graph[main_feature][extra_feature] = corr
    return relashionship_graph

# remove features that have high correlation with other features
def remove_features_with_high_correlation(df: pd.DataFrame, relashionship_graph: dict) -> pd.DataFrame:
    """Drop every feature listed anywhere in the relationship graph.

    Args:
        df: frame to prune; modified in place.
        relashionship_graph: mapping of the form {key: {feature: corr}}.
            Only the inner keys (feature names) are used.

    Returns:
        The same `df` object, so the declared return type is honoured.
        Callers that ignore the return value still see the in-place change.

    Raises:
        KeyError: if a listed feature is not a column of `df`.
    """
    # A feature can appear under several keys; the set removes duplicates
    # so each column is dropped only once.
    correlated_features = {
        feature
        for correlated in relashionship_graph.values()
        for feature in correlated
    }
    df.drop(columns=list(correlated_features), inplace=True)
    return df

def remove_features_with_low_corr_to_position(
    df: pd.DataFrame,
    threshold: float = 0.1,
    target: str = 'position',
) -> pd.DataFrame:
    """Drop columns whose correlation with the target column is weak.

    Args:
        df: frame to prune; modified in place. Must contain `target`.
        threshold: a column is dropped when the absolute value of its
            correlation with the target is strictly below this value.
        target: name of the reference column; it is never dropped.

    Returns:
        The same `df` object, so the declared return type is honoured.

    Raises:
        KeyError: if `target` is not a column of `df`.
    """
    target_column = df[target]
    # Collect first and drop once, instead of dropping inside the loop.
    # A NaN correlation compares False, so such columns are kept.
    weak_features = [
        feature
        for feature in df.columns
        if feature != target and abs(df[feature].corr(target_column)) < threshold
    ]
    df.drop(columns=weak_features, inplace=True)
    return df

# Build the correlation graph, then prune `processed_df` in place in two passes.
# NOTE: both helpers mutate `processed_df`; if any column is dropped here,
# re-running this cell without reloading the frame raises KeyError on the
# missing column.
relationship_graph = get_relashionship_graph(processed_df)
remove_features_with_high_correlation(processed_df, relationship_graph)
remove_features_with_low_corr_to_position(processed_df)

# The former mid-cell `processed_df.head()` was removed: only the last
# expression of a cell is rendered, so its result was silently discarded.
print(processed_df.shape)

# view correlation between all features and position
processed_df.corr()['position'].sort_values(ascending=False)

(143613, 28)


position                       1.000000
defending                      0.768655
defending_marking_awareness    0.750503
mentality_aggression           0.384254
physic                         0.291674
height_cm                      0.225397
work_rate_Low/Medium           0.178962
weight_kg                      0.176819
work_rate_Medium/High          0.173468
work_rate_Low/High             0.172157
power_jumping                  0.143098
attacking_heading_accuracy     0.142974
preferred_foot_Left            0.121252
work_rate_High/Medium         -0.113901
preferred_foot_Right          -0.121252
work_rate_High/Low            -0.189278
passing                       -0.200444
weak_foot                     -0.202127
work_rate_Medium/Low          -0.210952
movement_balance              -0.275652
skill_fk_accuracy             -0.275804
pace                          -0.331762
skill_curve                   -0.357610
movement_agility              -0.390077
skill_moves                   -0.397571


##### PCA Dimensionality Reduction

In [93]:

def standardize_encoded_X(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize every column except the last one with StandardScaler.

    Args:
        df: input frame. The last column is excluded from scaling and from
            the result (presumably it holds the target -- verify against
            the caller's column order).

    Returns:
        A new DataFrame of scaled values with the same column names and the
        same row index as the scaled slice of `df`. Keeping the index lets
        rows still be aligned with the original frame afterwards.
    """
    features = df.iloc[:, :-1]
    scaled_values = StandardScaler().fit_transform(features)
    return pd.DataFrame(scaled_values, columns=features.columns, index=features.index)
    
def apply_pca(df: pd.DataFrame, n_components: int = 6, random_state: int = 42) -> np.ndarray:
    """Fit a PCA on `df` and return `df` projected onto its components.

    Args:
        df: numeric input data. It is not scaled here.
        n_components: number of principal components to keep.
        random_state: seed forwarded to PCA. scikit-learn only uses it when
            a randomized or arpack SVD solver is selected; fixing it makes
            the projection reproducible across runs in that case.

    Returns:
        Array with one row per input row and `n_components` columns.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit(df).transform(df)

n_components = 5

# NOTE(review): `processed_df` here is the frame loaded for manual feature
# selection and pruned in place by the earlier cell, not the frame read from
# encoded_data.csv -- confirm this is the intended PCA input.
standardized_X = standardize_encoded_X(processed_df)

# Keep the raw ndarray under its own name so `pca_df` always refers to a DataFrame.
pca_components = apply_pca(standardized_X, n_components)
pca_df = pd.DataFrame(
    pca_components,
    columns=[f"PC{i}" for i in range(1, n_components + 1)],
    index=standardized_X.index,  # keep rows aligned with the scaled input
)

pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,8.234458,2.110773,0.137008,2.231613,-0.259536
1,6.160987,4.918875,3.949274,-0.018914,-2.834170
2,7.019848,1.721061,0.498549,2.935235,-0.860064
3,3.546877,5.813705,5.711696,1.488916,-0.105258
4,5.962188,2.357365,-0.794655,-1.334058,1.230385
...,...,...,...,...,...
143608,-0.427009,-4.927318,1.608081,0.197691,1.431929
143609,-4.692150,-3.445982,-0.337696,2.879404,0.110760
143610,-1.919404,-3.423201,-0.097032,-0.380618,-0.412776
143611,-0.548311,-4.811757,1.752746,-0.234245,0.150080


In [94]:
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# # use random forest regressor
# rf = RandomForestRegressor(n_estimators=70, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(pca_df, encoded_df['position'], test_size=0.2, random_state=42, shuffle=True)

# rf.fit(X_train, y_train)
# score = rf.score(X_test, y_test)
# print(f'Random Forest Regressor score: {score}')

In [95]:
# use rf classification
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=70, random_state=42)
# rf.fit(X_train, y_train)
# score = rf.score(X_test, y_test)
# print(f'Random Forest Classifier score: {score}')