In [120]:
import numpy as np
import pandas as pd
import altair as alt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [122]:

#Loading clean data set
fifa_data = pd.read_csv("./cleaned/cleaned_data.csv", index_col=0)

X = fifa_data.drop(columns='Wage')
y = fifa_data[['Wage']]
#Identifying the categorical and numeric columns
def get_preprocessing_features(x):
    """ Retrieves a list of categorical and numerical
    features using fifa train data set (X) 
    using data type. 

    Parameters
    ----------
    x : DateFrame
        Fifa Training data.

    Returns
    -------
    categorical and numerical features: tuple

    """
    d_types = x.dtypes
    categorical = []
    numerical = []
    for data_type, features in zip(d_types, d_types.index):
        if data_type == "object":
            categorical.append(features)
        else:
            numerical.append(features)

    return categorical, numerical

#Setting-up data transformer
categorical, numerical = get_preprocessing_features(X)

preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), numerical),
        ('ohe', OneHotEncoder(drop="first"), categorical)])

# Splitting data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

# Apply data transformations and convert to dataframe
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),
                       index=X_train.index,
                       columns=(numerical +
                                list(preprocessor.named_transformers_['ohe']
                                     .get_feature_names(categorical))))
X_test = pd.DataFrame(preprocessor.transform(X_test),
                      index=X_test.index,
                      columns=X_train.columns)

ValueError: Shape of passed values is (9521, 1), indices imply (9521, 10054)

In [101]:
#Fitting Logistic Regression

lgr = LogisticRegression(solver='lbfgs')
lgr.fit(X_train, y_train);

Unnamed: 0,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Special,Preferred.Foot,...,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release.Clause,BMI,Work.Rate.Former,Work.Rate.Later
1,L. Messi,31,Argentina,94,94,FC Barcelona,110500,565,2202,Left,...,26.0,6.0,11.0,15.0,14.0,8.0,226500,24.9002,Medium,Medium
2,Cristiano Ronaldo,33,Portugal,94,94,Juventus,77000,405,2228,Right,...,23.0,7.0,11.0,15.0,14.0,11.0,127100,23.493243,High,Low
3,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,118500,290,2143,Right,...,33.0,9.0,9.0,15.0,15.0,11.0,228100,22.148708,High,Medium
5,K. De Bruyne,27,Belgium,91,92,Manchester City,102000,355,2281,Right,...,51.0,15.0,13.0,5.0,10.0,13.0,196400,21.476294,High,High
6,E. Hazard,27,Belgium,91,91,Chelsea,93000,340,2142,Right,...,22.0,11.0,12.0,6.0,8.0,8.0,172100,24.781358,High,Medium


In [None]:

# Most important features
features = X_train.columns  # Get the feature labels from the training data
coefs = abs(lgr.coef_.flatten())  # Get the absolute value of the coefficients

# Order the coefficients in descending order
coefs_ordered = np.argsort(-coefs)

# Retrieve the top 5 and store in a list
top5 = list(features[coefs_ordered][:5])
print(f"The top 5 features are: {top5}")