In [1]:
import numpy as np
import pandas as pd
import altair as alt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [8]:

# Read the cleaned FIFA data; the first CSV column is used as the row index.
fifa_data = pd.read_csv("../data/cleaned/cleaned_data.csv", index_col=0)

# Features are every column except 'Wage'; the target is kept as a
# single-column DataFrame holding 'Wage'.
X = fifa_data.drop('Wage', axis=1)
y = fifa_data.loc[:, ['Wage']]
#Identifying the categorical and numeric columns
def get_preprocessing_features(x):
    """Split the columns of a feature frame into categorical and numerical.

    A column is treated as numerical when pandas reports a numeric dtype
    for it (integers, floats, booleans, including nullable variants).
    Every other column (object, string, category, ...) is treated as
    categorical, so it is never sent to a numeric scaler by mistake.

    Parameters
    ----------
    x : DataFrame
        Feature data (e.g. the FIFA training features).

    Returns
    -------
    categorical, numerical : tuple of (list, list)
        Column labels of the categorical features and of the numerical
        features, each in the same order as they appear in ``x``.

    """
    categorical = []
    numerical = []
    for feature, data_type in x.dtypes.items():
        # Checking for "numeric" rather than comparing against "object"
        # keeps category / string dtypes out of the numerical list.
        if pd.api.types.is_numeric_dtype(data_type):
            numerical.append(feature)
        else:
            categorical.append(feature)

    return categorical, numerical

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Column-wise preprocessing: standardise the numerical columns and one-hot
# encode the categorical ones (first level of each category dropped).
categorical, numerical = get_preprocessing_features(X)
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), numerical),
        ("ohe", OneHotEncoder(drop="first"), categorical),
    ]
)

# Fit the transformer on the training split only and display the result.
# The output columns are the numerical features followed by the one-hot
# columns; the encoded column names can be recovered from
# preprocessor.named_transformers_['ohe'].
X_train_processed = preprocessor.fit_transform(X_train)
X_train_processed

<9521x10054 sparse matrix of type '<class 'numpy.float64'>'
	with 478359 stored elements in Compressed Sparse Row format>

In [5]:
# Fitting Logistic Regression
#
# The model has to be trained on the preprocessed matrix: the raw X_train
# still holds unscaled values and any non-numeric columns, which the
# estimator cannot consume. y_train is a single-column DataFrame, so it is
# flattened to the 1-D target shape scikit-learn expects.

lgr = LogisticRegression(solver='lbfgs')
lgr.fit(X_train_processed, y_train.to_numpy().ravel());

In [None]:

# Most important features
#
# Feature labels must match the columns the model was trained on, i.e. the
# ColumnTransformer output: numerical columns first, then the one-hot
# columns produced by the fitted encoder.
if categorical:
    ohe = preprocessor.named_transformers_['ohe']
    # Newer scikit-learn exposes get_feature_names_out; older releases only
    # have get_feature_names.
    ohe_names = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
    encoded = list(ohe_names(categorical))
else:
    encoded = []
features = pd.Index(numerical + encoded)

# coef_ has shape (n_classes, n_features) for multiclass problems and
# (1, n_features) for binary ones. Collapse it to one score per feature by
# taking the largest absolute coefficient across classes (identical to the
# plain absolute value in the binary case).
coefs = np.abs(lgr.coef_).max(axis=0)

if len(features) != len(coefs):
    raise ValueError(
        f"Got {len(features)} feature labels for {len(coefs)} coefficients; "
        "the model must be fitted on the preprocessor output."
    )

# Order the coefficients in descending order
coefs_ordered = np.argsort(-coefs)

# Retrieve the top 5 and store in a list
top5 = list(features[coefs_ordered][:5])
print(f"The top 5 features are: {top5}")