In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, NuSVR, SVR
from sklearn.linear_model import SGDRegressor

In [21]:
# Helper functions

# Single seed shared by the global NumPy RNG and every train/test split below.
SEED = 42

def read_file(filepath):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filepath : path or buffer accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with ``pandas.read_csv`` defaults.
    """
    frame = pd.read_csv(filepath)
    return frame

def clean_data(dataset, columns: list):
    """Return a copy of ``dataset`` without the listed columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune; it is not modified.
    columns : list
        Column labels to remove. A label that is not present raises
        ``KeyError`` (pandas' default ``errors="raise"``).

    Returns
    -------
    pandas.DataFrame
        New frame containing the remaining columns.
    """
    pruned = dataset.drop(labels=columns, axis="columns")
    return pruned
    
def encode(dataset, columns: list):
    """One-hot encode ``columns`` and return a new DataFrame.

    The listed categorical columns are replaced by one indicator column per
    category, named by ``OneHotEncoder.get_feature_names_out``. All other
    columns are kept as they are and the input frame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to encode.
    columns : list
        Labels of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the one-hot indicator columns.
    """
    ohe = OneHotEncoder()
    encoded = ohe.fit_transform(dataset[columns]).toarray()
    # Keep the generated names in their own variable: rebinding ``columns``
    # here would make the final drop remove the new indicator columns instead
    # of the original categorical ones.
    encoded_names = ohe.get_feature_names_out(columns)
    # Reuse the dataset's index so the column-wise concat lines rows up by
    # label instead of introducing NaN rows for a non-default index.
    encoded_df = pd.DataFrame(encoded, columns=encoded_names, index=dataset.index)
    remaining = dataset.drop(columns=columns)
    return pd.concat([remaining, encoded_df], axis=1)

def normalize(dataset, columns: list):
    """Standardize ``columns`` and return a new DataFrame.

    Each listed column is scaled with ``StandardScaler`` (fit on the rows of
    ``dataset`` itself) and replaced by a column named ``<original>_norm``.
    All other columns are kept as they are and the input frame is not
    modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to scale.
    columns : list
        Labels of the numeric columns to scale; any iterable of labels
        (for example ``dataset.columns``) is accepted.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the ``*_norm`` columns.
    """
    columns = list(columns)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(dataset[columns])
    # Reuse the dataset's index so the column-wise concat lines rows up by
    # label; a fresh RangeIndex would misalign (and add NaN rows) whenever
    # rows were filtered or re-indexed beforehand.
    norm_df = pd.DataFrame(
        scaled,
        columns=[f"{col}_norm" for col in columns],
        index=dataset.index,
    )
    remaining = dataset.drop(columns=columns)
    return pd.concat([remaining, norm_df], axis=1)


def split_dataset(dataset, columns_X, column_y):
    """Split a frame into a feature matrix and a target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; it is not modified.
    columns_X : list
        Columns to *exclude* from the feature matrix (despite the name, these
        are the labels that get dropped, not the ones that are kept).
    column_y : label
        Column (or list of columns) returned as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``dataset`` without ``columns_X`` and ``y``
        is ``dataset[column_y]``.
    """
    features = dataset.drop(columns=columns_X)
    target = dataset[column_y]
    return features, target

def print_r2_score(y_test, y_pred):
    """Print and return the R² score as a percentage.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        ``r2_score(y_test, y_pred)`` multiplied by 100. This is a coefficient
        of determination, not a classification accuracy.
    """
    r2_percent = 100 * r2_score(y_test, y_pred)
    print(f"R2 Score: {r2_percent} %")
    return r2_percent


# Reading the data

In [22]:
# Seed the global NumPy RNG, which scikit-learn estimators fall back to when
# they are created without an explicit random_state.
np.random.seed(SEED)

# Columns removed before modelling. "Race" was listed twice in the original
# list; listing it once drops exactly the same column.
columns_to_drop = [
    "Rental club",
    "Salary",
    "Values",
    "Race",
    "UID",
    "Date of birth",
    "Colour of skin",
    "RCA",
    "Club",
    "Nationality",
    "Name",
    "Position",
    "Current reputation",
    "Domestic reputation",
    "World reputation",
]

dataset = read_file("../FM 2023.csv")
dataset = clean_data(dataset, columns=columns_to_drop)

# Training a Random Forest Regressor

In [23]:
# Earlier hyperparameter configurations, kept for reference:
#baseline = RandomForestRegressor(n_estimators=3000, n_jobs=-1, max_depth=50, max_leaf_nodes=55)
#without_reputation_baseline = RandomForestRegressor(n_estimators=2000, n_jobs=-1, max_depth=10)

# Features are every remaining column except "ca" and "pa"; the target is "ca".
X, y = split_dataset(dataset, columns_X=["ca", "pa"], column_y="ca")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
)

# random_state pins the bootstrap sampling and feature selection, so re-running
# this cell on its own reproduces the same forest instead of depending on how
# far the global NumPy RNG has advanced. A score recorded before this change
# may differ slightly after a re-run.
model = RandomForestRegressor(
    n_estimators=2000,
    n_jobs=-1,
    max_depth=10,
    random_state=SEED,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

acc = print_r2_score(y_test, y_pred)

R2 Score: 84.41210126314476 %


# Normalizing the dataset to train SVM models

In [24]:
# Standardize every column (features and the "ca"/"pa" targets alike); each one
# is replaced by a "<name>_norm" column.
# NOTE(review): this cell overwrites `dataset`, so running it twice produces
# "<name>_norm_norm" columns and the cells below stop finding "ca_norm".
# NOTE(review): the scaler is fit on all rows before the train/test split in
# the next cell, so test-set statistics influence the scaling; consider
# fitting the scaler on the training split only.
dataset = normalize(dataset, columns=dataset.columns)

# Training Linear SVR

In [25]:

# Features are every standardized column except "ca_norm" and "pa_norm"; the
# target is "ca_norm". The same split is reused by the model cells below.
X, y = split_dataset(dataset, columns_X=["ca_norm", "pa_norm"], column_y="ca_norm")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
)

# random_state pins the solver's data shuffling, so re-running this cell on its
# own reproduces the same fit instead of depending on the global NumPy RNG.
# A score recorded before this change may differ slightly after a re-run.
lin_svr = LinearSVR(random_state=SEED)
lin_svr.fit(X_train, y_train)

y_pred = lin_svr.predict(X_test)

acc = print_r2_score(y_test, y_pred)


R2 Score: 91.2975482546584 %




# Training NuSVR

In [26]:
# Fit a Nu-SVR with scikit-learn's default hyperparameters on the standardized
# training split, then score it on the held-out split.
nu_svr = NuSVR().fit(X_train, y_train)
y_pred = nu_svr.predict(X_test)
acc = print_r2_score(y_test, y_pred)

R2 Score: 95.52816097530203 %


# Training SVR

In [27]:
# Fit an epsilon-SVR with scikit-learn's default hyperparameters on the
# standardized training split, then score it on the held-out split.
svr = SVR().fit(X_train, y_train)
y_pred = svr.predict(X_test)
acc = print_r2_score(y_test, y_pred)

R2 Score: 95.2074077468832 %


# Training SGDRegressor

In [28]:
# SGD shuffles the training data every epoch; random_state pins that shuffling
# so re-running this cell on its own reproduces the same fit instead of
# depending on the global NumPy RNG. A score recorded before this change may
# differ slightly after a re-run.
sgd = SGDRegressor(random_state=SEED)
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

acc = print_r2_score(y_test, y_pred)

R2 Score: 91.85531133221053 %
