In [51]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, NuSVR, SVR
from sklearn.linear_model import SGDRegressor

In [52]:
# Helper functions

# Fixed seed reused wherever this notebook needs a random state, so runs are repeatable.
SEED=42

def read_file(filepath):
    """Load the CSV file located at ``filepath`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(filepath)
    return frame

def clean_data(dataset, columns: list):
    """Return a copy of ``dataset`` without the columns named in ``columns``.

    The input frame is left untouched; a missing column name raises ``KeyError``.
    """
    trimmed = dataset.drop(labels=columns, axis="columns")
    return trimmed
    
def encode(dataset, columns: list):
    """One-hot encode the given categorical columns of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to encode.
    columns : list
        Names of the categorical columns to replace with one-hot columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the original ``columns`` are removed and the
        one-hot encoded columns are appended on the right.
    """
    columns = list(columns)
    ohe = OneHotEncoder()
    encoded = ohe.fit_transform(dataset[columns]).toarray()
    # Keep the generated feature names in their own variable: the original
    # column names are still needed below to drop the raw categorical columns.
    encoded_columns = ohe.get_feature_names_out(columns)
    # Reuse the dataset's index so the concat aligns row-for-row even when the
    # frame does not carry a default RangeIndex (e.g. after filtering).
    encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=dataset.index)
    return pd.concat([dataset.drop(columns=columns), encoded_df], axis=1)

def normalize(dataset, columns: list):
    """Standardize the given columns of ``dataset`` with ``StandardScaler``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to scale.
    columns : list
        Names of the columns to standardize (any iterable of labels works,
        including ``dataset.columns``).

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column in ``columns`` is replaced by a
        scaled column named ``"<original>_norm"``, appended on the right.
    """
    columns = list(columns)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(dataset[columns])
    # Carry over the dataset's index so the concat below aligns row-for-row;
    # a fresh RangeIndex would introduce NaN rows for non-default indexes.
    scaled_df = pd.DataFrame(
        scaled,
        columns=[f"{col}_norm" for col in columns],
        index=dataset.index,
    )
    return pd.concat([dataset.drop(columns=columns), scaled_df], axis=1)


def split_dataset(dataset, columns_X, column_y):
    """Separate ``dataset`` into a feature frame and a target.

    ``columns_X`` lists the columns to *remove* when building the feature
    frame; ``column_y`` selects the target column(s). Returns ``(X, y)``.
    """
    target = dataset[column_y]
    features = dataset.drop(columns=columns_X)
    return features, target

def print_r2_score(y_test, y_pred):
    """Compute the R2 score of ``y_pred`` against ``y_test``, print it and return it."""
    r2_value = r2_score(y_test, y_pred)
    print(f"R2 Score: {r2_value} ")
    return r2_value

def print_mse(y_test, y_pred):
    """Compute the mean squared error of ``y_pred`` against ``y_test``, print it and return it."""
    mse_value = mean_squared_error(y_test, y_pred)
    print(f"MSE Score: {mse_value} ")
    return mse_value

def cross(model, X, y, cv=5, scoring="neg_mean_squared_error"):
    """Cross-validate ``model`` and print the per-fold scores, their mean and std.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    X, y : array-like
        Features and target used for cross-validation.
    cv : int, default 5
        Number of folds (forwarded to ``cross_val_score``).
    scoring : str or callable, default "neg_mean_squared_error"
        Scoring passed to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        The per-fold scores that were printed. With the default scoring these
        are RMSE values (square root of the negated ``neg_mean_squared_error``).
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    if isinstance(scoring, str) and scoring.startswith("neg_"):
        # "neg_*" scorers report negated losses; flip the sign back.
        scores = -scores
        if scoring == "neg_mean_squared_error":
            # Only an MSE can meaningfully be turned into an RMSE; applying the
            # square root to other metrics (e.g. r2) would yield NaN/garbage.
            scores = np.sqrt(scores)
    print(f"Scores: {scores}")
    print(f"Mean: {scores.mean()}")
    print(f"Standard Deviation: {scores.std()}")
    return scores


def reduce(model, X_train, X_test,y_train,step=1, cv=5):
    """Select features with cross-validated recursive feature elimination.

    An ``RFECV`` selector (scored with r2) is fitted on the training data only
    and then applied to both the training and the test features.
    Returns ``(X_train_reduced, X_test_reduced)``.
    """
    rfecv = RFECV(model, step=step, cv=cv, scoring='r2').fit(X_train, y_train)
    return rfecv.transform(X_train), rfecv.transform(X_test)

# Reading the data

In [53]:
# Seed NumPy's global random generator with the notebook-wide SEED.
np.random.seed(SEED)
def fetch_data(filepath="../FM 2023.csv"):
    """Load the players CSV and drop the columns that are not used for modelling.

    Parameters
    ----------
    filepath : str, default "../FM 2023.csv"
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded dataset without the columns listed below.
    """
    # Columns removed before training (each name listed once).
    dropped_columns = [
        "Rental club",
        "Salary",
        "Values",
        "Race",
        "UID",
        "Date of birth",
        "Colour of skin",
        "RCA",
        "Club",
        "Nationality",
        "Name",
        "Position",
        "Current reputation",
        "Domestic reputation",
        "World reputation",
    ]
    dataset = read_file(filepath)
    return clean_data(dataset, columns=dropped_columns)

# Training a Random Forest Regressor

In [68]:
# Prepare the data for the random forest regressor: "ca" is the target, and both
# "ca" and "pa" are kept out of the feature matrix. 30% of the rows are held out.
dataset = fetch_data()
y = dataset["ca"]
X = dataset.drop(columns=["ca", "pa"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

In [69]:
# Set `run = True` to (re)train the forest; the cell is skipped otherwise.
run = False
if run:
    # random_state=SEED makes the forest reproducible on its own, instead of
    # depending on how much of the global NumPy RNG earlier cells consumed.
    model = RandomForestRegressor(bootstrap=True, criterion="squared_error", max_depth=None,
                                max_features=6, max_leaf_nodes=None, min_samples_leaf=1,
                                min_samples_split=2, min_weight_fraction_leaf=0.0,
                                n_estimators=30, n_jobs=1, oob_score=False, random_state=SEED,
                                verbose=0, warm_start=False
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Hold-out metrics first, then 10-fold cross-validation on the full X, y.
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(model, X, y, cv=10)

MSE Score: 49.37435243603224 
R2 Score: 0.852451293315564 
Scores: [20.16986565  6.96620807  4.28310673  3.76532049  3.8685325   4.33958523
  5.36204709  6.03787454  8.96522576 23.80515406]
Mean: 8.756292012644952
Standard Deviation: 6.83530052799836


# Normalizing the dataset to train SVM models

In [66]:
# Standardize every column of the freshly loaded dataset (features and targets alike);
# normalize() renames each scaled column to "<name>_norm".
# NOTE(review): the scaler is fitted on the whole dataset before the train/test split,
# so test-set statistics influence the training data — consider fitting on the
# training split only.
dataset = fetch_data()
dataset = normalize(dataset, columns=dataset.columns)

y = dataset["ca_norm"]
X = dataset.drop(columns=["ca_norm", "pa_norm"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)



# Training Linear SVR

In [57]:
# Set `run = True` to (re)train the model; the cell is skipped otherwise.
run = False
if run:
    # random_state=SEED keeps the solver's internal shuffling reproducible.
    lin_svr = LinearSVR(max_iter=100000, random_state=SEED)

    lin_svr.fit(X_train, y_train)

    y_pred = lin_svr.predict(X_test)

    # Hold-out metrics first, then cross-validation on the full X, y
    # (uses cross()'s default number of folds).
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(lin_svr, X, y)

# Training NuSVR

In [61]:
# Set `run = True` to (re)train the model; the cell is skipped otherwise.
run = False
if run:
    nu_svr = NuSVR().fit(X_train, y_train)
    y_pred = nu_svr.predict(X_test)

    # Hold-out metrics first, then 10-fold cross-validation on the full X, y.
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(nu_svr, X, y, cv=10)

MSE Score: 0.04660698664542275 
R2 Score: 0.9552816097530203 
Scores: [0.64747532 0.15859356 0.14022294 0.1355774  0.13189186 0.14505592
 0.14649644 0.16860919 0.24662369 0.85559248]
Mean: 0.27761388105255996
Standard Deviation: 0.24349824058068015


# Training SVR

In [59]:
# Set `run = True` to (re)train the model; the cell is skipped otherwise.
run = False
if run:
    svr = SVR().fit(X_train, y_train)
    y_pred = svr.predict(X_test)

    # Hold-out metrics first, then 10-fold cross-validation on the full X, y.
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(svr, X, y, cv=10)

MSE Score: 0.04994998297217503 
R2 Score: 0.9520740774688321 
Scores: [0.71036781 0.16600078 0.14655577 0.13930497 0.13700243 0.14891674
 0.15191355 0.17240578 0.25081231 0.9250833 ]
Mean: 0.2948363447131966
Standard Deviation: 0.26762132364100627


# Training SGDRegressor

In [60]:
# Set `run = True` to (re)train the model; the cell is skipped otherwise.
run = False
if run:
    # SGD shuffles the training data on every epoch; fixing random_state makes
    # the fitted model and the reported metrics reproducible.
    sgd = SGDRegressor(random_state=SEED)
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    # Hold-out metrics first, then 10-fold cross-validation on the full X, y.
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(sgd, X, y, cv=10)

MSE Score: 0.08921468900994277 
R2 Score: 0.914400445811672 
Scores: [0.46536441 0.26520274 0.26960462 0.25351387 0.25627827 0.25601392
 0.26685587 0.26724021 0.32512856 0.42065483]
Mean: 0.30458572914993254
Standard Deviation: 0.07255199621095787
