In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import  SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import  NuSVR, SVR
from sklearn.linear_model import SGDRegressor

In [3]:
# Helper functions

# Seed shared by the train/test split, the random forest, the randomized
# search and the global NumPy RNG.
SEED = 42

def read_file(filepath):
    """Read the CSV file at ``filepath`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(filepath)
    return frame

def clean_data(dataset, columns: list):
    """Return a new DataFrame equal to ``dataset`` without the listed columns.

    ``dataset`` itself is left untouched; a missing column name raises
    ``KeyError`` (pandas' default for ``drop``).
    """
    trimmed = dataset.drop(columns, axis=1)
    return trimmed
    
def encode(dataset, columns: list):
    """One-hot encode ``columns`` and return the dataset with them replaced.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the categorical columns.
    columns : list
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by one indicator column per
        category, named by ``OneHotEncoder.get_feature_names_out``.
    """
    ohe = OneHotEncoder()
    # The encoder returns a sparse matrix by default; densify it for pandas.
    encoded = ohe.fit_transform(dataset[columns]).toarray()
    # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows (and inject NaNs) whenever
    # ``dataset`` was filtered, shuffled or otherwise re-indexed.
    encoded_df = pd.DataFrame(
        encoded,
        columns=ohe.get_feature_names_out(columns),
        index=dataset.index,
    )
    combined = pd.concat([dataset, encoded_df], axis=1)
    return combined.drop(columns=columns)

def normalize(dataset, columns: list):
    """Standardize ``columns`` and return the dataset with them replaced.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the numeric columns.
    columns : list or pandas.Index
        Names of the columns to scale with ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the scaled ones, each renamed to
        ``<original name>_norm``.

    Notes
    -----
    The scaler is fitted on every row passed in, so calling this before a
    train/test split lets test rows influence the scaling statistics.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(dataset[columns])
    # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows (and inject NaNs) whenever
    # ``dataset`` does not carry the default 0..n-1 index.
    scaled_df = pd.DataFrame(
        scaled,
        columns=[f"{col}_norm" for col in columns],
        index=dataset.index,
    )
    combined = pd.concat([dataset, scaled_df], axis=1)
    return combined.drop(columns=columns)


def split_dataset(dataset, columns_X, column_y):
    """Split ``dataset`` into a feature table and a target.

    Note that ``columns_X`` lists the columns to *remove* from the feature
    table (not the ones to keep); ``column_y`` selects the target.

    Returns a ``(features, target)`` tuple.
    """
    features = dataset.drop(columns=columns_X)
    target = dataset[column_y]
    return features, target

def print_r2_score(y_test, y_pred):
    """Print the R^2 score of ``y_pred`` against ``y_test`` and return it."""
    r2 = r2_score(y_test, y_pred)
    print(f"R2 Score: {r2} ")
    return r2

def print_mse(y_test, y_pred):
    """Print the mean squared error of ``y_pred`` against ``y_test`` and return it."""
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE Score: {mse} ")
    return mse

def cross(model, X, y, cv=5, scoring="neg_mean_squared_error"):
    """Cross-validate ``model`` and print per-fold scores, their mean and std.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator passed to ``cross_val_score``.
    X, y : array-like
        Features and target.
    cv : int or cross-validation splitter
        Forwarded unchanged to ``cross_val_score``.
    scoring : str
        Scorer name. With the default ``"neg_mean_squared_error"`` the fold
        scores are converted to RMSE; any other scorer is reported as-is.

    Returns
    -------
    numpy.ndarray
        The per-fold values that were printed.
    """
    raw_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    if scoring == "neg_mean_squared_error":
        # scikit-learn negates MSE so that greater is better; flip the sign
        # back and take the root to report RMSE.
        fold_scores = np.sqrt(-raw_scores)
    else:
        # sqrt(-score) is meaningless for other scorers (e.g. NaN for a
        # positive R^2), so report them untouched.
        fold_scores = raw_scores
    print(f"Scores: {fold_scores}")
    print(f"Mean: {fold_scores.mean()}")
    print(f"Standard Deviation: {fold_scores.std()}")
    return fold_scores


def reduce(X_train, X_test, y_train, k=50):
    """Keep the ``k`` features ranked best by ``f_regression``.

    The selector is fitted on the training data only and then applied to both
    splits. Returns ``(X_train_reduced, X_test_reduced)``.
    """
    selector = SelectKBest(score_func=f_regression, k=k).fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test)

def param_search(model, param_dist, X_train, y_train):
    """Run a randomized hyper-parameter search and print the best parameters.

    Samples 10 candidates from ``param_dist``, scores each with 5-fold
    cross-validation on negated MSE, and uses all available cores.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_dist : dict
        Parameter names mapped to lists/distributions to sample from.
    X_train, y_train : array-like
        Training features and target.

    Returns
    -------
    RandomizedSearchCV
        The fitted search, so callers can read ``best_params_``,
        ``best_estimator_`` or ``cv_results_`` instead of copying values
        from the printed output.
    """
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=SEED,
    )
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search

# Reading the data

In [4]:
# Seed NumPy's global RNG for anything downstream that draws from it.
np.random.seed(SEED)


def fetch_data(filepath="../FM 2023.csv"):
    """Load the player CSV and drop the columns that are not used for modelling.

    Parameters
    ----------
    filepath : str
        Location of the CSV file; defaults to the path this notebook was
        written against.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the columns listed below.
    """
    # "Race" appeared twice in the original list; one entry is enough.
    dropped_columns = [
        "Rental club",
        "Age",
        "Position",
        "Salary",
        "Values",
        "Race",
        "UID",
        "Date of birth",
        "Colour of skin",
        "RCA",
        "Club",
        "Nationality",
        "Name",
        "Current reputation",
        "Domestic reputation",
        "World reputation",
    ]
    dataset = read_file(filepath)
    return clean_data(dataset, columns=dropped_columns)

# Training a Random Forest Regressor

In [5]:
# Build the modelling table and the train/test split used by every model below.
# NOTE(review): normalize() fits its scaler on the full table before the split,
# so test rows contribute to the scaling statistics; the target is scaled too.
dataset_raw = fetch_data()
dataset = normalize(dataset_raw, columns=dataset_raw.columns)
# NOTE(review): "Position" is dropped inside fetch_data(), so the line below
# cannot be re-enabled without keeping that column first.
#dataset = encode(dataset, columns=['Position'])

# Features are everything except the two ability columns; "ca_norm" is the target.
X = dataset.drop(columns=["ca_norm", "pa_norm"])
y = dataset["ca_norm"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

In [6]:
# Preview the first five rows of the normalized table.
dataset.head()


Unnamed: 0,ca_norm,pa_norm,Corners_norm,Crossing_norm,Dribbling_norm,Finishing_norm,First Touch_norm,Free Kick Taking_norm,Heading_norm,Long Shots_norm,...,AML_norm,AMC_norm,AMR_norm,ST_norm,Height_norm,Weight_norm,Left Foot_norm,Right Foot_norm,Number of national team appearances_norm,Goals scored for the national team_norm
0,3.861125,1.117237,1.740862,2.788835,1.169449,1.936912,1.798904,2.630423,-1.073811,2.223686,...,0.959273,1.748703,0.981955,0.766982,-0.033432,0.112617,0.710637,0.571938,3.996453,4.867143
1,3.805317,1.211812,1.443758,1.039427,2.008679,2.220526,2.707385,1.059788,-0.788551,1.046338,...,1.636738,-0.821009,1.39165,1.829496,-0.464392,0.301174,-0.518819,0.571938,2.335918,5.506573
2,3.6937,1.129059,-1.527279,-0.418413,0.609961,2.787755,2.707385,2.002169,2.064056,0.752001,...,0.4173,0.666719,0.162563,1.829496,0.541182,0.602866,0.095909,0.571938,5.998863,15.950598
3,3.637892,1.188168,-0.338864,0.164723,0.889705,2.504141,1.798904,1.373915,1.493535,1.046338,...,-0.802138,-0.821009,-0.793394,1.829496,1.977717,0.866847,1.530274,-1.261367,0.577704,4.01457
4,3.637892,1.093594,1.146654,1.330995,1.728936,2.220526,2.253144,1.059788,0.067231,1.046338,...,1.365752,0.666719,1.801346,1.696682,-0.895353,0.263463,1.530274,-1.872469,3.703417,9.769441


In [7]:
run = True
if run:
    # Search space for param_search(); unused while the call below is disabled.
    param_dist = dict(
        criterion=["squared_error"],
        n_estimators=[50, 100, 200],
        max_depth=[10, 20, 30],
        min_samples_split=[2, 4, 8],
        min_samples_leaf=[1, 2, 4],
    )
    rfr = RandomForestRegressor(
        n_estimators=100,
        criterion="squared_error",
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=1,
        random_state=SEED,
    )
    #param_search(rfr, param_dist, X_train, y_train)

    # Hold-out evaluation on the 30% test split.
    y_pred = rfr.fit(X_train, y_train).predict(X_test)
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    # 5-fold cross-validation over the full table.
    cross(rfr, X, y, cv=5)

MSE Score: 0.15981000908212073 
R2 Score: 0.8466657712527853 
Scores: [1.01219168 0.29531822 0.27920062 0.39125839 1.50439485]
Mean: 0.6964727515447386
Standard Deviation: 0.48592455229900566


# Training NuSVR

In [8]:
run = True
if run:
    # NuSVR with scikit-learn's default hyper-parameters.
    nu_svr = NuSVR()
    y_pred = nu_svr.fit(X_train, y_train).predict(X_test)

    # Hold-out evaluation, then 5-fold cross-validation over the full table.
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(nu_svr, X, y, cv=5)

MSE Score: 0.04737139157980462 
R2 Score: 0.9545481798399804 
Scores: [0.62224379 0.14507434 0.14520061 0.16327796 1.08357362]
Mean: 0.4318740630629173
Standard Deviation: 0.3735057063160188


# Training SVR

In [9]:
run = True
if run:
    # SVR with scikit-learn's default hyper-parameters, trained on all features.
    svr = SVR()
    #X_train_selector, X_test_selector = reduce(X_train, X_test, y_train, k=200)
    y_pred = svr.fit(X_train, y_train).predict(X_test)

    # Hold-out evaluation, then 5-fold cross-validation over the full table.
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(svr, X, y, cv=5)

MSE Score: 0.050626067374704826 
R2 Score: 0.9514253891856304 
Scores: [0.68972014 0.14942945 0.14965631 0.16815288 1.15017264]
Mean: 0.46142628486758064
Standard Deviation: 0.40175650647040523


# Training SGDRegressor

In [29]:
run = True
np.random.seed(SEED)
if run:
    # Search space for param_search(); unused while the call below is disabled.
    param_dist= {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'fit_intercept': [True, False],
        'learning_rate': ['optimal', 'invscaling']
    }
    # random_state is passed explicitly (as for the random forest) so the fit
    # and the cross-validation clones are reproducible regardless of the
    # global NumPy RNG state or the order in which cells were executed.
    sgd = SGDRegressor(
        penalty='l2',
        learning_rate='invscaling',
        fit_intercept=True,
        alpha=0.0001,
        random_state=SEED,
    )

    #param_search(sgd, param_dist, X_train, y_train)

    # Hold-out evaluation on the 30% test split.
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    # 5-fold cross-validation over the full table.
    cross(sgd, X, y, cv=5)

MSE Score: 0.0896312356827953 
R2 Score: 0.9140007783366126 
Scores: [0.36128344 0.28030951 0.26253168 0.27012124 0.48896188]
Mean: 0.33264155230697795
Standard Deviation: 0.08581285450039756


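# Best model: SGDRegressor
SGDRegressor has the lowest mean cross-validated RMSE (about 0.33) and the smallest spread across folds; NuSVR scores best on the single hold-out split.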
## TODO
- [ ] Run the parameter search for NuSVR and SVR