In [60]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, NuSVR, SVR
from sklearn.linear_model import SGDRegressor

In [61]:
# Helper functions

# Single seed shared by every random_state / RNG call in this notebook.
SEED = 42

def read_file(filepath):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filepath : anything accepted by ``pd.read_csv`` (path or file-like object).

    Returns
    -------
    pandas.DataFrame parsed with ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(filepath)
    return frame

def clean_data(dataset, columns: list):
    """Return a copy of ``dataset`` without the given columns.

    The input frame is left untouched; a missing column name raises
    ``KeyError`` (the default behaviour of ``DataFrame.drop``).
    """
    trimmed = dataset.drop(columns, axis="columns")
    return trimmed
    
def encode(dataset, columns: list):
    """One-hot encode ``columns`` and replace them with the indicator columns.

    Parameters
    ----------
    dataset : pandas.DataFrame containing ``columns``.
    columns : list of column names to encode.

    Returns
    -------
    A new DataFrame: the remaining original columns followed by one indicator
    column per category, named by ``OneHotEncoder.get_feature_names_out``.
    """
    ohe = OneHotEncoder()
    encoded = ohe.fit_transform(dataset[columns]).toarray()
    encoded_df = pd.DataFrame(
        encoded,
        columns=ohe.get_feature_names_out(columns),
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and create NaNs) whenever
        # `dataset` has been filtered, shuffled or otherwise re-indexed.
        index=dataset.index,
    )
    combined = pd.concat([dataset, encoded_df], axis=1)
    return combined.drop(columns=columns)

def normalize(dataset, columns: list):
    """Standardize ``columns`` with ``StandardScaler`` and rename them ``<col>_norm``.

    Parameters
    ----------
    dataset : pandas.DataFrame containing ``columns``.
    columns : list of numeric column names to scale.

    Returns
    -------
    A new DataFrame: the remaining original columns followed by the scaled
    columns, each suffixed with ``_norm``.

    Notes
    -----
    The scaler is fitted on every row passed in; call this on training data
    only if test-set statistics must not influence the scaling.
    """
    norm = StandardScaler()
    scaled = norm.fit_transform(dataset[columns])
    norm_df = pd.DataFrame(
        scaled,
        columns=[col + "_norm" for col in columns],
        # Reuse the caller's index so concat(axis=1) pairs each scaled row with
        # its source row even when `dataset` does not have a 0..n-1 index.
        index=dataset.index,
    )
    combined = pd.concat([dataset, norm_df], axis=1)
    return combined.drop(columns=columns)


def split_dataset(dataset, columns_X, column_y):
    """Split ``dataset`` into a feature frame and a target.

    Parameters
    ----------
    dataset : pandas.DataFrame.
    columns_X : column name(s) to REMOVE from ``dataset`` to obtain the features
        (despite the name, these are the excluded columns, not the kept ones).
    column_y : column name (or list of names) selected as the target.

    Returns
    -------
    (X, y) where ``X`` is ``dataset`` without ``columns_X`` and ``y`` is
    ``dataset[column_y]``.
    """
    target = dataset[column_y]
    features = dataset.drop(columns=columns_X)
    return features, target

def print_r2_score(y_test, y_pred):
    """Print and return the R² score of ``y_pred`` against ``y_test``."""
    r2 = r2_score(y_test, y_pred)
    print(f"R2 Score: {r2} ")
    return r2

def print_mse(y_test, y_pred):
    """Print and return the mean squared error of ``y_pred`` against ``y_test``."""
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE Score: {mse} ")
    return mse

def cross(model, X, y, cv=5, scoring="neg_mean_squared_error"):
    """Cross-validate ``model`` and print per-fold scores with mean and std.

    Parameters
    ----------
    model : scikit-learn estimator (cloned and refitted per fold by
        ``cross_val_score``).
    X, y : features and target to cross-validate on.
    cv : int or CV splitter forwarded to ``cross_val_score``. With an integer
        and a regressor, scikit-learn uses ``KFold`` without shuffling, so the
        folds follow row order; pass ``KFold(shuffle=True, ...)`` if the rows
        are sorted.
    scoring : scorer name forwarded to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray of the per-fold values that were printed: RMSE when
    ``scoring`` is ``"neg_mean_squared_error"`` (the default), otherwise the
    raw scorer output.
    """
    score = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    if scoring == "neg_mean_squared_error":
        # Negated MSE -> RMSE. Only meaningful for this scorer; applying it to
        # e.g. "r2" would take the square root of negative numbers (NaN).
        fold_scores = np.sqrt(-score)
    else:
        fold_scores = score
    print(f"Scores: {fold_scores}")
    print(f"Mean: {fold_scores.mean()}")
    print(f"Standard Deviation: {fold_scores.std()}")
    return fold_scores


def reduce(X_train, X_test, y_train, k=50):
    """Keep the ``k`` features ranked best by ``f_regression``.

    The selector is fitted on the training split only and then applied to both
    splits.

    Returns
    -------
    (X_train_selected, X_test_selected) as returned by ``SelectKBest.transform``.
    """
    kbest = SelectKBest(score_func=f_regression, k=k).fit(X_train, y_train)
    train_selected = kbest.transform(X_train)
    test_selected = kbest.transform(X_test)
    return train_selected, test_selected

def param_search(model, param_dist, X_train, y_train, n_iter=10, cv=5,
                 scoring='neg_mean_squared_error', n_jobs=-1):
    """Run a randomized hyper-parameter search and print the best parameters.

    Parameters
    ----------
    model : scikit-learn estimator to tune.
    param_dist : dict mapping parameter names to lists/distributions to sample.
    X_train, y_train : data the search is fitted on.
    n_iter : number of parameter settings sampled (default 10).
    cv : cross-validation folds or splitter (default 5).
    scoring : scorer used to rank candidates (default negated MSE).
    n_jobs : parallel jobs for the search (default -1, all cores).

    Returns
    -------
    The fitted ``RandomizedSearchCV`` so callers can use ``best_params_``,
    ``best_estimator_`` or ``cv_results_`` instead of copying printed values.
    """
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        random_state=SEED,  # reproducible sampling of candidates
    )
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search

# Reading the data

In [62]:
# Seed NumPy's global RNG so estimators without an explicit random_state
# draw from a known starting state.
np.random.seed(SEED)
def fetch_data(filepath="../FM 2023.csv"):
    """Read the raw CSV and drop the columns that are not used for modelling.

    Parameters
    ----------
    filepath : path of the CSV to load; defaults to the notebook's original
        location, so ``fetch_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame without the columns listed below.
    """
    unused_columns = [
        "Rental club",
        "Age",
        "Salary",
        "Values",
        "Race",  # was listed twice; once is enough
        "UID",
        "Date of birth",
        "Colour of skin",
        "RCA",
        "Club",
        "Nationality",
        "Name",
        "Current reputation",
        "Domestic reputation",
        "World reputation",
    ]
    dataset = read_file(filepath)
    return clean_data(dataset, columns=unused_columns)

# Training a Random Forest Regressor

In [63]:
# Build the modelling frame and the train/test split shared by the model cells below.
#
# NOTE(review): normalize() fits its scaler on the full dataset before the
# split, so test rows contribute to the scaling statistics — confirm this is
# acceptable or move the scaling after train_test_split.
# NOTE(review): the head() output below lists "Unnamed: 0" first; if that is a
# real CSV column rather than the rendered index, it is still part of X — confirm.
columns_to_normalize = [
    'ca', 'pa', 'Corners', 'Crossing', 'Dribbling', 'Finishing',
    'First Touch', 'Free Kick Taking', 'Heading', 'Long Shots',
    'Long Throws', 'Marking', 'Passing', 'Penalty Taking', 'Tackling',
    'Technique', 'Aggressiion', 'Anticipation', 'Bravery', 'Composure',
    'Concentration', 'Vision', 'Decision', 'Determination', 'Flair',
    'Leadership', 'Off The Ball', 'Position.1', 'Teamwork', 'Work Rate',
    'Acceleration', 'Agility', 'Balance', 'Jumping Reach',
    'Natural Fitness', 'Pace', 'Stamina', 'Strength', 'Stability', 'Foul',
    'Contest performance', 'Injury', 'diversity', 'Aerial Reach',
    'Command Of Area', 'Communication', 'Eccentricity', 'Handling',
    'Kicking', 'One On Ones', 'Reflexes', 'Rushing Out', 'Punching',
    'Throwing', 'Adaptation', 'Ambition', 'Argue', 'Loyal',
    'Resistant to stress', 'Professional', 'Sportsmanship',
    'Emotional control', 'GK', 'DL', 'DC', 'DR', 'WBL', 'WBR', 'DM', 'ML',
    'MC', 'MR', 'AML', 'AMC', 'AMR', 'ST', 'Height', 'Weight', 'Left Foot',
    'Right Foot', 'Number of national team appearances',
    'Goals scored for the national team',
]

dataset = normalize(fetch_data(), columns=columns_to_normalize)
dataset = encode(dataset, columns=['Position'])

# Target is the scaled 'ca'; the scaled 'pa' is excluded from the features as well.
y = dataset["ca_norm"]
X = dataset.drop(columns=["ca_norm", "pa_norm"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

In [64]:
# Preview the first five rows of the prepared frame.
dataset.head(n=5)


Unnamed: 0,ca_norm,pa_norm,Corners_norm,Crossing_norm,Dribbling_norm,Finishing_norm,First Touch_norm,Free Kick Taking_norm,Heading_norm,Long Shots_norm,...,Position_WB/M/AM L,Position_WB/M/AM LC,Position_WB/M/AM R,Position_WB/M/AM RC,Position_WB/M/AM RL,Position_WB/M/AM RLC,Position_WB/M/AM/S R,Position_WB/M/AM/S RC,Position_WB/M/AM/S RL,Position_WB/M/AM/S RLC
0,3.861125,1.117237,1.740862,2.788835,1.169449,1.936912,1.798904,2.630423,-1.073811,2.223686,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.805317,1.211812,1.443758,1.039427,2.008679,2.220526,2.707385,1.059788,-0.788551,1.046338,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.6937,1.129059,-1.527279,-0.418413,0.609961,2.787755,2.707385,2.002169,2.064056,0.752001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.637892,1.188168,-0.338864,0.164723,0.889705,2.504141,1.798904,1.373915,1.493535,1.046338,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.637892,1.093594,1.146654,1.330995,1.728936,2.220526,2.253144,1.059788,0.067231,1.046338,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
run = True  # set to False to skip this cell's training
if run:
    # Search space for param_search(); not used unless the call below is re-enabled.
    param_dist = {
        'criterion': ["squared_error"],
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4],
    }
    # Fixed hyper-parameters (presumably taken from an earlier search — confirm).
    rfr = RandomForestRegressor(
        criterion="squared_error",
        n_estimators=100,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=SEED,
        n_jobs=1,
    )
    # To re-tune: param_search(rfr, param_dist, X_train, y_train)

    # Hold-out evaluation on the 30% test split.
    y_pred = rfr.fit(X_train, y_train).predict(X_test)
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    # Cross-validation runs on the full X, y (test rows included).
    cross(rfr, X, y, cv=5)

MSE Score: 0.16018535542908133 
R2 Score: 0.8463056345945444 
Scores: [1.01076563 0.29629597 0.27921609 0.39022642 1.50378395]
Mean: 0.6960576106291668
Standard Deviation: 0.48550219497352615


# Training NuSVR

In [67]:
run = True  # set to False to skip this cell's training
if run:
    # NuSVR with scikit-learn's default hyper-parameters.
    nu_svr = NuSVR()

    # Hold-out evaluation on the 30% test split.
    y_pred = nu_svr.fit(X_train, y_train).predict(X_test)
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)

    # Cross-validation runs on the full X, y (test rows included).
    cross(nu_svr, X, y, cv=5)

MSE Score: 0.04569384780548777 
R2 Score: 0.9561577466143143 
Scores: [0.61266142 0.13976367 0.14090741 0.15811273 1.06447725]
Mean: 0.42318449629216487
Standard Deviation: 0.3680838650815868


# Training SVR

In [73]:
run = True  # set to False to skip this cell's training
if run:
    # SVR with scikit-learn's default hyper-parameters.
    svr = SVR()
    # Optional feature selection, currently disabled:
    # X_train_selector, X_test_selector = reduce(X_train, X_test, y_train, k=200)

    # Hold-out evaluation on the 30% test split.
    y_pred = svr.fit(X_train, y_train).predict(X_test)
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)

    # Cross-validation runs on the full X, y (test rows included).
    cross(svr, X, y, cv=5)

MSE Score: 0.049521151632730895 
R2 Score: 0.9524855318143638 
Scores: [0.6840784  0.14590535 0.14567489 0.16475348 1.13973545]
Mean: 0.4560295176784231
Standard Deviation: 0.39919890268212876


# Training SGDRegressor

In [75]:
run = True  # set to False to skip this cell's training
if run:
    # Search space for param_search(); not used unless the call below is re-enabled.
    param_dist= {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'fit_intercept': [True, False],
        'learning_rate': ['optimal', 'invscaling']
    }
    # random_state pins SGD's data shuffling. Without it the model draws from
    # NumPy's global RNG, so the scores change with cell execution order and
    # cannot be reproduced by Restart & Run All.
    sgd = SGDRegressor(
        penalty='l2',
        learning_rate='invscaling',
        fit_intercept=True,
        alpha=0.0001,
        random_state=SEED,
    )

    #param_search(sgd, param_dist, X_train, y_train)

    # Hold-out evaluation on the 30% test split.
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    # Cross-validation runs on the full X, y (test rows included).
    cross(sgd, X, y, cv=5)

MSE Score: 0.08904080437598619 
R2 Score: 0.9145672843369425 
Scores: [0.36619616 0.26935303 0.26113115 0.26663457 0.47109856]
Mean: 0.32688269364776595
Standard Deviation: 0.08198356936354088


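# Best model: SGDRegressor

SGDRegressor has the lowest mean cross-validated RMSE of the four models in the runs above, although NuSVR scores best on the single hold-out split.

## TODO

- [ ] Use the param search for NuSVR and SVR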