In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFECV, SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, NuSVR, SVR
from sklearn.linear_model import SGDRegressor

In [12]:
# Help functions

SEED=42

def read_file(filepath):
        filepath = filepath
        return pd.read_csv(filepath)

def clean_data( dataset, columns: list):
        return dataset.drop(columns=columns)
    
def encode( dataset, columns: list):
    # Encode the labels
    ohe = OneHotEncoder()
    encoded = ohe.fit_transform(dataset[columns]).toarray()
    columns = ohe.get_feature_names_out(columns)
    encoded_df = pd.DataFrame(encoded, columns=columns)
    dataset = pd.concat([dataset, encoded_df], axis=1)
    return dataset.drop(columns=columns)

def normalize(dataset, columns: list):
    # Normalize the dataset
    norm = StandardScaler()
    norm_ = norm.fit_transform(dataset[columns])
    norm_df = pd.DataFrame(norm_, columns=[col + "_norm" for col in columns])
    dataset = pd.concat([dataset, norm_df], axis=1)
    return dataset.drop(columns=columns)


def split_dataset(dataset, columns_X, column_y):
    X = dataset.drop(columns=columns_X, axis=1)
    y = dataset[column_y]
    return X, y

def print_r2_score(y_test, y_pred):
    acc = r2_score(y_test, y_pred)
    print(f"R2 Score: {acc} ")
    return acc

def print_mse(y_test, y_pred):
    acc = mean_squared_error(y_test, y_pred)
    print(f"MSE Score: {acc} ")
    return acc

def cross(model, X, y, cv=5, scoring="neg_mean_squared_error"):
    score = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    lin_r2_scores = np.sqrt(-score)
    print(f"Scores: {lin_r2_scores}")
    print(f"Mean: {lin_r2_scores.mean()}")
    print(f"Standard Deviation: {lin_r2_scores.std()}")


def reduce(X_train, X_test,y_train, k=50):
    selector = SelectKBest(score_func=f_regression, k=k)
    selector = selector.fit(X_train, y_train)
    X_train_selector = selector.transform(X_train)
    X_test_selector = selector.transform(X_test)
    return X_train_selector, X_test_selector

# Reading the data

In [13]:
np.random.seed(SEED)
def fetch_data():
    dataset = read_file("../FM 2023.csv")

    dataset = clean_data(dataset, columns=["Rental club",
                                    "Age",
                                    "Salary",
                                    "Values",
                                    "Race",
                                    "UID", 
                                    "Date of birth",
                                    "Colour of skin",
                                    "RCA",
                                    "Race",
                                    "Club",
                                    "Nationality",
                                    "Name",
                                    "Position",
                                    "Current reputation", 
                                    "Domestic reputation",
                                    "World reputation"
                                    ]
                    )
    return dataset

# Training a Random Forest Regressor

In [14]:
# data separation for training random forest regressor
dataset = fetch_data()
dataset = normalize(dataset, columns=dataset.columns)
X = dataset.drop(columns=["ca_norm", "pa_norm"], axis=1)
y = dataset["ca_norm"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=SEED
)

In [15]:
dataset.head(5)

Unnamed: 0,ca_norm,pa_norm,Corners_norm,Crossing_norm,Dribbling_norm,Finishing_norm,First Touch_norm,Free Kick Taking_norm,Heading_norm,Long Shots_norm,...,AML_norm,AMC_norm,AMR_norm,ST_norm,Height_norm,Weight_norm,Left Foot_norm,Right Foot_norm,Number of national team appearances_norm,Goals scored for the national team_norm
0,3.861125,1.117237,1.740862,2.788835,1.169449,1.936912,1.798904,2.630423,-1.073811,2.223686,...,0.959273,1.748703,0.981955,0.766982,-0.033432,0.112617,0.710637,0.571938,3.996453,4.867143
1,3.805317,1.211812,1.443758,1.039427,2.008679,2.220526,2.707385,1.059788,-0.788551,1.046338,...,1.636738,-0.821009,1.39165,1.829496,-0.464392,0.301174,-0.518819,0.571938,2.335918,5.506573
2,3.6937,1.129059,-1.527279,-0.418413,0.609961,2.787755,2.707385,2.002169,2.064056,0.752001,...,0.4173,0.666719,0.162563,1.829496,0.541182,0.602866,0.095909,0.571938,5.998863,15.950598
3,3.637892,1.188168,-0.338864,0.164723,0.889705,2.504141,1.798904,1.373915,1.493535,1.046338,...,-0.802138,-0.821009,-0.793394,1.829496,1.977717,0.866847,1.530274,-1.261367,0.577704,4.01457
4,3.637892,1.093594,1.146654,1.330995,1.728936,2.220526,2.253144,1.059788,0.067231,1.046338,...,1.365752,0.666719,1.801346,1.696682,-0.895353,0.263463,1.530274,-1.872469,3.703417,9.769441


In [27]:
run = True
if run: 
    rfr = RandomForestRegressor(bootstrap=True, criterion="squared_error", max_depth=20,
                                max_features=80, max_leaf_nodes=None, min_samples_leaf=1,
                                min_samples_split=2, min_weight_fraction_leaf=0.0,
                                n_estimators=30, n_jobs=1, oob_score=False, random_state=SEED,
                                verbose=0, warm_start=False
    )
    rfr.fit(X_train, y_train)
    y_pred = rfr.predict(X_test)

    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(rfr, X, y, cv=10)

MSE Score: 0.16702211405349304 
R2 Score: 0.8397459133553926 
Scores: [1.07199234 0.40499454 0.3131966  0.26512991 0.25996459 0.29175635
 0.30407231 0.42090597 0.58375446 1.28499665]
Mean: 0.5200763716666639
Standard Deviation: 0.3451218084833153


# Normalizing the dataset to train SVM models

In [17]:
dataset = fetch_data()
dataset = normalize(dataset, dataset.columns)

X = dataset.drop(columns=["ca_norm", "pa_norm"], axis=1)
y = dataset["ca_norm"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=SEED
)



# Training Linear SVR

In [18]:
run = False
if run:
    lin_svr = LinearSVR(max_iter=100000, loss="squared_epsilon_insensitive", random_state=SEED)

    lin_svr.fit(X_train, y_train)

    y_pred = lin_svr.predict(X_test)

    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(lin_svr, X, y)

# Training NuSVR

In [19]:
run = False
if run: 
    nu_svr = NuSVR()
    nu_svr.fit(X_train, y_train)

    y_pred = nu_svr.predict(X_test)

    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(nu_svr, X, y, cv=10)

# Training SVR

In [20]:
run = True
if run:
    svr = SVR()
    X_train_selector, X_test_selector = reduce(X_train, X_test, y_train, k=60)
    svr.fit(X_train_selector, y_train)
    y_pred = svr.predict(X_test_selector)

    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(svr, X, y, cv=10)

MSE Score: 0.07546835139275933 
R2 Score: 0.9275897578499845 
Scores: [0.71336982 0.16437635 0.14766506 0.13990137 0.14039198 0.15303788
 0.15258805 0.17463308 0.25360465 0.91380441]
Mean: 0.2953372659416501
Standard Deviation: 0.26482382591928166


# Training SGDRegressor

In [88]:
run = True
if run:
    sgd = SGDRegressor()
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    mse = print_mse(y_test, y_pred)
    r2 = print_r2_score(y_test, y_pred)
    cross(sgd, X, y, cv=10)

MSE Score: 0.09000654528067772 
R2 Score: 0.9136406769383132 
Scores: [0.40105681 0.26192122 0.27786763 0.25642862 0.2566812  0.2560951
 0.26409673 0.28173179 0.33956871 0.42193059]
Mean: 0.30173783952874594
Standard Deviation: 0.05990013538551821
