In [1]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, LinearSVR, NuSVR

In [2]:
class Utils:
    """Helper bundling dataset preparation and model evaluation.

    The CSV at ``filepath`` is loaded into ``self.dataset``. The preprocessing
    methods (``clean_data``, ``encode``, ``normalize``) rebind ``self.dataset``
    to a transformed frame, ``train_split`` stores the train/test partitions on
    the instance, and ``train`` / ``test`` / ``dummy`` fit estimators and report
    their R2 score on the held-out partition as a percentage.
    """

    def __init__(self, filepath, SEED=42):
        """Load the dataset and seed NumPy's global random generator.

        Args:
            filepath: Location of the CSV, passed straight to ``pandas.read_csv``.
            SEED: Seed applied to NumPy and reused for the train/test split.
        """
        self.filepath = filepath
        self.dataset = pd.read_csv(self.filepath)
        self.SEED = SEED
        np.random.seed(SEED)

    def clean_data(self, columns: list):
        """Remove ``columns`` from ``self.dataset``.

        Raises:
            KeyError: If any of the given columns is not present.
        """
        self.dataset = self.dataset.drop(columns=columns)

    def encode(self, columns: list):
        """Replace ``columns`` with their one-hot encoded counterparts.

        The original columns are dropped and the encoded columns (named by the
        encoder's ``get_feature_names_out``) are appended on the right.
        """
        # Keep the source names separate from the generated feature names so
        # the final drop removes the originals, not the freshly encoded columns.
        source_columns = list(columns)
        ohe = OneHotEncoder()
        encoded = ohe.fit_transform(self.dataset[source_columns]).toarray()
        encoded_df = pd.DataFrame(
            encoded,
            columns=ohe.get_feature_names_out(source_columns),
            # Reuse the dataset's index: concat aligns on index labels, and a
            # fresh RangeIndex would produce NaN-padded rows otherwise.
            index=self.dataset.index,
        )
        self.dataset = pd.concat(
            [self.dataset.drop(columns=source_columns), encoded_df], axis=1
        )

    def normalize(self, columns: list):
        """Standardize ``columns`` and replace them with ``<name>_norm`` columns.

        The scaler is fit on every row of ``self.dataset``; if this is called
        before ``train_split``, rows that later land in the test partition
        contribute to the scaling statistics.

        Args:
            columns: Column labels to scale (any iterable, e.g. a pandas Index).
        """
        source_columns = list(columns)
        scaler = StandardScaler()
        scaled = scaler.fit_transform(self.dataset[source_columns])
        scaled_df = pd.DataFrame(
            scaled,
            columns=[f"{col}_norm" for col in source_columns],
            # Same index as the dataset so concat lines the rows up one-to-one.
            index=self.dataset.index,
        )
        self.dataset = pd.concat(
            [self.dataset.drop(columns=source_columns), scaled_df], axis=1
        )

    def train_split(self, X, y, test_size=0.3):
        """Split ``X``/``y`` and store the four partitions on the instance.

        Sets ``X_train``, ``X_test``, ``y_train`` and ``y_test``; the split is
        seeded with ``self.SEED``.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=self.SEED,
        )

    def train(self, model, baseline=None, feature_selection=False):
        """Fit ``model`` on the training partition and print its evaluation.

        Args:
            model: Estimator exposing ``fit`` and ``predict``.
            baseline: Optional reference score; printed as-is for comparison.
            feature_selection: When true, select features with ``RFECV``
                (5-fold, R2 scoring) first and fit on the reduced matrices,
                which are stored as ``X_train_rfecv`` / ``X_test_rfecv``.

        Returns:
            The fitted ``model``.
        """
        np.random.seed(self.SEED)
        if feature_selection:
            # RFECV has no random_state parameter; passing one raises TypeError.
            rfecv = RFECV(estimator=model, cv=5, scoring='r2')
            rfecv.fit(self.X_train, self.y_train)
            self.X_train_rfecv = rfecv.transform(self.X_train)
            self.X_test_rfecv = rfecv.transform(self.X_test)
            fit_features = self.X_train_rfecv
        else:
            fit_features = self.X_train

        model.fit(fit_features, self.y_train)

        # Report: model score, optional reference score, then the mean-predictor.
        self.test(model, feature_selection)
        if baseline is not None:
            print(f"Baseline Score: {baseline} %")
        self.dummy()

        return model

    def test(self, model, feature_selection=False, show=True):
        """Score ``model`` on the test partition.

        Args:
            model: A fitted estimator.
            feature_selection: Use ``X_test_rfecv`` (set by ``train``) instead
                of ``X_test``.
            show: Print the score when true.

        Returns:
            The R2 score multiplied by 100.
        """
        if feature_selection:
            y_pred = model.predict(self.X_test_rfecv)
        else:
            y_pred = model.predict(self.X_test)
        acc = r2_score(self.y_test, y_pred) * 100
        if show:
            print(f"R2 Score: {acc} %")
        return acc

    def dummy(self):
        """Fit a mean-predicting ``DummyRegressor`` and print its test R2 (x100)."""
        dummy_regressor = DummyRegressor(strategy="mean")
        dummy_regressor.fit(self.X_train, self.y_train)
        y_pred = dummy_regressor.predict(self.X_test)
        acc = r2_score(self.y_test, y_pred) * 100
        print(f"Dummy Regressor R2 Score: {acc} %")
    


In [3]:
# Load the FM 2023 dataset and prepare it for modelling

fm = Utils("../FM 2023.csv")

# Columns excluded from the feature set (each label listed once).
fm.clean_data(
    columns=[
        "Rental club",
        "Salary",
        "Values",
        "Race",
        "UID",
        "Date of birth",
        "Colour of skin",
        "RCA",
        "Club",
        "Nationality",
        "Name",
        "Position",
        "Current reputation",
        "Domestic reputation",
        "World reputation",
    ]
)

# Standardize every remaining column; Utils.normalize replaces each one with
# a "<name>_norm" column.
fm.normalize(columns=list(fm.dataset.columns))
def train_ca(model, baseline=None):
    """Fit ``model`` to predict Current Ability from the notebook-level ``fm``.

    The target is ``ca_norm``. Both ability columns are withheld from the
    features: ``ca_norm`` because it is the target, and ``pa_norm`` together
    with it (per the original note, to avoid overfitting).

    Args:
        model: Estimator handed to ``fm.train``.
        baseline: Optional reference score forwarded to ``fm.train``.

    Returns:
        Whatever ``fm.train`` returns (the fitted model).
    """
    ability_columns = ["ca_norm", "pa_norm"]
    target = fm.dataset['ca_norm']
    features = fm.dataset.drop(columns=ability_columns)
    fm.train_split(features, target)
    return fm.train(model=model, baseline=baseline)



# Training a Random Forest Regressor

In [4]:
# Earlier RandomForest configurations, kept for reference:
#baseline = RandomForestRegressor(n_estimators=3000, n_jobs=-1, max_depth=50, max_leaf_nodes=55)
#without_reputation_baseline = RandomForestRegressor(n_estimators=2000, n_jobs=-1, max_depth=10)
# The `baseline` argument is only printed next to the new score for comparison;
# presumably it is the R2 (in %) of an earlier run -- TODO confirm its origin.
model = RandomForestRegressor(n_estimators=2000, n_jobs=-1, max_depth=10)
model = train_ca(model, baseline=84.41210126314476)

R2 Score: 84.42058122805432 %
Baseline Score: 84.41210126314476 %
Dummy Regressor R2 Score: -0.02595916467142967 %


In [5]:
# Linear support-vector regressor with default hyperparameters.
# LinearSVR is imported with the other dependencies in the notebook's first cell.
# `baseline` is only printed next to the new score for comparison.
model = LinearSVR()
model = train_ca(model, baseline=91.76651445710918)

R2 Score: 91.76651445710918 %
Baseline Score: 91.76651445710918 %
Dummy Regressor R2 Score: -0.02595916467142967 %




In [6]:
# Nu support-vector regressor with default hyperparameters.
# NuSVR is imported with the other dependencies in the notebook's first cell.
# `baseline` is only printed next to the new score for comparison.
model = NuSVR()
model = train_ca(model, baseline=95.52816097530203)

R2 Score: 95.52816097530203 %
Baseline Score: 95.52816097530203 %
Dummy Regressor R2 Score: -0.02595916467142967 %


In [7]:
# Epsilon support-vector regressor with default hyperparameters.
# SVR is imported with the other dependencies in the notebook's first cell.
model = SVR()
model = train_ca(model)

R2 Score: 95.2074077468832 %
Dummy Regressor R2 Score: -0.02595916467142967 %


In [8]:
# Linear model fitted by stochastic gradient descent, default hyperparameters.
# SGDRegressor is imported with the other dependencies in the notebook's first cell.
model = SGDRegressor()
model = train_ca(model)

R2 Score: 91.70875802721275 %
Dummy Regressor R2 Score: -0.02595916467142967 %
