In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor

In [42]:
class FootballManagerRegressor:
    """Load a CSV dataset, preprocess it and fit/evaluate a regressor on it.

    Typical flow: construct -> ``clean_data`` / ``encode`` / ``normalize`` ->
    ``train_split`` -> ``train`` -> ``predict``.

    Attributes:
        filepath: Path passed to ``pd.read_csv``.
        dataset: Working DataFrame; every preprocessing method replaces it.
        SEED: Seed used for the global NumPy RNG and for the train/test split.
        model: Estimator fitted by the last ``train`` call (``None`` before).
        selector: Fitted ``RFECV`` when the last ``train`` call used feature
            selection, otherwise ``None``.
    """

    def __init__(self, filepath, SEED=42):
        """Read ``filepath`` into ``self.dataset`` and seed NumPy's global RNG."""
        self.filepath = filepath
        self.dataset = pd.read_csv(self.filepath)
        self.SEED = SEED
        # Populated by train(); kept here so predict() can detect "not trained".
        self.model = None
        self.selector = None
        np.random.seed(SEED)

    def clean_data(self, columns: list):
        """Drop ``columns`` from the working dataset."""
        self.dataset = self.dataset.drop(columns=columns)

    def encode(self, columns: list):
        """One-hot encode ``columns``, replacing them with the encoded columns.

        The encoded columns are appended after the remaining columns and are
        named by ``OneHotEncoder.get_feature_names_out``.
        """
        ohe = OneHotEncoder()
        encoded = ohe.fit_transform(self.dataset[columns]).toarray()
        # Keep the input names intact: they are still needed to drop the
        # original (un-encoded) columns below.
        encoded_columns = ohe.get_feature_names_out(columns)
        # Reuse the dataset's index so concat aligns row-for-row instead of
        # matching against a fresh 0..n-1 index.
        encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=self.dataset.index)
        self.dataset = pd.concat([self.dataset.drop(columns=columns), encoded_df], axis=1)

    def normalize(self, columns: list):
        """Standardize ``columns``, replacing each with a ``<name>_norm`` column.

        Note: the scaler is fitted on every row currently in ``self.dataset``.
        """
        scaler = StandardScaler()
        scaled = scaler.fit_transform(self.dataset[columns])
        # Same index as the dataset so the concat below cannot misalign rows.
        norm_df = pd.DataFrame(
            scaled,
            columns=[f"{col}_norm" for col in columns],
            index=self.dataset.index,
        )
        self.dataset = pd.concat([self.dataset.drop(columns=columns), norm_df], axis=1)

    def train_split(self, X, y, test_size=0.3):
        """Split ``X``/``y`` into train and test parts stored on the instance.

        Sets ``X_train``, ``X_test``, ``y_train`` and ``y_test``. The split is
        seeded with ``self.SEED`` so it does not depend on how much of the
        global RNG stream was consumed beforehand.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=self.SEED,
        )

    def train(self, feature_selection=False, model=None):
        """Fit ``model`` on the training split, print its scores and return it.

        Args:
            feature_selection: When true, run ``RFECV`` (5-fold, R2 scoring)
                first and fit the model on the selected features only.
            model: Estimator to fit. Defaults to a new
                ``RandomForestRegressor(n_estimators=100)`` created per call.

        Returns:
            The fitted estimator (also stored as ``self.model``).
        """
        if model is None:
            # Built per call: an estimator in the signature would be created
            # once and shared (and refitted) across every call.
            model = RandomForestRegressor(n_estimators=100)

        self.selector = None
        if feature_selection:
            rfecv = RFECV(estimator=model, cv=5, scoring='r2')
            rfecv.fit(self.X_train, self.y_train)
            self.X_train_rfecv = rfecv.transform(self.X_train)
            self.X_test_rfecv = rfecv.transform(self.X_test)
            self.selector = rfecv
            model.fit(self.X_train_rfecv, self.y_train)
        else:
            model.fit(self.X_train, self.y_train)

        self.model = model
        # Same reporting order in both modes: model score, then the baseline.
        self.test(model, feature_selection)
        self.dummy()
        return model

    def test(self, model, feature_selection):
        """Print the R2 score (times 100) of ``model`` on the test split.

        Uses the RFECV-reduced test features when ``feature_selection`` is true.
        """
        features = self.X_test_rfecv if feature_selection else self.X_test
        y_pred = model.predict(features)
        score = r2_score(self.y_test, y_pred) * 100
        print(f"R2 Score: {score} %")

    def dummy(self):
        """Print the R2 score (times 100) of a mean-predicting baseline."""
        dummy_regressor = DummyRegressor(strategy="mean")
        dummy_regressor.fit(self.X_train, self.y_train)
        y_pred = dummy_regressor.predict(self.X_test)
        score = r2_score(self.y_test, y_pred) * 100
        print(f"Dummy Regressor R2 Score: {score} %")

    def predict(self, X):
        """Predict targets for ``X`` with the model fitted by ``train``.

        ``X`` must have the same feature columns as the training data; if the
        last ``train`` call used feature selection, the same selection is
        applied to ``X`` first.

        Raises:
            RuntimeError: If ``train`` has not been called yet.
        """
        model = getattr(self, "model", None)
        if model is None:
            raise RuntimeError("train() must be called before predict()")
        selector = getattr(self, "selector", None)
        if selector is not None:
            X = selector.transform(X)
        return model.predict(X)
    
    


In [57]:
# Training Random Forest Regressor

# Columns removed before modelling (each listed once).
DROPPED_COLUMNS = [
    "Rental club",
    "Salary",
    "Values",
    "Race",
    "UID",
    "Date of birth",
    "Colour of skin",
    "RCA",
    "Club",
    "Nationality",
    "Name",
    "Position",
]

# Columns standardized by fm.normalize(); each becomes "<name>_norm".
NORMALIZED_COLUMNS = [
    "ca",
    "pa",
    "Current reputation",
    "Domestic reputation",
    "World reputation",
    "Number of national team appearances",
    "Goals scored for the national team",
    "Age",
]

# Regression target: the standardized "ca" column.
TARGET_COLUMN = "ca_norm"

fm = FootballManagerRegressor("./FM 2023.csv")
fm.clean_data(columns=DROPPED_COLUMNS)
fm.normalize(columns=NORMALIZED_COLUMNS)

# Every remaining column except the target is used as a feature.
X = fm.dataset.drop(columns=[TARGET_COLUMN])
y = fm.dataset[TARGET_COLUMN]
fm.train_split(X, y)

# Seed the forest explicitly so the fit does not depend on the state of the
# global NumPy RNG at the time this cell runs.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=fm.SEED)

model = fm.train(model=model)

R2 Score: 96.54380799734803 %
Dummy Regressor R2 Score: -0.02595916467142967 %


In [56]:
fm.dataset.head()

Unnamed: 0,Corners,Crossing,Dribbling,Finishing,First Touch,Free Kick Taking,Heading,Long Shots,Long Throws,Marking,...,Left Foot,Right Foot,ca_norm,pa_norm,Current reputation_norm,Domestic reputation_norm,World reputation_norm,Number of national team appearances_norm,Goals scored for the national team_norm,Age_norm
0,14,19,15,16,16,17,6,17,7,9,...,16,20,3.861125,1.117237,3.721475,3.625885,3.617004,3.996453,4.867143,1.113966
1,13,13,18,17,18,12,7,13,4,4,...,10,20,3.805317,1.211812,3.666249,3.623812,3.517439,2.335918,5.506573,-0.368632
2,3,8,13,19,18,15,17,12,3,7,...,13,20,3.6937,1.129059,3.562169,3.625885,3.518749,5.998863,15.950598,1.484616
3,7,10,14,18,16,13,15,13,5,6,...,20,11,3.637892,1.188168,3.509067,3.470374,3.191232,0.577704,4.01457,-0.553957
4,12,14,17,17,17,12,10,13,6,7,...,20,8,3.637892,1.093594,3.668373,3.672538,3.191232,3.703417,9.769441,0.928641
