In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score,KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [None]:
# Load the raw per-player match statistics from the CSV export.
data = pd.read_csv('C:/Users/Tamás/Desktop/Gazdinfó/Allamvizsga_masodikfelvonas2/Adatok/AllStatistics1.csv', encoding='utf-8-sig')

# Strip the "'" character from "Minutes Played" (literal match, not a regex)
# and store the column as 64-bit integers.
data['Minutes Played'] = data['Minutes Played'].str.replace("'", "", regex=False).astype('int64')

# One-hot encode the categorical "Position" column into integer dummy columns.
data = pd.get_dummies(data, columns=["Position"], dtype=int)


def _safeRate(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Return numerator / denominator rounded to 2 decimals, with NaN where the denominator is 0.

    Plain Series division yields +/-inf for x / 0 (x != 0). pandas does not treat
    inf as a missing value, so a later fillna() would leave it in place and it
    would distort means or break scaling. Masking zero denominators first makes
    every undefined rate a regular NaN instead.
    """
    return (numerator / denominator.where(denominator != 0)).round(2)


# Derived rate statistics (column names are kept exactly as downstream cells expect them).
data["Duel Win Rate"] = _safeRate(data["Duels won"], data["Duels"])
data["Ground Duel Win Rate"] = _safeRate(data["Ground Duels won"], data["Ground Duels"])
data["Aerial Duel Win Rate"] = _safeRate(data["Aerial Duels won"], data["Aerial Duels"])
data["Tackel Success Rate"] = _safeRate(data["Total tackels"], data["Total tackels"] + data["Dribbled past"])
data["Possession Loss Rate"] = _safeRate(data["Possession lost"], data["Touches"])
data["Successful Long Ball Rate"] = _safeRate(data["Succesfull Long balls"], data["Long balls attempts"])
data["Successful Cross Rate"] = _safeRate(data["Succesfull Crosses"], data["Crosses attempts"])
data["Interception Efficiency"] = _safeRate(data["Interceptions"], data["Defensive actions"])
data["Clearance Efficiency"] = _safeRate(data["Clearances"], data["Defensive actions"])
data["Foul Rate"] = _safeRate(data["Fouls"], data["Duels"])

# Save the extended table to CSV (UTF-8 with BOM so accented characters survive).
# Note: this writes to the project folder, not back into the "Adatok" input folder.
data.to_csv('C:/Users/Tamás/Desktop/Gazdinfó/Allamvizsga_masodikfelvonas2/AllStatistics1.csv', index=False, encoding='utf-8-sig')

In [None]:
# Print the column index of `data` to check which columns are available.
print(data.columns)

In [34]:
# Identify the columns whose header is itself a player name.
allPlayers = set(data["Player"].unique())
playerColumns = [col for col in data.columns if col in allPlayers]

# Statistic columns: everything that is neither a player column nor an
# identifier/notes column, restricted to integer and float dtypes.
# The pandas dtype predicates are used instead of `dtype in [int, float]`
# because that comparison depends on how NumPy maps Python `int` to a
# fixed-width dtype on the current platform and can silently drop int64 columns.
# Boolean columns stay excluded, as before.
nonStatisticColumns = playerColumns + ["MatchID", "Team", "Player", "Notes", "Defence notes", "Pass notes"]
statisticsColumns = [
    col
    for col in data.columns.difference(nonStatisticColumns)
    if pd.api.types.is_integer_dtype(data[col].dtype) or pd.api.types.is_float_dtype(data[col].dtype)
]

# For every MatchID, split the players of that match into two teams.
teamInfoPerMatch = {}
allMatchIDs = data["MatchID"].unique()

for matchID in allMatchIDs:
    matchData = data[data["MatchID"] == matchID]
    team1 = set()
    team2 = set()
    playersInMatch = matchData["Player"].tolist()
    # Only the player-named columns that belong to players of this match.
    matchPlayerColumns = [col for col in data.columns if col in playersInMatch]

    for _, row in matchData.iterrows():
        playerName = row["Player"]
        for teamMate in matchPlayerColumns:
            value = row[teamMate]
            # A cell equal to +MatchID puts the row's player in team 1,
            # a cell equal to -MatchID puts the player in team 2.
            if value == matchID:
                team1.add(playerName)
            elif value == -matchID:
                team2.add(playerName)

    teamInfoPerMatch[matchID] = {
        "team1": sorted(team1),
        "team2": sorted(team2)
    }

In [35]:
# Attach aggregated own-team and opponent-team statistics to every player row.

# Single source of truth for the team-level features:
# (output column, source statistic column, aggregation method).
# The opponent variant of each feature is the same name prefixed with "Opponent".
# Note: "Team Shots" is the sum of "Shots on target".
TEAM_AGGREGATIONS = [
    ("Team Goals", "Goals", "sum"),
    ("Team Assists", "Assists", "sum"),
    ("Team Shots", "Shots on target", "sum"),
    ("Team Average xG", "Expected goals (xG)", "mean"),
    ("Team Shots off target", "Shots off target", "sum"),
    ("Team Shots blocked", "Shots blocked", "sum"),
    ("Team Defensive actions", "Defensive actions", "mean"),
    ("Team clearances", "Clearances", "mean"),
    ("Team Blocked shots", "Blocked shots", "sum"),
    ("Team interceptions", "Interceptions", "sum"),
    ("Team Total tackels", "Total tackels", "sum"),
    ("Team Dribbled past", "Dribbled past", "sum"),
    ("Team Average touches", "Touches", "mean"),
    ("Team Average Accurate Pass Rating", "Accurate Pass Rating", "mean"),
    ("Team Average Duel Win Rate", "Duel Win Rate", "mean"),
    ("Team Average Ground Duel Win Rate", "Ground Duel Win Rate", "mean"),
    ("Team Average Aerial Duel Win Rate", "Aerial Duel Win Rate", "mean"),
    ("Team Possession Loss Rate", "Possession Loss Rate", "mean"),
    ("Team Average Successful Long Balls", "Successful Long Ball Rate", "mean"),
    ("Team Average Successful Crosses", "Successful Cross Rate", "mean"),
    ("Team Average Fouls", "Foul Rate", "mean"),
    ("Team Total Key passes", "Key passes", "sum"),
]


def summarizeTeam(teamStatistics):
    """Aggregate one team's player rows into the team-level features.

    Parameters
    ----------
    teamStatistics : pandas.DataFrame
        The statistic columns of the players of one team in one match.

    Returns
    -------
    dict
        Maps each output column of TEAM_AGGREGATIONS (in that order) to the
        sum or mean of its source column.
    """
    return {
        outputName: getattr(teamStatistics[sourceColumn], method)()
        for outputName, sourceColumn, method in TEAM_AGGREGATIONS
    }


agregatedRows = []

for matchID in allMatchIDs:
    matchData = data[data["MatchID"] == matchID]
    teams = teamInfoPerMatch[matchID]
    team1 = teams["team1"]
    team2 = teams["team2"]

    team1Statistics = matchData[matchData["Player"].isin(team1)][statisticsColumns]
    team2Statistics = matchData[matchData["Player"].isin(team2)][statisticsColumns]

    team1Summary = summarizeTeam(team1Statistics)
    team2Summary = summarizeTeam(team2Statistics)

    for _, row in matchData.iterrows():
        player = row["Player"]

        # team1 is checked first, so a player listed in both teams counts as team 1.
        if player in team1:
            ownSummary, opponentSummary = team1Summary, team2Summary
        elif player in team2:
            ownSummary, opponentSummary = team2Summary, team1Summary
        else:
            # Players that could not be assigned to a team are left out.
            continue

        rowData = row[["Player", "MatchID"] + statisticsColumns].to_dict()
        rowData.update(ownSummary)
        rowData.update({f"Opponent{name}": value for name, value in opponentSummary.items()})

        agregatedRows.append(rowData)

agregatedData = pd.DataFrame(agregatedRows)
# Missing values (e.g. undefined rates) are replaced with 0.
agregatedData = agregatedData.fillna(0)

agregatedData.to_csv('C:/Users/Tamás/Desktop/Gazdinfó/Allamvizsga_masodikfelvonas2/agregatedData.csv', index=False, encoding='utf-8-sig')

In [36]:
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
# PCA ANALYSIS

agregatedData_copy = agregatedData.drop(columns = "Player")

# Input features only: the target (Rating) is removed, and so is MatchID, which
# is an identifier rather than a statistic (the other model cells exclude it too).
# The name `y` is kept because later cells read `y.columns` and `scaled_y`,
# but note that it holds the FEATURE matrix, not the target.
y = agregatedData_copy.drop(columns = ["Rating", "MatchID"])

# Standardize the features.
scaler = StandardScaler()
scaled_y = scaler.fit_transform(y)

# Fit a full PCA; `pcaComponents` holds the per-row component scores.
pca = PCA()
pcaComponents = pca.fit_transform(scaled_y)

# Explained variance ratio per component and its running total.
explainedVariance = pca.explained_variance_ratio_
cumulativeVariance = np.cumsum(explainedVariance)

# Cumulative explained variance plot.
# The x axis is the number of components and the y axis the cumulative variance.
plt.figure(figsize = (15, 10))
plt.plot(range(1, len(cumulativeVariance) + 1), cumulativeVariance, marker = 'o', linestyle = '--')
plt.title('Kumulált magyarázott variancia a főkomponensek számának a függvényében')
plt.xlabel('Főkomponensek száma')
plt.ylabel('Kumulált magyarázott variancia')
plt.grid(True)
plt.tight_layout()
plt.show()

# Table of explained and cumulative variance (in percent) for every component.
pcaVarianceDF = pd.DataFrame({
    "Főkomponens ": [f"PC{i + 1}" for i in range(len(explainedVariance))],
    "Magyarázott variancia (%) ": (explainedVariance * 100).round(2),
    "Kumulált variancia (%) ": (cumulativeVariance * 100).round(2)
})

In [None]:
# LINEAR REGRESSION ON THE PCA COMPONENTS

# Regression target: the player's Rating.
# `scaled_y` is the standardized FEATURE matrix the PCA was fitted on; using it
# as the target would only measure how well the components reconstruct their
# own inputs, so Rating is taken from the same rows instead.
ratingTarget = agregatedData_copy["Rating"]

# NOTE(review): the scaler and the PCA were fitted on all rows before the
# train/test split, so the test rows influenced the projection.

# R² for an increasing number of components (1 .. 80, capped at what the PCA produced).
maxComponents = min(80, pcaComponents.shape[1])

r2_scores = []
componentRange = range(1, maxComponents + 1)
for n in componentRange:
    X_pca_n = pcaComponents[:, :n]
    # train_test_split returns (X_train, X_test, y_train, y_test) in this order.
    X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_pca_n, ratingTarget, test_size = 0.1, random_state = 42)
    model_n = LinearRegression()
    model_n.fit(X_train_n, y_train_n)
    y_pred_n = model_n.predict(X_test_n)
    r2_scores.append(r2_score(y_test_n, y_pred_n))

# Plot of R² against the number of components.
plt.figure(figsize = (15, 10))
plt.plot(componentRange, r2_scores, marker = 'o')
plt.title("Az R² érték alakulása a PCA komponensek számának a függvényében")
plt.xlabel("PCA komponensek száma")
plt.ylabel("Determinációs együttható (R²)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Final model: the first `maxComponents` component scores as inputs.
X_pca  = pcaComponents[:, :maxComponents]

X_train, X_test, y_train, y_test = train_test_split(X_pca, ratingTarget, test_size = 0.1, random_state = 42)

pcaLinModel = LinearRegression()
pcaLinModel.fit(X_train, y_train)
pcaLinModelPredict = pcaLinModel.predict(X_test)

r2PCALinModel = r2_score(y_test, pcaLinModelPredict)
maePCALinModel = mean_absolute_error(y_test, pcaLinModelPredict)
msePCALinModel = mean_squared_error(y_test, pcaLinModelPredict)
mapePCALinModel = mean_absolute_percentage_error(y_test, pcaLinModelPredict)

print(f"Determinációs együttható (R²): {round(r2PCALinModel, 3)}")
print(f"Átlagos négyzetes hiba (MSE): {round(msePCALinModel, 3)}")
print(f"Átlagos abszolút hiba (MAE): {round(maePCALinModel, 3)}")
print(f"Mean absolute percentage error (MAPE): {round(mapePCALinModel, 3)}")

# Loadings of the first components: one row per original variable, one column
# per component. They come from `pca.components_` (component x feature weights),
# not from `pcaComponents`, which holds the per-row scores.
nLoadingComponents = min(40, pca.components_.shape[0])
sulyok = pd.DataFrame(
    pca.components_[:nLoadingComponents].T,
    columns = [f"PC{i + 1}" for i in range(nLoadingComponents)],
    index = y.columns
)

# Dominant variables of every component: TOP 3 by absolute loading.
topSulyPerKomponens = {}
for komponens in sulyok.columns:
    topSulyok = sulyok[komponens].abs().sort_values(ascending = False).head(3).index.tolist()
    topSulyPerKomponens[komponens] = topSulyok

# Per variable, the sum of its absolute loadings over the selected components.
variableImportance = sulyok.abs().sum(axis = 1).sort_values(ascending = False)

# The 20 variables with the largest summed absolute loading.
legfontosabbValtozok = variableImportance.head(20)
print(legfontosabbValtozok)

In [38]:
# POSITION-WISE PCA ANALYSIS

# The one-hot position flags; they are removed from every per-position frame.
positionFlagColumns = ["Position_D", "Position_M", "Position_F"]

# One frame per position: the rows whose flag equals 1, without the flag columns.
defenderPlayers = (
    agregatedData_copy
    .loc[agregatedData_copy["Position_D"] == 1]
    .drop(columns = positionFlagColumns)
)
midfielderPlayers = (
    agregatedData_copy
    .loc[agregatedData_copy["Position_M"] == 1]
    .drop(columns = positionFlagColumns)
)
forwardPlayers = (
    agregatedData_copy
    .loc[agregatedData_copy["Position_F"] == 1]
    .drop(columns = positionFlagColumns)
)

# The Rating (performance) values of every position, keyed by position label.
positionRatings = dict(
    Defenders = defenderPlayers["Rating"],
    Midfielders = midfielderPlayers["Rating"],
    Forwards = forwardPlayers["Rating"],
)

In [None]:
# Box plot of the Rating distribution for every position.
ratingGroups = list(positionRatings.values())
ratingGroupLabels = list(positionRatings.keys())

fig, ax = plt.subplots(figsize = (15, 10))
ax.boxplot(ratingGroups, labels = ratingGroupLabels, patch_artist = True)
ax.set_title("Játékosok teljesítményének (Rating) eloszlása pozíciónként")
ax.set_ylabel("Rating")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.pipeline import Pipeline

# Runs standardization + PCA + linear regression on the data of one position.
def PCAAnalysisByPosition(data, positionName, n_components = 80):
    """Evaluate a scaler -> PCA -> linear regression pipeline for one position.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of one position. Must contain a "Rating" column (the target);
        every other column except "MatchID" is used as an input feature.
    positionName : str
        Label that is copied into the result.
    n_components : int, default 80
        Requested number of principal components. It is capped at the number
        of training rows and of features, because PCA rejects larger values.

    Returns
    -------
    dict
        "Pozíció " (the label) plus R², MSE, MAE and MAPE on the 10 % hold-out
        set, each rounded to 3 decimals.
    """
    # MatchID is an identifier, not a statistic; the other model cells exclude it as well.
    X = data.drop(columns = ["Rating", "MatchID"], errors = "ignore")
    y = data["Rating"]

    # Train-test split (done first so that the component cap can use the training size).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

    usableComponents = min(n_components, X_train.shape[0], X_train.shape[1])

    # Standardization + PCA + linear regression, fitted on the training rows only.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components = usableComponents)),
        ('regressor', LinearRegression())
    ])

    pipeline.fit(X_train, y_train)
    y_pred_pca_pos = pipeline.predict(X_test)

    r2 = r2_score(y_test, y_pred_pca_pos)
    mse = mean_squared_error(y_test, y_pred_pca_pos)
    mae = mean_absolute_error(y_test, y_pred_pca_pos)
    mape = mean_absolute_percentage_error(y_test, y_pred_pca_pos)

    return {
        "Pozíció ": positionName,
        "R²": round(r2, 3),
        "MSE": round(mse, 3),
        "MAE": round(mae, 3),
        "MAPE": round(mape, 3)
    }
# Run the PCA regression once per position and collect the result rows in order.
eredmenyek = [
    PCAAnalysisByPosition(positionFrame, positionLabel)
    for positionFrame, positionLabel in (
        (defenderPlayers, "Defenders"),
        (midfielderPlayers, "Midfielders"),
        (forwardPlayers, "Forwards"),
    )
]

eredmenyekDF = pd.DataFrame(eredmenyek)
print(eredmenyekDF)

In [None]:
# Most dominant variables in the first N PCA components of one position.
def legfontosabbValtozokPozicionkent(data, positionName, n_component = 5, top_n_features = 5):
    """List the variables with the largest absolute loadings per principal component.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of one position. The "Rating" column is removed before the PCA;
        "MatchID" is removed as well when present, because it is an identifier.
    positionName : str
        Position label. It is not used inside the function.
    n_component : int, default 5
        Requested number of principal components (capped at the number of
        rows and of features, because PCA rejects larger values).
    top_n_features : int, default 5
        How many variables to list per component.

    Returns
    -------
    pandas.DataFrame
        One column per component ("PC1", "PC2", ...); the rows hold the
        variable names ordered by decreasing absolute loading.
    """
    X = data.drop(columns = "Rating").drop(columns = "MatchID", errors = "ignore")

    # Standardization
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # PCA fit
    usableComponents = min(n_component, X_scaled.shape[0], X_scaled.shape[1])
    pca = PCA(n_components = usableComponents)
    pca.fit(X_scaled)

    # Loadings: rows are variables, columns are the fitted components.
    sulyok = pd.DataFrame(
        pca.components_.T,
        columns = [f"PC{i + 1}" for i in range(pca.n_components_)],
        index = X.columns
    )

    # For every component, the variables with the largest absolute loading.
    legfontosabbStatisztikak = {
        komponens: sulyok[komponens].abs().sort_values(ascending = False).head(top_n_features).index.tolist()
        for komponens in sulyok.columns
    }

    return pd.DataFrame(legfontosabbStatisztikak)

# Compute the dominant-variable tables for the three positions, then print them in the same order.
(
    legfontosabbVedoStatisztikak,
    legfontosabbKozeppalyasStatisztikak,
    legfontosabbTamadoStatisztikak,
) = (
    legfontosabbValtozokPozicionkent(positionFrame, positionLabel)
    for positionFrame, positionLabel in (
        (defenderPlayers, "Defenders"),
        (midfielderPlayers, "Midfielders"),
        (forwardPlayers, "Forwards"),
    )
)

for dominantVariableTable in (
    legfontosabbVedoStatisztikak,
    legfontosabbKozeppalyasStatisztikak,
    legfontosabbTamadoStatisztikak,
):
    print(dominantVariableTable)

In [41]:
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
# RIDGE / LASSO / ELASTIC NET REGRESSION

# Every column except the identifiers and the target is an input feature.
exclude_columns = ["Player", "MatchID", "Rating"]
features = agregatedData.columns[~agregatedData.columns.isin(exclude_columns)].tolist()

X = agregatedData[features]
y = agregatedData["Rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _printHoldoutMetrics(modelLabel, actual, predicted):
    """Print MAE, R² and MAPE of one model on the hold-out set."""
    print(f"MAE_{modelLabel}:", f"{mean_absolute_error(actual, predicted): .3f}")
    print(f"R2_{modelLabel}:", f"{r2_score(actual, predicted): .3f}")
    print(f"MAPE_{modelLabel}:", f"{mean_absolute_percentage_error(actual, predicted): .3f}")


# Ridge regression
ridge = Ridge(alpha = 1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
_printHoldoutMetrics("Ridge", y_test, y_pred_ridge)

# Lasso regression
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)
_printHoldoutMetrics("Lasso", y_test, y_pred_lasso)

# Elastic Net regression
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train_scaled, y_train)
y_pred_elastic = elastic.predict(X_test_scaled)
_printHoldoutMetrics("ElasticNet", y_test, y_pred_elastic)

In [42]:
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
# RANDOM FOREST

targetColumn = "Rating"
excludeColumns = ["Player", "MatchID", targetColumn]
# Feature list in the original column order, without identifiers and target.
featureColumns = agregatedData.columns[~agregatedData.columns.isin(excludeColumns)].tolist()

X = agregatedData.loc[:, featureColumns]
y = agregatedData[targetColumn]
# This cell holds out 20 % of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

randomForestModel = RandomForestRegressor(n_estimators = 200, random_state = 42)
randomForestModel.fit(X_train, y_train)

# Hold-out predictions and their error metrics (mse is computed but not printed).
y_pred = randomForestModel.predict(X_test)
mae, mse, r2, mape = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error)
)

print(f"MAE_RandomForest: {mae: .3f}, R2_RandomForest: {r2: .3f}, MAPE_RandomForest: {mape: .3f}")

In [43]:
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
# XGBOOST

# Keep only int64 / float64 / bool columns that are neither identifiers nor the target.
exclude_columns = ["Player", "MatchID", "Rating"]
acceptedDtypes = [np.int64, np.float64, np.bool_]
feature_columns = [
    columnName
    for columnName, columnDtype in agregatedData.dtypes.items()
    if columnName not in exclude_columns and columnDtype in acceptedDtypes
]

X = agregatedData[feature_columns]
y = agregatedData["Rating"]

# 90 % / 10 % train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Build and train the XGBoost regressor
xgbParameters = dict(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=1,
    colsample_bytree=0.7,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgbParameters)
xgb_model.fit(X_train, y_train)

# Hold-out predictions and metrics (the report line below is currently disabled)
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb, r2_xgb, mape_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (mean_absolute_error, r2_score, mean_absolute_percentage_error)
)

# print(f"MAE_XGB: {mae_xgb: .3f}, R2_XGB: {r2_xgb: .3f}, MAPE_XGB: {mape_xgb: .3f}")

In [44]:
import joblib

# Persist the fitted models.
joblib.dump(randomForestModel, "random_forest_model.pkl")
joblib.dump(xgb_model, "xgboost_model.pkl")
joblib.dump(ridge, "ridge_model.pkl")
joblib.dump(lasso, "lasso_model.pkl")
joblib.dump(elastic, "elastic_model.pkl")
# Ridge, Lasso and Elastic Net were fitted on standardized inputs, so the fitted
# scaler has to be stored with them; without it the pickled linear models cannot
# be applied to raw statistics.
# NOTE(review): this relies on `scaler` still being the one fitted in the
# Ridge/Lasso/Elastic Net cell, i.e. on the cells being run top to bottom.
joblib.dump(scaler, "scaler.pkl")
# Feature order the models expect (kept as the last expression of the cell).
joblib.dump(featureColumns, "model_features.pkl")


['model_features.pkl']

In [None]:
#////////////////////////////////////////////////////////////////////////////////////////////////////////////
# POSITION-SPECIFIC MODELS

positionResults = {}


def evaluatePositionModel(model, metricSuffix, X_fit, X_eval, y_fit, y_eval, cv):
    """Fit one model, score it on the hold-out set and cross-validate it on the training set.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor; it is fitted in place.
    metricSuffix : str
        Suffix of the hold-out metric keys ("MAE_<suffix>", "R2_<suffix>").
    X_fit, y_fit
        Training features and target (also used for the cross-validation).
    X_eval, y_eval
        Hold-out features and target.
    cv
        Cross-validation splitter passed to cross_val_score.

    Returns
    -------
    tuple
        (scores, predictions): the rounded metric dict and the hold-out predictions.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    cvMAE = -cross_val_score(model, X_fit, y_fit, cv=cv, scoring='neg_mean_absolute_error').mean()
    cvR2 = cross_val_score(model, X_fit, y_fit, cv=cv, scoring='r2').mean()
    scores = {
        f"MAE_{metricSuffix}": round(mean_absolute_error(y_eval, predictions), 3),
        f"R2_{metricSuffix}": round(r2_score(y_eval, predictions), 3),
        "MAPE (%)": round(mean_absolute_percentage_error(y_eval, predictions) * 100, 3),
        "CV MAE": round(cvMAE, 3),
        "CV R2": round(cvR2, 3)
    }
    return scores, predictions


for position in ['D', 'M', 'F']:
    positionColumn = f'Position_{position}'
    if positionColumn in agregatedData.columns:
        print(f"\n--- {position} pozíció modellezése ---")

        # Rows of this position; features are the int64 / float64 / bool columns
        # other than the identifiers and the target.
        positionData = agregatedData[agregatedData[positionColumn] == 1]
        positionFeatureColumns = [col for col in positionData.columns if col not in ["Player", "MatchID", "Rating"] and positionData[col].dtype in [np.int64, np.float64, np.bool_]]
        X = positionData[positionFeatureColumns]
        y = positionData["Rating"]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        # Standardized copies for the linear models; the tree models use the raw values.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # 5-fold cross-validation with shuffling
        cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

        positionModelScores = {}

        # Ridge Regression
        ridge = Ridge(alpha = 1.0)
        positionModelScores["Ridge"], y_pos_pred_ridge = evaluatePositionModel(
            ridge, "Ridge", X_train_scaled, X_test_scaled, y_train, y_test, cv
        )

        # Lasso Regression
        lasso = Lasso(alpha = 0.1)
        positionModelScores["Lasso"], y_pos_pred_lasso = evaluatePositionModel(
            lasso, "Lasso", X_train_scaled, X_test_scaled, y_train, y_test, cv
        )

        # ElasticNet Regression
        elastic = ElasticNet(alpha = 0.1, l1_ratio = 0.5)
        positionModelScores["ElasticNet"], y_pos_pred_elastic = evaluatePositionModel(
            elastic, "ElasticNet", X_train_scaled, X_test_scaled, y_train, y_test, cv
        )

        # Random Forest
        randomforest = RandomForestRegressor(n_estimators = 300, random_state = 42)
        positionModelScores["Random Forest"], y_pos_pred_randfor = evaluatePositionModel(
            randomforest, "RandomForest", X_train, X_test, y_train, y_test, cv
        )

        # XGBoost
        xgboost = xgb.XGBRegressor(
            n_estimators=400,
            learning_rate=0.05,
            max_depth=4,
            subsample=1,
            colsample_bytree=0.7,
            random_state=42
        )
        positionModelScores["XGBoost"], y_pos_pred_xgboost = evaluatePositionModel(
            xgboost, "XGBoost", X_train, X_test, y_train, y_test, cv
        )

        positionResults[position] = positionModelScores

pd.set_option('display.max_rows', None)       # show all rows
pd.set_option('display.max_columns', None)    # show all columns
pd.set_option('display.width', None)          # do not wrap
pd.set_option('display.max_colwidth', None)   # show full cell contents

# Rows are positions, columns are models, every cell holds that model's metric dict.
positionResultsDF = pd.DataFrame(positionResults).T
print(positionResultsDF)

In [46]:
import joblib

# Fitted estimators and feature-column lists, keyed by position code.
position_models = {}
position_features = {}

for position in ['D', 'M', 'F']:
    positionColumn = f'Position_{position}'
    if positionColumn in agregatedData.columns:
        print(f"--- {position} pozíció modellezése ---")

        # Rows of this position; features are the int64/float64/bool columns
        # other than the identifiers and the target.
        positionData = agregatedData[agregatedData[positionColumn] == 1]
        featureCols = [col for col in positionData.columns if col not in ["Player", "MatchID", "Rating"] and positionData[col].dtype in [np.int64, np.float64, np.bool_]]
        X = positionData[featureCols]
        y = positionData["Rating"]

        position_features[position] = featureCols  # keep the feature columns so they can be saved too

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        # Fit the scaler exactly once, on the training split only. The previous
        # version fitted and transformed twice in a row, discarding the first result.
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        # Persist the fitted scaler
        joblib.dump(scaler, f"scaler_{position}.pkl")

        # NOTE(review): `cv` is not used anywhere in this cell; kept only because
        # other cells may read the global — confirm and remove if not needed.
        cv = KFold(n_splits = 60, shuffle = True, random_state = 42)

        # Candidate models
        models = {
            "ridge": Ridge(alpha = 1.0),
            "lasso": Lasso(alpha = 0.1),
            "elastic": ElasticNet(alpha = 0.1, l1_ratio = 0.5),
            "randomforest": RandomForestRegressor(n_estimators = 300, random_state = 42),
            "xgboost": xgb.XGBRegressor(
                n_estimators=400,
                learning_rate=0.05,
                max_depth=4,
                subsample=1,
                colsample_bytree=0.7,
                random_state=42
            )
        }

        position_models[position] = {}

        for name, model in models.items():
            # The linear models are trained on the standardised features,
            # the tree ensembles on the raw ones.
            if name in ["ridge", "lasso", "elastic"]:
                model.fit(X_train_scaled, y_train)
            else:
                model.fit(X_train, y_train)
            position_models[position][name] = model

# Save every fitted model to disk
for position, models in position_models.items():
    for name, model in models.items():
        joblib.dump(model, f"model_{position}_{name}.pkl")

# Save the feature column list of every position as well
for position, features in position_features.items():
    joblib.dump(features, f"features_{position}.pkl")


--- D pozíció modellezése ---
--- M pozíció modellezése ---
--- F pozíció modellezése ---


NEURÁLIS HÁLÓKKAL VALÓ PRÓBÁLKOZÁS  

In [47]:
# Separate the predictor columns from the target variable

exclude_columns = ["Player", "MatchID", "Rating"]
# Keep every column that is not an identifier or the target, in original order.
features = agregatedData.columns[~agregatedData.columns.isin(exclude_columns)].tolist()

X = agregatedData.loc[:, features]
y = agregatedData.loc[:, "Rating"]

In [48]:
# Scale the data: fit the scaler on X, then apply it to the same matrix

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [49]:
# Split the data into training and test sets, then standardise.
#
# The split is done on the unscaled features and the scaler is fitted on the
# training rows only, so that test-set means/variances do not leak into the
# features the network is trained on. test_size and random_state are unchanged.

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from the training split
X_test = scaler.transform(X_test_raw)        # test split reuses the training statistics

In [50]:
# Bemenet átalakítása PyTorch formátumba

import torch

# Features become float32 tensors; targets become float32 column vectors of shape (n, 1).
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)

In [51]:
from torch.utils.data import TensorDataset, DataLoader

# Pair features with targets so they can be batched together.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Both loaders use the same batch size; only the training loader shuffles.
loader_kwargs = {"batch_size": 128}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [52]:
# NEURÁLIS HÁLÓ DEFINIÁLÁSA

import torch.nn as nn
import torch.optim as optim

# Network definition
class PlayerRatingNN(nn.Module):
    """Fully connected regression network with a single output value.

    Layer widths are input_dim -> 256 -> 128 -> 64 -> 32 -> 1. Each of the four
    hidden layers is followed by ReLU and dropout; the output layer has no
    activation. No batch normalisation is applied.

    Args:
        input_dim: Number of input features per sample.
        dropout_p: Dropout probability used after every hidden layer.
            Defaults to 0.3, the value that used to be hard-coded.
    """

    def __init__(self, input_dim, dropout_p = 0.3):
        super().__init__()
        # Attribute names and creation order are kept stable so that
        # state_dict keys and seeded weight initialisation do not change.
        self.fc1 = nn.Linear(input_dim, 256)
        self.dropout1 = nn.Dropout(p = dropout_p)

        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(p = dropout_p)

        self.fc3 = nn.Linear(128, 64)
        self.dropout3 = nn.Dropout(p = dropout_p)

        self.fc4 = nn.Linear(64, 32)
        self.dropout4 = nn.Dropout(p = dropout_p)

        self.fc5 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of shape (batch, input_dim) to predictions of shape (batch, 1)."""
        hidden_blocks = (
            (self.fc1, self.dropout1),
            (self.fc2, self.dropout2),
            (self.fc3, self.dropout3),
            (self.fc4, self.dropout4),
        )
        for linear, dropout in hidden_blocks:
            x = dropout(self.relu(linear(x)))

        return self.fc5(x)
    
# Seed PyTorch's global random number generator before the network is built so
# that the run is repeatable after a kernel restart; 42 matches the random_state
# used for the scikit-learn / XGBoost models in this notebook.
torch.manual_seed(42)

# One input unit per feature column of the training matrix.
model = PlayerRatingNN(input_dim = X_train.shape[1])

# Mean squared error loss, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.0005)

In [None]:
# Training loop

num_epochs = 1500
train_losses = []  # per-epoch mean training loss
val_losses = []    # per-epoch loss on the held-out tensors

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # `loss` is averaged over the batch; weight it by the batch size so that
        # a smaller final batch is not over-represented in the epoch average.
        running_loss += loss.item() * inputs.size(0)

    # Per-sample mean over the whole training set (not a mean of batch means).
    avg_loss = running_loss / len(train_loader.dataset)
    train_losses.append(avg_loss)

    # Validation loss, computed on the test set
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_tensor).item()
    val_losses.append(val_loss)

    # Report progress every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1} / {num_epochs}, Loss: {avg_loss: .4f}, Val Loss: {val_loss: .4f}')

In [None]:
# Evaluation

model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor).numpy()
    y_true = y_test_tensor.numpy()

mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)  # returned as a fraction, not a percentage
r2 = r2_score(y_true, y_pred)

print(f"MAE: {mae: .3f}")
# Reported in percent, like the "MAPE (%)" column of the other model tables.
print(f"MAPE (%): {mape * 100: .3f}")
print(f"R2: {r2: .3f}")

# Loss curves: the per-epoch validation losses are collected during training,
# so draw them next to the training curve.
plt.figure(figsize = (15, 10))
plt.plot(train_losses, label = "Train")
plt.plot(val_losses, label = "Validation")
plt.title("Train and validation loss during epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid()
plt.show()

In [None]:
# Split the aggregated data set by position; the position indicator columns
# are dropped from each subset.
_position_flags = ["Position_D", "Position_M", "Position_F"]
defenderPlayers = agregatedData.loc[agregatedData["Position_D"] == 1].drop(columns=_position_flags)
midfielderPlayers = agregatedData.loc[agregatedData["Position_M"] == 1].drop(columns=_position_flags)
forwardPlayers = agregatedData.loc[agregatedData["Position_F"] == 1].drop(columns=_position_flags)

# Helper that retrieves the statistics most correlated with the rating
def get_top_features(data, position_name, top_n=10):
    """Rank numeric columns by their correlation with ``Rating``.

    ``Rating`` and ``MatchID`` are never ranked. Correlations are sorted in
    descending order (signed, not absolute), and the first ``top_n`` are kept.

    Returns:
        A tuple ``(correlations, position_name)`` where ``correlations`` is a
        Series indexed by column name and ``position_name`` is passed through
        unchanged.
    """
    numeric_frame = data.select_dtypes(include=[np.number])
    candidates = numeric_frame.columns.drop(['Rating', 'MatchID'], errors='ignore')
    ranked = data[candidates].corrwith(data['Rating']).sort_values(ascending=False)
    return ranked.iloc[:top_n], position_name

# Retrieve the top variables for each position
top_defender, name_d = get_top_features(defenderPlayers, 'Védőjátékosok')
top_midfielder, name_m = get_top_features(midfielderPlayers, 'Középpályások')
top_forward, name_f = get_top_features(forwardPlayers, 'Támadók')

# One separate bar chart per position. The three figures differ only in their
# data, title prefix and colour palette, so they are drawn by a single loop
# instead of three copy-pasted blocks.
for top_stats, position_label, palette in (
    (top_defender, name_d, 'Blues_r'),
    (top_midfielder, name_m, 'Greens_r'),
    (top_forward, name_f, 'Reds_r'),
):
    plt.figure(figsize=(15, 10))
    sns.barplot(x=top_stats.values, y=top_stats.index, palette=palette)
    plt.title(f'{position_label} - Legfontosabb statisztikák', fontsize = 16)
    plt.ylabel('Statisztikák', fontsize = 14)
    plt.xlabel('Fontosság', fontsize = 14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid()
    plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Position-based split; the position indicator columns are removed from each subset
_indicator_columns = ["Position_D", "Position_M", "Position_F"]
defenderPlayers = agregatedData[agregatedData["Position_D"].eq(1)].drop(columns=_indicator_columns)
midfielderPlayers = agregatedData[agregatedData["Position_M"].eq(1)].drop(columns=_indicator_columns)
forwardPlayers = agregatedData[agregatedData["Position_F"].eq(1)].drop(columns=_indicator_columns)

# Select the statistics most strongly correlated with the rating and show their correlation matrix
def plot_top_corr_matrix(data, position_name, top_n=20):
    """Draw a heatmap of the correlations among the top-ranked statistics.

    Numeric columns are ranked by the absolute value of their correlation with
    ``Rating``; the ``top_n`` strongest are kept and their pairwise correlation
    matrix is shown as an annotated heatmap.

    Args:
        data: DataFrame containing a numeric ``Rating`` column.
        position_name: Label used as the prefix of the figure title.
        top_n: Number of statistics to include (default 20).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    numeric_data = data.select_dtypes(include=['number'])
    # MatchID is listed next to Player/Rating in the other exclusion lists of
    # this notebook, so it is not treated as a statistic here either.
    features = [col for col in numeric_data.columns if col not in ('Rating', 'MatchID')]
    correlations = (
        numeric_data[features]
        .corrwith(numeric_data['Rating'])
        .abs()
        .dropna()  # an undefined correlation cannot be ranked and would render as a blank row
        .sort_values(ascending=False)
    )
    top_features = correlations.head(top_n).index.tolist()
    corr_matrix = numeric_data[top_features].corr()

    plt.figure(figsize=(15, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
    plt.title(f'{position_name} - a legfontosabb {top_n} statisztikai változó korrelációs mátrixa', fontsize = 16)
    plt.xticks(rotation=45, ha='right', fontsize = 12)
    plt.yticks(rotation=0, fontsize = 12)
    plt.tight_layout()
    plt.show()

# Visualise the correlation matrix for each position in turn
for position_frame, position_label in (
    (defenderPlayers, "Védőjátékosok"),
    (midfielderPlayers, "Középpályások"),
    (forwardPlayers, "Támadók"),
):
    plot_top_corr_matrix(position_frame, position_label)
