### Notebook Summary (football.ipynb)

This Jupyter Notebook implements the **K-Nearest Neighbors (KNN) classification algorithm** to predict the **position of football players** (Defender, Midfielder, Forward, or Goalkeeper).

The main steps are:

* **Data Cleaning:** Positions are abbreviated (e.g., DFMF -> DF) and players without playing time (0 '90s' minutes) are removed.
* **Feature Engineering:** Statistics are normalized to **'per 90 minutes' (per90)** to eliminate playing time-bias.
* **Model Training:** The data is scaled and a KNN model is trained using the normalized statistics.
* **Result:** The model achieves an **accuracy of approximately 58%** in predicting the player's position.

In [1]:
# KNN PL players Assignment

import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix


# Adjust the path to where you saved the file
url ="https://raw.githubusercontent.com/Patrick0481/Data-mining-2025-course/refs/heads/main/playerstats%20(1).csv"
df = pd.read_csv(url, encoding='latin-1')

kolommen = df.columns.tolist()
print(kolommen)

['Rk;Player;Nation;Pos;Squad;Comp;Age;Born;MP;Starts;Min;90s;Goals;Shots;SoT;SoT%;G/Sh;G/SoT;ShoDist;ShoFK;ShoPK;PKatt;PasTotCmp;PasTotAtt;PasTotCmp%;PasTotDist;PasTotPrgDist;PasShoCmp;PasShoAtt;PasShoCmp%;PasMedCmp;PasMedAtt;PasMedCmp%;PasLonCmp;PasLonAtt;PasLonCmp%;Assists;PasAss;Pas3rd;PPA;CrsPA;PasProg;PasAtt;PasLive;PasDead;PasFK;TB;Sw;PasCrs;TI;CK;CkIn;CkOut;CkStr;PasCmp;PasOff;PasBlocks;SCA;ScaPassLive;ScaPassDead;ScaDrib;ScaSh;ScaFld;ScaDef;GCA;GcaPassLive;GcaPassDead;GcaDrib;GcaSh;GcaFld;GcaDef;Tkl;TklWon;TklDef3rd;TklMid3rd;TklAtt3rd;TklDri;TklDriAtt;TklDri%;TklDriPast;Blocks;BlkSh;BlkPass;Int;Tkl+Int;Clr;Err;Touches;TouDefPen;TouDef3rd;TouMid3rd;TouAtt3rd;TouAttPen;TouLive;ToAtt;ToSuc;ToSuc%;ToTkl;ToTkl%;Carries;CarTotDist;CarPrgDist;CarProg;Car3rd;CPA;CarMis;CarDis;Rec;RecProg;CrdY;CrdR;2CrdY;Fls;Fld;Off;Crs;TklW;PKwon;PKcon;OG;Recov;AerWon;AerLost;AerWon%']


In [4]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# load data
url ="https://raw.githubusercontent.com/Patrick0481/Data-mining-2025-course/refs/heads/main/playerstats%20(1).csv"
# file uses semicolon as separator, so specify sep=';'
df = pd.read_csv(url, sep=';', encoding='latin-1')
# strip possible whitespace around column names
df.columns = df.columns.str.strip()

kolommen = df.columns.tolist()
print(kolommen)

# === 1. Data Cleaning ===
# Reduce positions to the first 2 letters (DFMF -> DF)
df["Pos"] = df["Pos"].str[:2]

# Drop players without minutes (otherwise you get division by 0)
df = df[df["90s"] > 0]

# === 2. Per90 features ===
df["Goals_per90"] = df["Goals"] / df["90s"]
df["Assists_per90"] = df["Assists"] / df["90s"]
df["Shots_per90"] = df["Shots"] / df["90s"]
df["SoT_per90"] = df["SoT"] / df["90s"]
df["PassesCmp_per90"] = df["PasTotCmp"] / df["90s"]
df["PassesAtt_per90"] = df["PasTotAtt"] / df["90s"]
df["PassesProg_per90"] = df["PasProg"] / df["90s"]
df["Tkl_per90"] = df["Tkl"] / df["90s"]
df["Int_per90"] = df["Int"] / df["90s"]
df["Clr_per90"] = df["Clr"] / df["90s"]
df["Blocks_per90"] = df["Blocks"] / df["90s"]
df["AerWon_per90"] = df["AerWon"] / df["90s"]

# === 3. Feature Selection ===
features = [
    "Age",
    "Goals_per90", "Assists_per90", "Shots_per90", "SoT_per90",
    "PassesCmp_per90", "PassesAtt_per90", "PassesProg_per90",
    "Tkl_per90", "Int_per90", "Clr_per90", "Blocks_per90",
    "AerWon_per90"
]

X = df[features]
y = df["Pos"]

# === 4. Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === 5. Scaling ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === 6. kNN model ===
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
knn.fit(X_train_scaled, y_train)

# === 7. Evaluation and prediction with existing players ===
y_pred = knn.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Show some examples of predictions
print("\nExamples of predictions:")
sample_indices = np.random.choice(y_test.index, size=5, replace=False)  # original DataFrame indices

for original_idx in sample_indices:
    name = df.loc[original_idx, 'Player']
    team = df.loc[original_idx, 'Squad'] if 'Squad' in df.columns else "Unknown"
    
    # Find the position of this index in y_test / y_pred
    test_idx = list(y_test.index).index(original_idx)
    true_pos = y_test.iloc[test_idx]
    pred_pos = y_pred[test_idx]
    
    # original stats
    player_stats = df.loc[original_idx, features].apply(pd.to_numeric, errors='coerce')
    
    print(f"\nPlayer: {name} ({team})")
    print(f"Actual position: {true_pos}")
    print(f"Predicted position: {pred_pos}")
    print("Statistics per 90 minutes:")
    print(player_stats.round(2))

['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Comp', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Goals', 'Shots', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'ShoDist', 'ShoFK', 'ShoPK', 'PKatt', 'PasTotCmp', 'PasTotAtt', 'PasTotCmp%', 'PasTotDist', 'PasTotPrgDist', 'PasShoCmp', 'PasShoAtt', 'PasShoCmp%', 'PasMedCmp', 'PasMedAtt', 'PasMedCmp%', 'PasLonCmp', 'PasLonAtt', 'PasLonCmp%', 'Assists', 'PasAss', 'Pas3rd', 'PPA', 'CrsPA', 'PasProg', 'PasAtt', 'PasLive', 'PasDead', 'PasFK', 'TB', 'Sw', 'PasCrs', 'TI', 'CK', 'CkIn', 'CkOut', 'CkStr', 'PasCmp', 'PasOff', 'PasBlocks', 'SCA', 'ScaPassLive', 'ScaPassDead', 'ScaDrib', 'ScaSh', 'ScaFld', 'ScaDef', 'GCA', 'GcaPassLive', 'GcaPassDead', 'GcaDrib', 'GcaSh', 'GcaFld', 'GcaDef', 'Tkl', 'TklWon', 'TklDef3rd', 'TklMid3rd', 'TklAtt3rd', 'TklDri', 'TklDriAtt', 'TklDri%', 'TklDriPast', 'Blocks', 'BlkSh', 'BlkPass', 'Int', 'Tkl+Int', 'Clr', 'Err', 'Touches', 'TouDefPen', 'TouDef3rd', 'TouMid3rd', 'TouAtt3rd', 'TouAttPen', 'TouLive', 'ToAtt', 'ToSuc', 'ToSuc%'