In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Load the raw CSV and keep only the columns used by the model
# newline="" hands line-ending handling to the csv module, so quoted fields
# that contain embedded line breaks are parsed as a single field.
with open("player_injuries_impact.csv", "r", encoding="utf8", newline="") as stat:
    reader = csv.reader(stat)
    headers = next(reader)  # Header row (column names of the source file)
    # Source column indices to keep: columns 0-8 and 12, followed by 41.
    # Downstream cells treat the last selected column as the target.
    playstats = list(range(9)) + [12, 41]
    data = []

    for row in reader:
        # Rows too short to contain a selected column get "" in that position
        selected_values = [row[i] if i < len(row) else "" for i in playstats]
        data.append(selected_values)

In [None]:
# Turn the list of selected rows into a NumPy array (always a fresh copy)
data = np.array(data, copy=True)

In [None]:
# Feature selection: every column but the last is a feature, the last is the target
X, y = data[:, :-1], data[:, -1]

In [None]:
# Handle missing values: empty cells are replaced by the sentinel -1
missing_in_X = X == ""
missing_in_y = y == ""
X[missing_in_X] = -1
y[missing_in_y] = -1

In [None]:
# Convert strings to numeric
def to_float_safe(value):
    """Parse ``value`` as a float, falling back to the sentinel ``-1.0``.

    The fallback is a float (not the int ``-1``) so that every return value
    has the same type. This matters when the function is wrapped with
    ``np.vectorize`` without ``otypes``: the output dtype is then inferred
    from the first call, and an int result there would make the whole output
    integer-typed and truncate every later decimal value.

    Args:
        value: Anything accepted by ``float()``, typically a string read
            from the CSV.

    Returns:
        float: The parsed number, or ``-1.0`` if ``value`` cannot be parsed
        (non-numeric text, empty string, or an unsupported type such as None).
    """
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: non-numeric text; TypeError: e.g. None
        return -1.0

# Pin the output dtype to float. Without otypes, np.vectorize infers the dtype
# from the result of the first call, so an int fallback on the first element
# would truncate every decimal value; pinning it also allows empty inputs.
to_float_array = np.vectorize(to_float_safe, otypes=[float])
X = to_float_array(X)
y = to_float_array(y)

In [None]:
# Target variable y represents "performance after injury" based on player rating, match performance, etc.
# Values are bucketed with right-closed intervals over the bin edges, then shifted by one:
#   y <= 0 -> -1 (includes the -1 missing-value sentinel), (0, 3] -> 0,
#   (3, 6] -> 1, (6, 10] -> 2, y > 10 -> 3
bins = [0, 3, 6, 10]
labels = [0, 1, 2]  # NOTE(review): not referenced by the visible cells
bin_positions = np.digitize(y, bins, right=True)
y = bin_positions - 1  # Assign class labels

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest with 100 trees and a fixed seed for repeatable results
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Fit the forest on the training portion of the split
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the held-out rows
y_pred = rf_model.predict(X=X_test)

In [None]:
# Score the held-out predictions: overall accuracy plus a per-class report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")

In [None]:
#### Feature Importance ####
feature_importances = rf_model.feature_importances_

# Rank feature indices from most to least important and keep the first five
sorted_indices = np.flip(np.argsort(feature_importances))
top_features = sorted_indices[:5]

In [None]:
# Displaying feature importance as a ranked list
print("Top 5 Features by Importance:")
for rank, idx in enumerate(top_features, start=1):
    # idx is a column position in X, not in the CSV. X's columns follow the
    # order of playstats, so translate through it to find the source column
    # (e.g. X column 9 comes from CSV column 12).
    source_col = playstats[idx]
    # Fall back to a generic name if the header row is shorter than expected
    feature_name = headers[source_col] if source_col < len(headers) else f"column {source_col}"
    print(f"{rank}. Feature: {feature_name}, Importance: {feature_importances[idx]:.4f}")

In [None]:
#### Cross-Validation ####
# Score the forest with 5-fold cross-validation over the full X / y
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, cv=5)

In [None]:
# Cross-Validation results: per-fold scores followed by their mean
print(
    f"\nCross-Validation Scores: {cv_scores}",
    f"Mean CV Accuracy: {np.mean(cv_scores)}",
    sep="\n",
)

In [None]:
#### ROC CURVE ####
# One indicator column per listed class.
# NOTE(review): the binning step can also produce labels 0 and 3, which are
# not in this list and so get no column of their own - confirm this is intended.
y_binarized = label_binarize(y, classes=[-1, 1, 2])

# Same test size and seed as the earlier split, now with the binarized target.
# This rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binarized,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Training the classifier (One-vs-Rest strategy for multi-class):
# the same forest configuration is wrapped so each indicator column gets its own model
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
clf = OneVsRestClassifier(base_forest)
clf.fit(X_train, y_train)

In [None]:
# Predicted probabilities for the held-out rows (one column per indicator column)
y_score = clf.predict_proba(X=X_test)

In [None]:
# Compute ROC curve and ROC area for each class.
# Results are keyed by the column index of y_binarized.
fpr, tpr, roc_auc = {}, {}, {}

n_outputs = y_binarized.shape[1]
for i in range(n_outputs):
    true_column = y_test[:, i]
    score_column = y_score[:, i]
    fpr[i], tpr[i], _ = roc_curve(true_column, score_column)
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# ROC curve for each class
# The figure is built, annotated and shown in a single cell: with the inline
# backend a figure is rendered and closed when its cell finishes, so labels
# added from a later cell would land on a new, empty figure.
fig, ax = plt.subplots(figsize=(10, 6))
# i is the column index of y_binarized (also the key into fpr/tpr/roc_auc),
# which is what the legend shows as "Class i".
for i, color in zip(range(y_binarized.shape[1]), ['blue', 'red', 'green']):
    ax.plot(fpr[i], tpr[i], color=color, lw=2,
            label=f'Class {i} (AUC = {roc_auc[i]:.2f})')

# Axis labels, title, legend and grid for the ROC plot
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.show()

In [None]:
# Report the area under the ROC curve for each column of y_binarized
print("AUROC for each class:")
n_outputs = y_binarized.shape[1]
for i in range(n_outputs):
    print(f"Class {i}: {roc_auc[i]:.4f}")