In [1]:
import pandas as pd

data = pd.read_csv('../Data/archive_injuries/injury_data.csv')

X = data.drop(columns=['Likelihood_of_Injury'])
y = data['Likelihood_of_Injury']

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define models to include in the pipeline
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', random_state=42)),
    ('Neural Network', MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42))
]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for each model
pipelines = []
for name, model in models:
    if name == 'Neural Network':
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Scale features for MLP
            (name, model)
        ])
    else:
        pipeline = Pipeline([
            (name, model)
        ])
    pipelines.append((name, pipeline))

# Fit the pipelines and evaluate
for name, pipeline in pipelines:
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Evaluating {name}...")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("-------------------------")

Training Logistic Regression...
Evaluating Logistic Regression...
              precision    recall  f1-score   support

           0       0.54      0.67      0.60        95
           1       0.62      0.48      0.54       105

    accuracy                           0.57       200
   macro avg       0.58      0.57      0.57       200
weighted avg       0.58      0.57      0.57       200

Accuracy: 0.57
-------------------------
Training Random Forest...
Evaluating Random Forest...
              precision    recall  f1-score   support

           0       0.55      0.54      0.55        95
           1       0.59      0.61      0.60       105

    accuracy                           0.57       200
   macro avg       0.57      0.57      0.57       200
weighted avg       0.57      0.57      0.57       200

Accuracy: 0.575
-------------------------
Training SVM...
Evaluating SVM...
              precision    recall  f1-score   support

           0       0.55      0.63      0.59        95




In [2]:
X.head()

Unnamed: 0,Player_Age,Player_Weight,Player_Height,Previous_Injuries,Training_Intensity,Recovery_Time
0,24,66.251933,175.732429,1,0.457929,5
1,37,70.996271,174.58165,0,0.226522,6
2,32,80.093781,186.329618,0,0.61397,2
3,28,87.473271,175.50424,1,0.252858,4
4,25,84.65922,190.175012,0,0.577632,1


# Features for prediction:
    -'Player-Age' INT
    -'Player_Weight' FLOAT
    -'Player_Height' FLOAT
    -'Previous_Injuries' INT (1 or 0)
    -'Training_Intensity' FLOAT
    -'Recovery_Time' INT

In [3]:
import pickle

pickle.dump(model, open("injury_model.pkl", "wb"))