In [1]:
import pandas as pd

In [5]:
# Read the training CSV and the 2025 CSV, discarding the 'Unnamed: 0' column
# from each (presumably an index column written out on save -- verify).
train_data = pd.read_csv("Featured_data.csv").drop(columns=['Unnamed: 0'])
test_data = pd.read_csv("Featured_data_2025.csv").drop(columns=['Unnamed: 0'])

Missing values check

In [8]:
# Count the missing entries in every column of each dataset.
missing_train, missing_test = (
    frame.isna().sum() for frame in (train_data, test_data)
)




W/L%    1
PW      0
SRS     0
NRtg    1
ORtg    0
TS%     0
DRtg    0
PA/G    1
eFG%    0
3P%     1
FG%     1
dtype: int64

In [9]:
# Replace each missing value with the mean of its own column.
train_data = train_data.fillna(value=train_data.mean())

In [10]:
# Replace each missing value with the mean of its own column (computed on test_data itself).
test_data = test_data.fillna(value=test_data.mean())

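Ridge Regression is a type of linear regression that includes L2 regularization to prevent overfitting. Regularization adds a penalty to the model's loss function, discouraging it from assigning large weights to features, especially when those features are highly correlated.
The closed-form ridge estimate of the coefficients is $\hat{\beta}_{\text{ridge}} = (X^T X + \lambda I)^{-1} X^T y$

where $X$ is the explanatory (feature) matrix, $y$ is the target vector, $I$ is the identity matrix, and $\lambda \ge 0$ is the ridge penalty (the `alpha` argument in scikit-learn); $\lambda = 0$ recovers ordinary least squares.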



In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Features and target variable: "W/L%" is the target, every other column is a predictor
target_col = "W/L%"
X = train_data.drop(columns=[target_col])
y = train_data[target_col]

# Splitting the training data into training (80%) and validation (20%) sets;
# the fixed random_state keeps the split reproducible across runs
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Pick the test features by the training column list instead of dropping the target.
# This pins the column names and order to what the scaler is fitted on, and keeps
# this cell re-runnable after extra columns (e.g. predictions) are added to test_data.
X_test = test_data[X.columns]

# Standardizing the features (Ridge Regression is sensitive to scale).
# The scaler is fitted on the training split only and then reused for the
# validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Ridge Regression model
ridge = Ridge(alpha=1.0)  # Default alpha = 1.0
ridge.fit(X_train_scaled, y_train)

# Predictions
y_train_pred = ridge.predict(X_train_scaled)
y_val_pred = ridge.predict(X_val_scaled)
y_test_pred = ridge.predict(X_test_scaled)

# Evaluate the model: MAE on both splits, plus RMSE on the validation split
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

# Results
train_mae, val_mae, val_rmse


(0.0291206061281818, 0.03219382637905683, 0.0428489070874633)

In [15]:
# Predict W/L% for the 2025 dataset
test_data["Predicted W/L%"] = y_test_pred

# Rank teams based on predicted W/L% (rank 1 = highest prediction).
# method="min" gives tied teams the same whole-number rank; the default
# "average" method yields fractional ranks for ties, which astype(int)
# would silently truncate into misleading positions.
test_data["Predicted Rank"] = (
    test_data["Predicted W/L%"].rank(ascending=False, method="min").astype(int)
)

# Sort the teams by their predicted rank for standings
predicted_standings = test_data.sort_values(by="Predicted Rank")

# Preview the top of the standings (prediction and rank only)
predicted_standings[["Predicted W/L%", "Predicted Rank"]].head()


Unnamed: 0,Predicted W/L%,Predicted Rank
1,0.728834,1
0,0.715641,2
2,0.661014,3
3,0.64444,4
4,0.63007,5


In [16]:
# Display the full standings table (all columns), ordered by predicted rank
predicted_standings

Unnamed: 0,W/L%,PW,SRS,NRtg,ORtg,TS%,DRtg,PA/G,eFG%,3P%,FG%,Predicted W/L%,Predicted Rank
1,0.867,27,9.77,11.8,121.9,0.622,110.1,110.2,0.595,0.406,0.504,0.728834,1
0,0.821,26,11.27,12.0,115.5,0.571,103.5,103.1,0.539,0.349,0.464,0.715641,2
2,0.733,26,9.36,10.8,120.7,0.593,109.9,109.8,0.562,0.365,0.456,0.661014,3
3,0.667,25,7.22,8.6,116.8,0.591,108.2,113.0,0.558,0.37,0.486,0.64444,4
4,0.667,24,5.98,8.2,121.1,0.609,112.9,110.1,0.577,0.396,0.497,0.63007,5
6,0.633,22,4.79,5.7,117.6,0.593,111.9,111.8,0.559,0.377,0.486,0.577663,6
5,0.69,22,6.09,6.4,112.9,0.542,106.5,106.3,0.506,0.324,0.442,0.563049,7
7,0.571,18,2.76,2.9,118.1,0.598,115.2,116.1,0.566,0.376,0.496,0.513039,8
10,0.517,18,1.87,1.8,111.2,0.577,109.4,106.9,0.545,0.372,0.457,0.50406,9
11,0.571,17,1.25,1.3,113.1,0.589,111.8,111.2,0.562,0.391,0.481,0.503878,10
