In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the dataset
df = pd.read_csv("attendance.csv")

In [4]:
# Handle missing values in 'weekly_attendance' by filling with median
df['weekly_attendance'].fillna(df['weekly_attendance'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weekly_attendance'].fillna(df['weekly_attendance'].median(), inplace=True)


In [5]:
# Encode categorical features
label_enc_team = LabelEncoder()
df['team_encoded'] = label_enc_team.fit_transform(df['team'])

In [6]:

label_enc_team_name = LabelEncoder()
df['team_name_encoded'] = label_enc_team_name.fit_transform(df['team_name'])

In [7]:
# Select features and target
X = df[['team_encoded', 'team_name_encoded', 'year', 'total', 'home', 'away', 'week']]
y = df['weekly_attendance']

In [8]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

In [10]:

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


Mean Absolute Error: 5353.2956125847895
Root Mean Squared Error: 7616.357877449383
R-squared: 0.20371461062891105


In [11]:
# test
# Example input for prediction (replace with actual values)
new_data = pd.DataFrame({
    'team_encoded': [0],   # Replace with a valid encoded team value
    'team_name_encoded': [0],  # Replace with a valid encoded team name
    'year': [2025],  # Example year
    'total': [500000],  # Example total attendance
    'home': [250000],  # Example home attendance
    'away': [250000],  # Example away attendance
    'week': [10]  # Example matchweek
})

# Standardize input data
new_data_scaled = scaler.transform(new_data)

# Make prediction
predicted_attendance = model.predict(new_data_scaled)
print(f"Predicted Weekly Attendance: {predicted_attendance[0]}")


Predicted Weekly Attendance: 33121.347366859874
