In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Load the match data. The files were cleaned before being imported here.
# Each split comes as three files read with pd.read_csv defaults:
# home-team features, away-team features, and outcome labels.

# Training split (per the original author's note, its IDs differ from the test split).
trainHome = pd.read_csv('trainDataV3_home.txt')
trainAway = pd.read_csv('trainDataV3_away.txt')
trainY = pd.read_csv('trainDataV3_y.txt')  # outcome labels the models are fitted against

# Test split.
testHome = pd.read_csv('testDataV3_home.txt')
testAway = pd.read_csv('testDataV3_away.txt')
testY = pd.read_csv('testDataV3_y.txt')  # home win / draw / away win outcomes, used to score predictions

In [None]:
# Team statistic columns (all carrying the `_season_sum` suffix) used as model
# inputs and in the exploratory plots. The order of this list is the column
# order of every feature matrix built from it.
selected_features = [
    'TEAM_SHOTS_TOTAL_season_sum',
    'TEAM_SHOTS_ON_TARGET_season_sum',
    'TEAM_PASSES_season_sum',
    'TEAM_SUCCESSFUL_PASSES_season_sum',
    'TEAM_SAVES_season_sum',
    'TEAM_CORNERS_season_sum',
    'TEAM_FOULS_season_sum',
    'TEAM_YELLOWCARDS_season_sum',
    'TEAM_REDCARDS_season_sum',
    'TEAM_ATTACKS_season_sum',
    'TEAM_DANGEROUS_ATTACKS_season_sum',
    'TEAM_GOALS_season_sum',
]

In [None]:
# Features: element-wise difference (home minus away) of each selected statistic.
# NOTE(review): the subtraction works on raw arrays, so it assumes the home and
# away files list the same matches in the same row order — confirm against the
# data preparation step.
X_train = trainHome[selected_features].values - trainAway[selected_features].values
X_test = testHome[selected_features].values - testAway[selected_features].values

# Targets: the three outcome columns, in the order home win / draw / away win.
Y_train = trainY[['HOME_WINS', 'DRAW', 'AWAY_WINS']].values

# Fit a single linear regression with three outputs (one per outcome column).
model = LinearRegression()
model.fit(X_train, Y_train)

# Linear-regression outputs are unbounded scores, not probabilities, so they
# are clipped at zero and then rescaled so that each row sums to 1.
Y_pred = np.clip(model.predict(X_test), 0, None)
row_totals = Y_pred.sum(axis=1, keepdims=True)

# A row whose three scores were all clipped to zero has a total of 0. Dividing
# by it would yield NaNs that propagate into every downstream plot and metric,
# so such rows fall back to a uniform distribution over the outcomes instead.
uniform_prob = 1.0 / Y_pred.shape[1]
Y_pred = np.divide(
    Y_pred,
    row_totals,
    out=np.full_like(Y_pred, uniform_prob),  # value kept wherever `where` is False
    where=row_totals > 0,
)

# Same values as a labelled frame, used by the plotting cells.
Y_pred_df = pd.DataFrame(Y_pred, columns=['Home Win Prob', 'Draw Prob', 'Away Win Prob'])



In [None]:
# Overlaid histograms (with KDE curves) of the predicted probability of each outcome.
#
# The columns are renamed so that seaborn's automatically generated legend shows
# the short outcome names. A manual plt.legend([...]) call is deliberately
# avoided: it replaces seaborn's legend and binds the given labels to whichever
# artists matplotlib finds first, which need not match the hue order and can
# mislabel the series.
legend_names = {
    'Home Win Prob': 'Home Win',
    'Draw Prob': 'Draw',
    'Away Win Prob': 'Away Win',
}

plt.figure(figsize=(10, 6))
sns.histplot(Y_pred_df.rename(columns=legend_names), bins=20, kde=True, alpha=0.6)
plt.title("Distribution of Predicted Win Probabilities")
plt.xlabel("Predicted Probability")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Scatter of the home-minus-away difference in TEAM_GOALS_season_sum against the
# predicted home-win probability.
#
# The goals column is located by name rather than by position, so the plot keeps
# showing the right feature if selected_features is reordered or extended (and
# fails loudly with a ValueError if the goals feature is removed).
goals_col = selected_features.index('TEAM_GOALS_season_sum')
goal_difference = X_test[:, goals_col]

plt.figure(figsize=(8, 6))
plt.scatter(goal_difference, Y_pred_df['Home Win Prob'], alpha=0.5)
plt.xlabel("Goal Difference (Home - Away)")
plt.ylabel("Predicted Home Win Probability")
plt.title("Goal Difference vs. Home Win Probability")
# Reference line where home and away values are equal.
plt.axvline(0, color='red', linestyle='--', label="No Goal Difference")
plt.legend()
plt.show()


In [None]:
# Step 1: Build the exploration frame — the home team's selected statistics
# plus the three outcome columns from trainY (pandas aligns them on the row index).
train_features = trainHome[selected_features].assign(
    HOME_WINS=trainY['HOME_WINS'],
    DRAW=trainY['DRAW'],
    AWAY_WINS=trainY['AWAY_WINS'],
)

# Step 2: One subset per outcome, keeping the rows where that outcome's column equals 1.
home_wins_data = train_features.loc[train_features['HOME_WINS'].eq(1)]
draw_data = train_features.loc[train_features['DRAW'].eq(1)]
away_wins_data = train_features.loc[train_features['AWAY_WINS'].eq(1)]

# Human-readable names for the plot titles and axis labels, keyed by column name.
# Plotting cells look names up with .get(feature, feature), so a column missing
# from this mapping simply falls back to its raw name.
short_titles = {
    'TEAM_SHOTS_TOTAL_season_sum': 'Shots Total',
    'TEAM_SHOTS_ON_TARGET_season_sum': 'Shots On Target',
    'TEAM_PASSES_season_sum': 'Passes',
    'TEAM_SUCCESSFUL_PASSES_season_sum': 'Successful Passes',
    'TEAM_SAVES_season_sum': 'Saves',
    'TEAM_CORNERS_season_sum': 'Corners',
    'TEAM_FOULS_season_sum': 'Fouls',
    'TEAM_YELLOWCARDS_season_sum': 'Yellow Cards',
    'TEAM_REDCARDS_season_sum': 'Red Cards',
    'TEAM_ATTACKS_season_sum': 'Attacks',
    'TEAM_DANGEROUS_ATTACKS_season_sum': 'Dangerous Attacks',
    'TEAM_GOALS_season_sum': 'Goals',
}

# Step 3: Plot histograms (with KDE curves) of each selected feature, one
# subplot per feature, with the three match outcomes overlaid.

# The number of grid rows is derived from the number of features so the layout
# still works if selected_features grows beyond 12 entries.
hist_grid_cols = 4
hist_grid_rows = -(-len(selected_features) // hist_grid_cols)  # ceiling division

# (subset, colour, legend label) for each outcome, drawn in this order.
hist_outcome_series = [
    (home_wins_data, 'green', 'Home Wins'),
    (draw_data, 'gray', 'Draw'),
    (away_wins_data, 'red', 'Away Wins'),
]

plt.figure(figsize=(14, 12))

for i, feature in enumerate(selected_features):
    plt.subplot(hist_grid_rows, hist_grid_cols, i + 1)  # subplot indices are 1-based
    for subset, color, label in hist_outcome_series:
        sns.histplot(subset[feature], kde=True, color=color, label=label, bins=30)
    readable_name = short_titles.get(feature, feature)  # fall back to the raw column name
    plt.title(readable_name)
    plt.xlabel(readable_name)
    plt.ylabel('Frequency')
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Box plots of each selected feature grouped by match outcome.
#
# Every subplot holds a single box plot with one box per outcome. Drawing three
# separate box plots keyed on the HOME_WINS / DRAW / AWAY_WINS indicator columns
# would place all of them on the same x positions (0 and 1) of the same axes, so
# the boxes cover each other and the x-axis never names the outcomes.
outcome_labels = {'HOME_WINS': 'Home Wins', 'DRAW': 'Draw', 'AWAY_WINS': 'Away Wins'}
outcome_palette = {'Home Wins': 'green', 'Draw': 'gray', 'Away Wins': 'red'}

# Long-form frame: the rows flagged (== 1) for each outcome, tagged with a
# readable label in a single 'Outcome' column.
features_by_outcome = pd.concat(
    [
        train_features.loc[train_features[flag] == 1, selected_features].assign(Outcome=label)
        for flag, label in outcome_labels.items()
    ],
    ignore_index=True,
)

# Derive the grid height from the feature count so the layout still works if
# selected_features grows beyond 12 entries.
box_grid_cols = 4
box_grid_rows = -(-len(selected_features) // box_grid_cols)  # ceiling division

plt.figure(figsize=(14, 12))

for i, feature in enumerate(selected_features):
    plt.subplot(box_grid_rows, box_grid_cols, i + 1)
    sns.boxplot(
        x='Outcome',
        y=feature,
        data=features_by_outcome,
        hue='Outcome',  # colour by outcome; the x tick labels already identify each box
        palette=outcome_palette,
        order=list(outcome_labels.values()),
        legend=False,
    )
    plt.title(short_titles.get(feature, feature))
    plt.xlabel('Outcome')
    plt.ylabel('Value')

plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the selected feature columns of train_features,
# shown as a heatmap annotated with two-decimal values.
correlation_matrix = train_features.loc[:, selected_features].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Rank the selected features with a random forest fitted to predict HOME_WINS
# from the feature columns of train_features.
# (RandomForestClassifier is imported in the notebook's import cell.)
#
# random_state is fixed so the importances are identical on every
# Restart & Run All; without it each run grows a different forest and the
# ranking of close features can change between runs.
# NOTE: this rebinds `model`, replacing the LinearRegression fitted earlier.
model = RandomForestClassifier(random_state=42)
model.fit(train_features[selected_features], trainY['HOME_WINS'])

# One importance value per entry of selected_features, in the same order.
feature_importance = model.feature_importances_
plt.figure(figsize=(10, 6))
plt.barh(selected_features, feature_importance, color='skyblue')
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


In [None]:
# Bar chart of how many training matches ended in each outcome.
# Summing each outcome column gives its count, presuming the columns are 0/1
# indicators — verify against the label file.
plt.figure(figsize=(8, 6))
class_counts = trainY[['HOME_WINS', 'DRAW', 'AWAY_WINS']].sum()

# `palette` is only meaningful together with `hue`; passing it alone is
# deprecated in seaborn 0.13. Mapping hue to the same categories as x keeps the
# one-colour-per-bar look, and legend=False drops the redundant legend.
sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    hue=class_counts.index,
    palette='pastel',
    legend=False,
)
plt.title('Class Distribution (Home Wins, Draws, Away Wins)')
plt.xlabel('Outcome')
plt.ylabel('Count')
plt.show()


In [None]:
# Filled kernel-density plots of each selected feature, one subplot per feature,
# with the three match outcomes overlaid.

# The number of grid rows is derived from the number of features so the layout
# still works if selected_features grows beyond 12 entries.
kde_grid_cols = 4
kde_grid_rows = -(-len(selected_features) // kde_grid_cols)  # ceiling division

# (subset, colour, legend label) for each outcome, drawn in this order.
kde_outcome_series = [
    (home_wins_data, 'green', 'Home Wins'),
    (draw_data, 'gray', 'Draw'),
    (away_wins_data, 'red', 'Away Wins'),
]

plt.figure(figsize=(14, 12))

for i, feature in enumerate(selected_features):
    plt.subplot(kde_grid_rows, kde_grid_cols, i + 1)  # subplot indices are 1-based
    for subset, color, label in kde_outcome_series:
        sns.kdeplot(subset[feature], color=color, label=label, fill=True)
    readable_name = short_titles.get(feature, feature)  # fall back to the raw column name
    plt.title(readable_name)
    plt.xlabel(readable_name)
    plt.ylabel('Density')
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Mean squared error between the test-set outcome columns and the normalized
# predictions in Y_pred (columns in the same home win / draw / away win order).
test_outcomes = testY[['HOME_WINS', 'DRAW', 'AWAY_WINS']].to_numpy()
mse = mean_squared_error(test_outcomes, Y_pred)
print(f"Test Set MSE: {mse:.4f}")
