<a href="https://colab.research.google.com/github/Sidakalankam/Premier-League-Title-Prediction/blob/main/Premier_League_Winner_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Load the data
df_train = pd.read_csv('premier_league_winners_training.csv')
df_test = pd.read_csv('premier_league_winners_test.csv')

# Combine train and test so both share one label encoding and one season
# history. Every row is tagged with its origin so the split further down does
# not depend on where the row ends up after sorting by year.
df_combined = pd.concat(
    [df_train.assign(_is_test=False), df_test.assign(_is_test=True)],
    axis=0,
)

# Encode club names as integer codes. Winners and runners-up use separate
# encoders, so the same club can have a different code in each column.
le_winner = LabelEncoder()
le_runner = LabelEncoder()

df_combined['Winner'] = le_winner.fit_transform(df_combined['Winner'])
df_combined['Runners-up'] = le_runner.fit_transform(df_combined['Runners-up'])

# Keep only the part of the season label before the first '-' as an integer year
df_combined['Year'] = df_combined['Year'].apply(lambda x: int(x.split('-')[0]))

# Sort chronologically; the 0..n-1 index is what the loop below iterates over
df_combined = df_combined.sort_values(by='Year').reset_index(drop=True)

# Sentinel meaning "no season recorded for the previous year"
MISSING_CODE = -1

# Initialize columns for previous winner, runner-up, and number of wins
df_combined['Previous_Winner'] = MISSING_CODE
df_combined['Previous_Runner'] = MISSING_CODE
df_combined['Number_of_Wins'] = 0

# Winner / runner-up code per year seen so far, and running title count per winner code
last_winner = {}
last_runner = {}
wins_count = {}

# NOTE(review): Number_of_Wins and Years_Since_Last_Win are both derived from
# this row's own 'Winner', which is also the training target `y` below, and
# 'Runners-up' is the same season's value. These features therefore encode
# information that is only known once the season is over.
# TODO: confirm this is intended, or rebuild them from earlier seasons only.
for idx in range(len(df_combined)):
    current_year = df_combined.loc[idx, 'Year']
    current_winner = df_combined.loc[idx, 'Winner']

    # Previous season's result, if that exact year has been seen. Because rows
    # are visited in year order, this equals "the row directly above" whenever
    # that row is exactly one year earlier, and stays MISSING_CODE across gaps.
    df_combined.at[idx, 'Previous_Winner'] = last_winner.get(current_year - 1, MISSING_CODE)
    df_combined.at[idx, 'Previous_Runner'] = last_runner.get(current_year - 1, MISSING_CODE)

    # Running title count for this season's winner, including this season
    wins_count[current_winner] = wins_count.get(current_winner, 0) + 1
    df_combined.at[idx, 'Number_of_Wins'] = wins_count[current_winner]

    # Record this season so the following year can look it up
    last_winner[current_year] = current_winner
    last_runner[current_year] = df_combined.loc[idx, 'Runners-up']

# Replace the sentinel with the most frequent known code. The mode is taken
# over known values only (so the sentinel can never be chosen as the fill
# value), and the result is assigned back explicitly instead of calling
# replace(..., inplace=True) on a column selection, which does not reliably
# update the parent frame under pandas Copy-on-Write.
for column in ('Previous_Winner', 'Previous_Runner'):
    known_codes = df_combined.loc[df_combined[column] != MISSING_CODE, column]
    if not known_codes.empty:
        df_combined[column] = df_combined[column].replace(MISSING_CODE, known_codes.mode()[0])

# Years between consecutive titles of the same winner code (0 for a first title)
df_combined['Years_Since_Last_Win'] = df_combined.groupby('Winner')['Year'].diff().fillna(0)

# Split back into train and test using the origin tag, then drop the tag
is_test = df_combined.pop('_is_test').astype(bool)
df_train = df_combined[~is_test]
df_test = df_combined[is_test]

# Extract features and target for training
features = ['Year', 'Runners-up', 'Previous_Winner', 'Previous_Runner', 'Years_Since_Last_Win', 'Number_of_Wins']
X = df_train[features].values
y = df_train['Winner'].values

# Normalize the input data (the same fitted scaler is reused for the test set later)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to tensors: float32 features, int64 class indices for CrossEntropyLoss
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

# Hold out 20% of the training rows; fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(df_train)

     Year  Winner  Runners-up  Previous_Winner  Previous_Runner  \
0    1888      18           1               14               18   
1    1889      18          11               18                1   
2    1890       7          22               18               11   
3    1891      21          22                7               22   
4    1892      21          22               21               22   
..    ...     ...         ...              ...              ...   
116  2014       4          17               13               16   
117  2015      11           0                4               17   
118  2016       4          28               11                0   
119  2017      13          18                4               28   
120  2018      13          16               13               18   

     Number_of_Wins  Years_Since_Last_Win  
0                 1                   0.0  
1                 2                   1.0  
2                 1                   0.0  
3                 1

In [None]:
# Define the model
class PremierLeaguePredictor(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Each hidden layer applies Linear -> BatchNorm1d -> ReLU -> Dropout. The
    final Linear layer returns one raw score per output class (no softmax).

    Args:
        input_size: Number of input features per sample.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of output classes.
        dropout_rate: Dropout probability used after both hidden layers.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_rate=0.5):
        super().__init__()
        # Affine layers: input -> hidden1 -> hidden2 -> class scores
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size1)
        self.fc2 = nn.Linear(in_features=hidden_size1, out_features=hidden_size2)
        self.fc3 = nn.Linear(in_features=hidden_size2, out_features=output_size)
        # Activation and dropout modules are shared by both hidden layers
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)
        # One batch-norm per hidden layer, applied before the activation
        self.batch_norm1 = nn.BatchNorm1d(num_features=hidden_size1)
        self.batch_norm2 = nn.BatchNorm1d(num_features=hidden_size2)

    def _hidden_block(self, x, linear, norm):
        """Run one hidden layer: linear -> batch norm -> ReLU -> dropout."""
        return self.dropout(self.relu(norm(linear(x))))

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, output_size) raw scores."""
        hidden = self._hidden_block(x, self.fc1, self.batch_norm1)
        hidden = self._hidden_block(hidden, self.fc2, self.batch_norm2)
        return self.fc3(hidden)

# Initialize the model
# Seed PyTorch's global RNG before the layers are built so that weight
# initialisation (and the dropout masks drawn afterwards) repeat when the
# notebook is run top to bottom on a fresh kernel. 42 mirrors the random_state
# used for the train/validation split.
torch.manual_seed(42)

input_size = len(features)                 # one input unit per feature column
hidden_size1 = 128
hidden_size2 = 64
output_size = len(le_winner.classes_)      # one score per club seen by the winner encoder
model = PremierLeaguePredictor(input_size, hidden_size1, hidden_size2, output_size)



In [None]:
# Define loss function and optimizer
# CrossEntropyLoss consumes the model's raw class scores and integer class labels.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Train the model
num_epochs = 2000
batch_size = 16

# Explicitly enter training mode. The prediction cell switches the model to
# eval(); without this call, re-running this cell afterwards would train with
# dropout disabled and batch-norm using its frozen running statistics.
model.train()

for epoch in range(num_epochs):
    # Fixed-order mini-batches over the training tensors
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]

        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode and raises ValueError, so skip a trailing batch of one.
        if len(batch_X) < 2:
            continue

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Progress report every 100 epochs; the value shown is the loss of the
    # last mini-batch processed in that epoch, not an epoch average.
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [100/2000], Loss: 1.5307
Epoch [200/2000], Loss: 0.7476
Epoch [300/2000], Loss: 0.6946
Epoch [400/2000], Loss: 0.5306
Epoch [500/2000], Loss: 0.6054
Epoch [600/2000], Loss: 0.4349
Epoch [700/2000], Loss: 0.2422
Epoch [800/2000], Loss: 0.2625
Epoch [900/2000], Loss: 0.1640
Epoch [1000/2000], Loss: 0.2789
Epoch [1100/2000], Loss: 0.1542
Epoch [1200/2000], Loss: 0.0994
Epoch [1300/2000], Loss: 0.0754
Epoch [1400/2000], Loss: 0.2082
Epoch [1500/2000], Loss: 0.0833
Epoch [1600/2000], Loss: 0.1595
Epoch [1700/2000], Loss: 0.0573
Epoch [1800/2000], Loss: 0.2225
Epoch [1900/2000], Loss: 0.4127
Epoch [2000/2000], Loss: 0.1644


In [None]:
# Build the test tensor: same feature columns, scaled with the already-fitted scaler
X_test = torch.tensor(scaler.transform(df_test[features].values), dtype=torch.float32)

# Score the test seasons in evaluation mode with autograd switched off
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    # Highest-scoring class index per row, decoded back to a club name
    predicted_codes = predictions.argmax(dim=1).numpy()
    predicted_winners = le_winner.inverse_transform(predicted_codes)

# Report the predicted champion for each test season
for year, winner in zip(df_test['Year'], predicted_winners):
    print(f"Predicted winner for {year}: {winner}")

# Decode the true winners and measure the share of exact matches
actual_winners = le_winner.inverse_transform(df_test['Winner'].values)
is_correct = predicted_winners == actual_winners
correct_predictions = is_correct.sum()
accuracy = correct_predictions / len(actual_winners)
print(f"\nAccuracy: {accuracy:.2f}")

print("Actual winners:", actual_winners)
print("Predicted winners:", predicted_winners)

Predicted winner for 2019: Manchester United
Predicted winner for 2020: Manchester City
Predicted winner for 2021: Manchester City
Predicted winner for 2022: Manchester United
Predicted winner for 2023: Manchester United

Accuracy: 0.40
Actual winners: ['Liverpool' 'Manchester City' 'Manchester City' 'Manchester City'
 'Manchester City']
Predicted winners: ['Manchester United' 'Manchester City' 'Manchester City'
 'Manchester United' 'Manchester United']


fatal: not a git repository (or any of the parent directories): .git
