In [None]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Load training data
train_file = 'C:/Users/Vishal/Downloads/EPL Data 2018-2024.xlsx'
train_df = pd.read_excel(train_file)

# Load test data (for prediction)
test_file = 'C:/Users/Vishal/Downloads/testData.xlsx'
test_df = pd.read_excel(test_file)

# Select input and output columns for training
X_train = train_df[['Home_Team', 'Home_xG', 'Away_xG', 'Away_Team', 'Attendance', 'Venue', 'Referee']]
y_train = train_df['Score']

# Select input columns for test data (for prediction)
X_test = test_df[['Home_Team', 'Away_Team', 'Attendance', 'Venue', 'Referee']]

# Impute missing values in training and test data
imputer = SimpleImputer(strategy='mean')
X_train[['Home_xG', 'Away_xG', 'Attendance']] = imputer.fit_transform(X_train[['Home_xG', 'Away_xG', 'Attendance']])
X_test[['Home_xG', 'Away_xG', 'Attendance']] = imputer.transform(X_test[['Home_xG', 'Away_xG', 'Attendance']])

# Concatenate training and test data
combined_data = pd.concat([X_train, X_test])

# Convert categorical variables to dummy variables for combined data
combined_data = pd.get_dummies(combined_data)

# Split back into training and test data
X_train_encoded = combined_data[:len(X_train)]
X_test_encoded = combined_data[len(X_train):]

# Ensure columns present in both training and test data after one-hot encoding
common_columns = set(X_train_encoded.columns) & set(X_test_encoded.columns)
X_train_encoded = X_train_encoded[common_columns]
X_test_encoded = X_test_encoded[common_columns]

# Now, impute missing values in the test data
X_test_encoded[['Home_xG', 'Away_xG', 'Attendance']] = imputer.transform(X_test_encoded[['Home_xG', 'Away_xG', 'Attendance']])

# Splitting back doesn't preserve index, resetting it
X_train_encoded.reset_index(drop=True, inplace=True)
X_test_encoded.reset_index(drop=True, inplace=True)

# Fit linear regression model
model = LinearRegression()
model.fit(X_train_encoded, y_train)

# Predict on test data
y_test_pred = model.predict(X_test_encoded)

# Display predictions for test data
test_df['Predicted_Score'] = y_test_pred
print(test_df[['Home_Team', 'Away_Team', 'Attendance', 'Venue', 'Referee', 'Predicted_Score']])

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

def load_and_preprocess_data(filepath):
    # Load the data
    data = pd.read_csv(filepath)
    data['Date'] = pd.to_datetime(data['Date'])

    # Extracting scores
    data['Home_Score'] = data['Score'].str.split('—', expand=True)[0].astype(int)
    data['Away_Score'] = data['Score'].str.split('—', expand=True)[1].astype(int)

    # Encoding teams
    encoder = OneHotEncoder(sparse=False)
    teams = data['Home_Team'].append(data['Away_Team']).unique().reshape(-1, 1)
    encoder.fit(teams)

    home_teams_encoded = encoder.transform(data[['Home_Team']])
    away_teams_encoded = encoder.transform(data[['Away_Team']])

    data = pd.concat([
        data,
        pd.DataFrame(home_teams_encoded, columns=[f"home_{team}" for team in encoder.categories_[0]]),
        pd.DataFrame(away_teams_encoded, columns=[f"away_{team}" for team in encoder.categories_[0]])
    ], axis=1)

    # Scaling scores
    scaler = MinMaxScaler()
    data[['Home_Score', 'Away_Score']] = scaler.fit_transform(data[['Home_Score', 'Away_Score']])

    return data, scaler, encoder

def create_sequences(data, n_steps=3):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data.iloc[i:(i+n_steps)].drop(['Score', 'Home_Team', 'Away_Team', 'Date', 'Home_Score', 'Away_Score'], axis=1).values)
        y.append(data.iloc[i + n_steps][['Home_Score', 'Away_Score']])
    return np.array(X), np.array(y)

def build_model(input_shape):
    model = Sequential([
        LSTM(50, activation='relu', input_shape=input_shape),
        Dropout(0.2),
        Dense(2)  # Predicting two scores: home and away
    ])
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')
    return model

def main():
    # Assuming the CSV file is named 'soccer_data.csv' and is located in the specified directory
    filepath = 'C:/Users/Vishal/Desktop/Dissertation/Datasets/EPL Data 2018-2024.csv'
    data, scaler, encoder = load_and_preprocess_data(filepath)

    # Prepare the sequences
    X, y = create_sequences(data)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build the model
    model = build_model((X_train.shape[1], X_train.shape[2]))

    # Train the model
    model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test))

    # Save the model and scaler for later use
    model.save('football_score_predictor_model.h5')
    # Assume you handle scaler saving and loading yourself

    print("Model training complete and saved.")

if __name__ == "__main__":
    main()


In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from datetime import datetime

def load_and_preprocess_data(filepath):
    try:
        data = pd.read_csv(filepath, encoding='ISO-8859-1')
    except UnicodeDecodeError:
        data = pd.read_csv(filepath, encoding='utf-8')

    # Attempt to extract scores and handle cases where extraction fails
    scores = data['Score'].str.extract('(\d+)[—-](\d+)')
    data['home_team_score'], data['away_team_score'] = scores[0].fillna(-1).astype(int), scores[1].fillna(-1).astype(int)

    # Convert Date to datetime
    data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

    # Filter for seasons from 2018-2019 to 2022-2023
    data = data[data['Season'].isin(['2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023'])]

    # Encode teams
    encoder = LabelEncoder()
    data['Home_Team'] = encoder.fit_transform(data['Home_Team'])
    data['Away_Team'] = encoder.transform(data['Away_Team'])
    
    return data, encoder

def prepare_inputs(data):
    n_input_steps = 3  # Number of past records to consider
    n_features = 4     # home_team, away_team, home_team_score, away_team_score

    # Scale features
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_scaled = scaler.fit_transform(data[['Home_Team', 'Away_Team', 'home_team_score', 'away_team_score']])

    X, y = [], []
    for i in range(n_input_steps, len(data_scaled)):
        X.append(data_scaled[i-n_input_steps:i, :])
        y.append(data_scaled[i, 2:4])  # Indices for scores

    return np.array(X), np.array(y)

def build_model(n_input_steps, n_features):
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(n_input_steps, n_features)),
        Dropout(0.5),
        LSTM(50),
        Dense(2)  # Predicting two scores: home and away
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

def main():
    filepath = 'C:/Users/Vishal/Desktop/Dissertation/Datasets/EPL Data 2018-2024.csv'
    data, team_encoder = load_and_preprocess_data(filepath)
    
    X, y = prepare_inputs(data)
    
    model = build_model(X.shape[1], X.shape[2])  # Pass the shape of the input
    model.fit(X, y, epochs=50, batch_size=64)
    
    # Here you would handle predictions and save to CSV
    # This part needs the data for the season 2023-2024 which should be processed similarly
#     predictions = model.predict(new_season_data)
#     predictions_df.to_csv('C:/Users/Vishal/Desktop/Dissertation/Datasets/EPL Data 2018-2024/Results/2023-2024_predictions.csv', index=False)

if __name__ == '__main__':
    main()


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


NameError: name 'new_season_data' is not defined

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from datetime import datetime

def load_and_preprocess_data(filepath, encoder=None, scaler=None, is_train=True):
    data = pd.read_csv(filepath, encoding='ISO-8859-1')

    # Extract scores and handle cases where extraction fails
    scores = data['Score'].str.extract('(\d+)[—-](\d+)')
    data['home_team_score'], data['away_team_score'] = scores[0].fillna(-1).astype(int), scores[1].fillna(-1).astype(int)

    # Convert Date to datetime
    data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

    # Filter by season if training data
    if is_train:
        data = data[data['Season'].isin(['2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023'])]

    # Get all unique team names from the dataset
    if is_train and not encoder:
        encoder = LabelEncoder()
        all_teams = pd.concat([data['Home_Team'], data['Away_Team']]).unique()
        all_teams = np.append(all_teams, "Unknown")  # Append 'Unknown' for unseen teams
        encoder.fit(all_teams)
    
    # Transform team names with handling for unseen teams
    data['Home_Team'] = data['Home_Team'].apply(lambda x: x if x in encoder.classes_ else "Unknown")
    data['Away_Team'] = data['Away_Team'].apply(lambda x: x if x in encoder.classes_ else "Unknown")
    data['Home_Team'] = encoder.transform(data['Home_Team'])
    data['Away_Team'] = encoder.transform(data['Away_Team'])

    # Scaling features
    if not scaler:
        scaler = MinMaxScaler(feature_range=(0, 1))
        data_scaled = scaler.fit_transform(data[['Home_Team', 'Away_Team', 'home_team_score', 'away_team_score']])
    else:
        data_scaled = scaler.transform(data[['Home_Team', 'Away_Team', 'home_team_score', 'away_team_score']])
    
    return data, data_scaled, encoder, scaler


def build_model(n_input_steps, n_features):
    model = Sequential([
        LSTM(100, return_sequences=True, input_shape=(n_input_steps, n_features)),  # Increased complexity
        Dropout(0.3),  # Adjusted dropout rate
        LSTM(100),
        Dense(2, activation='relu')  # Ensures non-negative outputs directly
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

def main():
    train_filepath = 'C:/Users/Vishal/Desktop/Dissertation/Datasets/EPL Data 2018-2024.csv'
    test_filepath = 'C:/Users/Vishal/Desktop/Dissertation/EPL Data 2024.csv'
    
    # Load and preprocess training data
    train_data, train_scaled, encoder, scaler = load_and_preprocess_data(train_filepath)

    # Prepare training inputs
    X, y = prepare_inputs(train_scaled, 3)
    
    # Build and train the model
    model = build_model(X.shape[1], X.shape[2])
    model.fit(X, y, epochs=100, batch_size=32)  # Adjusted epochs and batch size

    # Load and preprocess test data for the 2023-2024 season
    test_data, test_scaled, _, _ = load_and_preprocess_data(test_filepath, encoder, scaler, is_train=False)
    
    # Prepare test inputs
    X_test, _ = prepare_inputs(test_scaled, 3)

    # Predict for the 2023-2024 season
    predictions = model.predict(X_test)

    # Post-process predictions: round to nearest integer (model already ensures non-negative via ReLU)
    predictions = np.round(predictions)

    # Combine predictions with the original test data
    predictions_df = test_data.iloc[3:].copy()  # Adjust index based on window size
    predictions_df['Predicted_Home_Score'], predictions_df['Predicted_Away_Score'] = predictions[:, 0], predictions[:, 1]
    
    predictions_df.to_csv('C:/Users/Vishal/Desktop/Dissertation/Datasets/Results/2023-2024_predictions.csv', index=False)

if __name__ == '__main__':
    main()




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 98/100
Epoch 99/100
Epoch 100/100


In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from datetime import datetime

def parse_scores(score_str):
    if not isinstance(score_str, str):
        return -1, -1  # Handle cases where the score might be NaN or similar
    for delimiter in ['—', '-', '–']:
        if delimiter in score_str:
            parts = score_str.split(delimiter)
            return int(parts[0]), int(parts[1])
    return -1, -1

def load_and_preprocess_data(filepath, encoder=None, scaler=None, is_train=True):
    data = pd.read_csv(filepath, encoding='ISO-8859-1')
    data[['home_team_score', 'away_team_score']] = pd.DataFrame(data['Score'].apply(parse_scores).tolist(), index=data.index)
    data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

    if is_train:
        data = data[data['Season'].isin(['2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023'])]

    if not encoder:
        encoder = LabelEncoder()
        all_teams = pd.concat([data['Home_Team'], data['Away_Team']]).unique()
        encoder.fit(all_teams)
    
    data['Home_Team'] = encoder.transform(data['Home_Team'])
    data['Away_Team'] = encoder.transform(data['Away_Team'])

    if not scaler:
        scaler = MinMaxScaler(feature_range=(0, 1))
        data_scaled = scaler.fit_transform(data[['Home_Team', 'Away_Team', 'home_team_score', 'away_team_score']])
    else:
        data_scaled = scaler.transform(data[['Home_Team', 'Away_Team', 'home_team_score', 'away_team_score']])
    
    return data, data_scaled, encoder, scaler

def prepare_inputs(data_scaled, n_input_steps):
    X, y = [], []
    for i in range(n_input_steps, len(data_scaled)):
        X.append(data_scaled[i-n_input_steps:i, :])
        y.append(data_scaled[i, 2:4])
    return np.array(X), np.array(y)

def build_model(n_input_steps, n_features):
    model = Sequential([
        LSTM(100, return_sequences=True, input_shape=(n_input_steps, n_features)),
        Dropout(0.3),
        LSTM(100),
        Dense(2, activation='relu')
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

def main():
    train_filepath = 'C:/Users/Vishal/Desktop/Dissertation/EPL Data 2018-2024.csv'
    test_filepath = 'C:/Users/Vishal/Desktop/Dissertation/EPL Data 2024.csv'
    
    train_data, train_scaled, encoder, scaler = load_and_preprocess_data(train_filepath)
    X, y = prepare_inputs(train_scaled, 3)
    model = build_model(X.shape[1], X.shape[2])
    model.fit(X, y, epochs=100, batch_size=32)

    test_data, test_scaled, _, _ = load_and_preprocess_data(test_filepath, encoder, scaler, is_train=False)
    X_test, _ = prepare_inputs(test_scaled, 3)
    predictions = model.predict(X_test)
    predictions = np.round(predictions)

    test_data_adjusted = test_data.iloc[3:]
    test_data_adjusted['Predicted_Home_Score'] = predictions[:, 0]
    test_data_adjusted['Predicted_Away_Score'] = predictions[:, 1]

    test_data_adjusted.to_csv('C:/Users/Vishal/Desktop/Dissertation/Datasets/Results/2023-2024_predictions.csv', index=False)

if __name__ == '__main__':
    main()


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 98/100
Epoch 99/100
Epoch 100/100


ValueError: y contains previously unseen labels: 'Newcastle'