# In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Read the raw match records from disk into a DataFrame.
matches_csv = 'matches.csv'
data = pd.read_csv(matches_csv)

# Lookup table from a team name as it may appear in the data to its
# standardized form. Every name listed here is already in standardized form,
# so each key maps to itself; insertion order follows the tuple below.
team_mapping = {
    club: club
    for club in (
        'Manchester City',
        'Manchester United',
        'Liverpool',
        'Chelsea',
        'Leicester City',
        'West Ham United',
        'Tottenham Hotspur',
        'Arsenal',
        'Leeds United',
        'Everton',
        'Aston Villa',
        'Newcastle United',
        'Wolverhampton Wanderers',
        'Crystal Palace',
        'Southampton',
        'Brighton and Hove Albion',
        'Burnley',
        'Fulham',
        'West Bromwich Albion',
        'Sheffield United',
        'Bournemouth',
        'Brentford',
        'Nottingham Forest',
        'Luton Town',
        'Watford',
        'Norwich City',
    )
}

# Standardize opponent names.
# Series.replace leaves any value that is not a key of the mapping untouched
# (unlike Series.map, it never turns unmapped values into NaN), so no
# fillna fallback to the original column is needed afterwards.
# NOTE(review): every entry in team_mapping currently maps a name to itself,
# so this step changes nothing until alias spellings are added as keys.
data['opponent'] = data['opponent'].replace(team_mapping)

# Replace the three categorical columns with integer codes. Each column gets
# its own LabelEncoder so the fitted encoders can be saved and reused to
# translate between the original labels and their codes.
le_team, le_opponent, le_venue = LabelEncoder(), LabelEncoder(), LabelEncoder()

for column, encoder in (
    ('team', le_team),
    ('opponent', le_opponent),
    ('venue', le_venue),
):
    # fit_transform learns the label set of the column and returns its codes.
    data[column] = encoder.fit_transform(data[column])

# Per-team mean statistics, computed separately for home and away matches.
# 'venue' holds integer codes at this point, so the code assigned to each
# venue label is looked up through the fitted encoder before filtering.
stat_columns = ['team', 'gf', 'ga', 'xg', 'xga', 'poss']

home_code = le_venue.transform(['Home'])[0]
home_rows = data.loc[data['venue'] == home_code, stat_columns]
home_stats = home_rows.groupby('team').mean()

away_code = le_venue.transform(['Away'])[0]
away_rows = data.loc[data['venue'] == away_code, stat_columns]
away_stats = away_rows.groupby('team').mean()

# Build the feature matrix X and the target vector y.
# NOTE(review): 'gf' and 'ga' presumably hold goals for/against in the same
# match whose 'result' is being predicted; if so they determine the target
# directly (target leakage) - confirm this feature set is intended.
feature_columns = ['team', 'opponent', 'gf', 'ga', 'xg', 'xga', 'poss']
X = data[feature_columns]

# Numeric class labels: 2 = Win, 1 = Draw, 0 = Loss. Any 'result' value
# outside this table becomes NaN in y.
result_codes = {'W': 2, 'D': 1, 'L': 0}
y = data['result'].map(result_codes)

# Split data into training and testing sets BEFORE scaling, so that the
# scaler's mean/variance are learned from training rows only. Fitting the
# scaler on the full dataset first would leak test-set statistics into the
# preprocessing step. Same test_size and random_state as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: fit on the training split, then apply that same fitted
# transform to the held-out split. Both results are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled copy of the full feature matrix, kept so that any later code that
# reads X_scaled keeps working; it uses the training-only scaler statistics.
X_scaled = scaler.transform(X)

# Fit a random forest on the training split. The tree count and the seed are
# fixed so repeated runs on the same data build the same model.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Persist the fitted model, scaler, and label encoders, one pickle file per
# object, written to the current working directory in the order listed.
artifacts = (
    ('model.pkl', model),
    ('scaler.pkl', scaler),
    ('le_team.pkl', le_team),
    ('le_opponent.pkl', le_opponent),
    ('le_venue.pkl', le_venue),
)

for filename, artifact in artifacts:
    # Binary mode is required by pickle; the context manager closes the file.
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Pickle files created successfully.")


# Output: Pickle files created successfully.
