In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [8]:
def calculate_team_statistic(player_ids, stat_dict):
    valid_player_ids = [player_id for player_id in player_ids if player_id != 0]
    if valid_player_ids:
        return sum(stat_dict.get(player_id, 0) for player_id in valid_player_ids) / len(valid_player_ids)
    else:
        return 0
    
def playeravg_features(training_set):
    player_stats = {}
    df = training_set

    # Iterate over each match to collect player statistics
    for index, row in df.iterrows():
        # Extract player IDs for both teams
        team1_players = [player_id for player_id in row[['Team 1 P1', 'Team 1 P2', 'Team 1 P3', 'Team 1 P4', 'Team 1 P5', 'Team 1 P6', 'Team 1 P7', 'Team 1 P8']] if player_id != 0]
        team2_players = [player_id for player_id in row[['Team 2 P1', 'Team 2 P2', 'Team 2 P3', 'Team 2 P4', 'Team 2 P5', 'Team 2 P6', 'Team 2 P7', 'Team 2 P8']] if player_id != 0]

        for player_id in team1_players:
            player_stats[player_id] = player_stats.get(player_id, {"matches_played": 0, "matches_won": 0, "total_GS": 0, "total_GC": 0})
            player_stats[player_id]["matches_played"] += 1
            player_stats[player_id]["total_GS"] += row['Team 1 Goals']
            player_stats[player_id]["total_GC"] += row['Team 2 Goals']
            if row['Team 1 Result'] == 1:  # Team 1 won
                player_stats[player_id]["matches_won"] += 1

        for player_id in team2_players:
            player_stats[player_id] = player_stats.get(player_id, {"matches_played": 0, "matches_won": 0, "total_GS": 0, "total_GC": 0})
            player_stats[player_id]["matches_played"] += 1
            player_stats[player_id]["total_GS"] += row['Team 2 Goals']
            player_stats[player_id]["total_GC"] += row['Team 1 Goals']
            if row['Team 2 Result'] == 1:  # Team 2 won
                player_stats[player_id]["matches_won"] += 1
    
    # Calculate win percentage, average goals, and average goals conceded for each player
    win_percentage_dict = {}
    avg_GS_dict = {}
    avg_GC_dict = {}

    for player_id, stats in player_stats.items():
        win_percentage_dict[player_id] = (stats["matches_won"] / stats["matches_played"]) * 100 if stats["matches_played"] > 0 else 0
        avg_GS_dict[player_id] = stats["total_GS"] / stats["matches_played"]
        avg_GC_dict[player_id] = stats["total_GC"] / stats["matches_played"]
    
    return win_percentage_dict, avg_GS_dict, avg_GC_dict

def create_testdataset(team1_players, team2_players, player_id_dict):
    # Create DataFrame for the match
    match_data = pd.DataFrame(columns=['Match ID', 'Team 1 P1', 'Team 1 P2', 'Team 1 P3', 'Team 1 P4', 'Team 1 P5', 'Team 1 P6', 'Team 1 P7', 'Team 1 P8',
                                        'Team 2 P1', 'Team 2 P2', 'Team 2 P3', 'Team 2 P4', 'Team 2 P5', 'Team 2 P6', 'Team 2 P7', 'Team 2 P8',
                                        'Team 1 Goals', 'Team 2 Goals', 'Team 1 Result', 'Team 2 Result'])

    # Assign players for Team 1
    for i, player in enumerate(team1_players):
        match_data.at[0, f'Team 1 P{i+1}'] = player

    # Assign players for Team 2
    for i, player in enumerate(team2_players):
        match_data.at[0, f'Team 2 P{i+1}'] = player

    match_data.fillna(0, inplace=True)

    # Replace player names with player IDs
    for column in match_data.columns:
        if column.startswith('Team 1 P') or column.startswith('Team 2 P'):
            match_data[column] = match_data[column].apply(lambda x: player_id_dict.get(x, x))
    
    return match_data

def apply_playerfeatures (win_percentage_dict, avg_GS_dict, avg_GC_dict, dataset):
    df = dataset
    for index, row in df.iterrows():
        df['team1_win_percentage'] = df.apply(lambda row: calculate_team_statistic(row[['Team 1 P1', 'Team 1 P2', 'Team 1 P3', 'Team 1 P4', 'Team 1 P5', 'Team 1 P6', 'Team 1 P7', 'Team 1 P8']], win_percentage_dict), axis=1)
        df['team2_win_percentage'] = df.apply(lambda row: calculate_team_statistic(row[['Team 2 P1', 'Team 2 P2', 'Team 2 P3', 'Team 2 P4', 'Team 2 P5', 'Team 2 P6', 'Team 2 P7', 'Team 2 P8']], win_percentage_dict), axis=1)
        df['team1_avg_goals'] = df.apply(lambda row: calculate_team_statistic(row[['Team 1 P1', 'Team 1 P2', 'Team 1 P3', 'Team 1 P4', 'Team 1 P5', 'Team 1 P6', 'Team 1 P7', 'Team 1 P8']], avg_GS_dict), axis=1)
        df['team2_avg_goals'] = df.apply(lambda row: calculate_team_statistic(row[['Team 2 P1', 'Team 2 P2', 'Team 2 P3', 'Team 2 P4', 'Team 2 P5', 'Team 2 P6', 'Team 2 P7', 'Team 2 P8']], avg_GS_dict), axis=1)
        df['team1_avg_goalsconceded'] = df.apply(lambda row: calculate_team_statistic(row[['Team 1 P1', 'Team 1 P2', 'Team 1 P3', 'Team 1 P4', 'Team 1 P5', 'Team 1 P6', 'Team 1 P7', 'Team 1 P8']], avg_GC_dict), axis=1)
        df['team2_avg_goalsconceded'] = df.apply(lambda row: calculate_team_statistic(row[['Team 2 P1', 'Team 2 P2', 'Team 2 P3', 'Team 2 P4', 'Team 2 P5', 'Team 2 P6', 'Team 2 P7', 'Team 2 P8']], avg_GC_dict), axis=1)

    columns_to_convert = ['team1_win_percentage', 'team2_win_percentage', 'team1_avg_goals', 'team2_avg_goals', 'team1_avg_goalsconceded', 'team2_avg_goalsconceded']
    df[columns_to_convert] = df[columns_to_convert].astype('float64')
    
    return df

In [9]:
training_dataset = pd.read_csv("ML_Data_Set.csv")
player_ids = pd.read_csv("Player keys.csv")

In [10]:
#Who are the teams you want to run through the model?
team1_players = ['Waq', 'Jamie', 'Saeed', 'Sam', 'Jason', 'Yusuf', 'Carlos', 'Mo.O']
team2_players = ['Saqi', 'Jake', 'Satpal', 'Ashley', 'Shyam', 'Riz', 'Gergo', 'Saj']

In [11]:
player_id_dict = dict(zip(player_ids['player name'], player_ids['player_ID']))

test_dataset = create_testdataset(team1_players, team2_players, player_id_dict)

win_percentage_dict, avg_GS_dict, avg_GC_dict = playeravg_features(training_dataset)

In [12]:
clf = RandomForestClassifier()
reg_team1 = RandomForestRegressor()
reg_team2 = RandomForestRegressor()

def training_clf(training_dataset):
    df = apply_playerfeatures(win_percentage_dict, avg_GS_dict, avg_GC_dict, training_dataset)
    df = df.select_dtypes(exclude=['object']) 
    df["Team 1 Win"] = df["Team 1 Result"].apply(lambda x: 1 if x == 1 else 0)
    x_clf = df.drop(["Team 1 Result", "Team 2 Result", "Team 1 Win", "Team 1 Goals", "Team 2 Goals"], axis=1)
    y_clf = df["Team 1 Win"]
    clf.fit(x_clf, y_clf)
    return

def training_rfg(training_dataset):
    df = apply_playerfeatures(win_percentage_dict, avg_GS_dict, avg_GC_dict, training_dataset)
    df = df.select_dtypes(exclude=['object']) 
    x_reg = df.drop(["Team 1 Result", "Team 2 Result", "Team 1 Goals", "Team 2 Goals"], axis=1)
    y_reg_team1 = df["Team 1 Goals"]
    y_reg_team2 = df["Team 2 Goals"] 
    reg_team1.fit(x_reg, y_reg_team1)
    reg_team2.fit(x_reg, y_reg_team2)
    return

def predict_scores(test_dataset):
    df = apply_playerfeatures(win_percentage_dict, avg_GS_dict, avg_GC_dict, test_dataset)
    df = df.select_dtypes(exclude=['object'])
    x_test = df.drop(["Team 1 Result", "Team 2 Result", "Team 1 Goals", "Team 2 Goals"], axis=1)
    team1_goals = reg_team1.predict(x_test)
    team2_goals = reg_team2.predict(x_test)
    team1_win = clf.predict(x_test)
    return team1_goals, team2_goals, team1_win

training_clf(training_dataset)
training_rfg(training_dataset)
team1_goals, team2_goals, team1_win = predict_scores(test_dataset)

In [13]:
# Print predicted goals
print("Predicted Goals:")
print("Team 1:", team1_goals)
print("Team 2:", team2_goals)
print("\nPredicted Results:")
if team1_win == 1:
    print("Team 1")
else:
    print("Team 2")

Predicted Goals:
Team 1: [10.4]
Team 2: [4.84]

Predicted Results:
Team 1
