# Scenario 3 - Active Manager
## Import required libraries

In [None]:
import numpy
import pandas
from numpy import arange
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from rsome import ro
from rsome import grb_solver

## 1. Game Week Data

### Import all gw data (16-21)
The current season's data (21/22) will not be included since it is being updated weekly.

In [None]:
# One dataframe per season file gw_16_17.csv ... gw_20_21.csv.
# encoding_errors='ignore' makes read_csv skip bytes it cannot decode.
gw_df_list = [
    pandas.read_csv(
        filepath_or_buffer=f'./raw_data/scenario_3/gw_{start_year}_{start_year + 1}.csv',
        encoding_errors='ignore',
    )
    for start_year in range(16, 21)
]

### Define helper functions

In [None]:
def append_season(df: pandas.DataFrame, season: str) -> None:
    """
    Tags every row of df with a season label, in place.
    :param df: Dataframe that receives (or has overwritten) the "season" column.
    :param season: Label written to every row of that column.
    """
    season_column = "season"
    df[season_column] = season

def clean_names(df: pandas.DataFrame) -> None:
    """
    Tidies the 'name' column of df in place: trailing digits and underscores
    are stripped, then any remaining underscores become spaces.
    :param df: Dataframe whose 'name' column is rewritten.
    """
    def _tidy(raw_name):
        without_suffix = raw_name.rstrip('_0123456789')
        return without_suffix.replace('_', ' ')

    df['name'] = df['name'].map(_tidy)

def fill_missing_gw(dataframe: pandas.DataFrame, weeks_per_season: int = 38) -> pandas.DataFrame:
    """
    Fills missing game weeks in dataframe.

    Every player ends up with one row per game week: player k occupies rows
    k * weeks_per_season ... (k + 1) * weeks_per_season - 1 of the result, and
    row position within that block is GW - 1. Weeks that had no row are
    created with the player's name, the matching GW, the 'value' of the
    player's first existing row and total_points = 0. All other columns of a
    created row are NaN.

    The input frame is not modified.

    :param dataframe: Dataframe to be cleaned. Needs the columns 'name', 'GW',
        'value' and 'total_points'; rows of one player must be adjacent and
        GW is expected to lie in 1..weeks_per_season (not checked here).
        An existing 'index' column is overwritten and becomes the row index.
    :param weeks_per_season: Number of game weeks reserved per player.
    :return: Cleaned dataframe, indexed 0..n-1 under the index name "index".
    """
    if dataframe.empty:
        # Nothing to fill; hand back an independent copy.
        return dataframe.copy()

    names = dataframe['name'].tolist()
    game_weeks = dataframe['GW'].tolist()

    # Target slot of every existing row: start of the player's block + GW - 1.
    # Walking the values positionally keeps this independent of row labels.
    slots = []
    block_start = 0
    current_name = names[0]
    for player, game_week in zip(names, game_weeks):
        if player != current_name:
            block_start += weeks_per_season
            current_name = player
        slots.append(block_start + game_week - 1)

    # Work on a copy so the caller's frame does not gain an 'index' column.
    dataframe = dataframe.copy()
    dataframe['index'] = slots

    # Reindexing onto the full slot range creates all-NaN rows for the gaps.
    new_index = pandas.Index(arange(0, int(max(slots)) + 1), name="index")
    dataframe = dataframe.set_index(keys="index").reindex(new_index)

    # Fill NaN rows with name, gw accordingly. Set points = 0
    total_rows = len(dataframe)
    for block_first in range(0, total_rows, weeks_per_season):
        # The first existing row of the block supplies name and value.
        probe = block_first
        while pandas.isna(obj=dataframe.at[probe, 'name']):
            probe += 1
        name = dataframe.at[probe, 'name']
        value = dataframe.at[probe, 'value']

        # The last player's block may be shorter than a full season.
        block_end = min(block_first + weeks_per_season, total_rows)
        for row in range(block_first, block_end):
            if pandas.isna(obj=dataframe.at[row, 'name']):
                dataframe.at[row, 'name'] = name
                dataframe.at[row, 'GW'] = (row % weeks_per_season) + 1
                dataframe.at[row, 'value'] = value
                dataframe.at[row, 'total_points'] = 0
    return dataframe

### Use helper functions to clean gw data

In [None]:
# Tag each season's frame with its label, e.g. '2016/2017'
for start_year, season_df in zip(range(16, 21), gw_df_list):
    append_season(df=season_df, season=f'20{start_year}/20{start_year + 1}')

# Clean names, echoing the first row's name before and after the clean-up
for season_df in gw_df_list:
    print("before: '" + season_df.loc[0, 'name'], end="' | ")
    clean_names(df=season_df)
    print("after: '" + season_df.loc[0, 'name'] + "'")

# Stack all seasons into one frame (each frame keeps its own row labels)
gw_df = pandas.concat(objs=gw_df_list)
gw_df

### Clean data to get average scores for each player in each gameweek

In [None]:
# Player name -> position, built only from rows whose position is known.
# When a name occurs more than once, the last row wins.
non_null_pos = gw_df.dropna(subset=['position'])
names_dict = dict(zip(non_null_pos['name'], non_null_pos['position']))

# Player name -> team, built the same way from rows whose team is known.
non_null_teams = gw_df.dropna(subset=['team'])
teams_dict = dict(zip(non_null_teams['name'], non_null_teams['team']))

### Clean data to get cumulative weighted scores for each player in each game week
Scores for each game week in this year will be a weighted average of that week's score and the player's scores from past games.

In [None]:
# Combine previous years dataset: per-player mean of every numeric column
# over all loaded seasons except the last one in gw_df_list.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises
# TypeError on text columns (e.g. 'season') instead of silently dropping them.
prev_df = pandas.concat(objs=gw_df_list[:-1]).groupby(by=['name']).mean(numeric_only=True)
prev_df.reset_index(inplace=True)
prev_df

In [None]:
# Clean current year dataset: one row per (player, game week), holding the mean
# of the numeric columns over that player's rows in that week.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises
# TypeError on text columns instead of silently dropping them.
# groupby() does not modify its source frame, so no defensive copy is needed.
curr_df = gw_df_list[-1].groupby(by=['name', 'GW']).mean(numeric_only=True)
curr_df.reset_index(inplace=True)
# Discard game weeks beyond the 38-week season
curr_df.drop(index=curr_df[curr_df['GW'] > 38].index, inplace=True)
curr_df.sort_values(by=['name', 'GW'], inplace=True)
# Relabel rows 0..n-1 (the old labels are kept in an 'index' column)
curr_df.reset_index(inplace=True)
curr_df

In [None]:
# Keep only historical rows for players that also appear in the current-year dataset
name_list = curr_df['name'].drop_duplicates().tolist()
played_this_year = prev_df['name'].isin(values=name_list)
prev_df = prev_df.loc[played_this_year]
prev_df

In [None]:
# Give every player a row for every game week (created rows score 0 points)
curr_df = fill_missing_gw(curr_df)
curr_df

In [None]:
def weight_df(curr_dataframe: pandas.DataFrame, prev_dataframe: pandas.DataFrame, curr_weight: float, names_dictionary: dict, teams_dictionary: dict) -> pandas.DataFrame:
    """
    Weight current player data and historical player data differently.

    Rows are processed top to bottom. A GW 1 row is blended with the player's
    historical 'total_points' from prev_dataframe (left untouched when the
    player has no usable history). Every other row is blended with the
    already-weighted 'total_points' of the row directly above it, so the
    frame must be labelled 0..n-1 and each player's rows must be adjacent,
    in GW order, starting at GW 1.

    Neither input frame is modified.

    :param curr_dataframe: Dataframe containing current player data.
    :param prev_dataframe: Dataframe containing historical player data.
    :param curr_weight: Weight of current week (weight = 0.75 means 75% curr, 25% prev).
    :param names_dictionary: Dictionary of players and their positions.
    :param teams_dictionary: Dictionary of players and their teams.
    :return: Weighted dataframe with columns Name, GW, Position, Club,
        Total Points (rounded to 2 decimals) and Cost. Players without a
        position in names_dictionary are dropped.
    """
    curr_dataframe = curr_dataframe.copy()
    prev_weight = 1 - curr_weight

    # Build the name -> historical points lookup once instead of masking
    # prev_dataframe for every GW 1 row. setdefault keeps the first match.
    prev_points = {}
    for prev_name, prev_total in zip(prev_dataframe['name'], prev_dataframe['total_points']):
        prev_points.setdefault(prev_name, prev_total)

    # Get weighted average of curr and prev
    for i in range(len(curr_dataframe)):
        if curr_dataframe.at[i, 'GW'] == 1:
            baseline = prev_points.get(curr_dataframe.at[i, 'name'])
            if baseline is None or pandas.isna(obj=baseline):
                # No usable history: the raw GW 1 score stands.
                continue
        else:
            baseline = curr_dataframe.at[i - 1, 'total_points']
        curr_dataframe.at[i, 'total_points'] = prev_weight * baseline + curr_weight * curr_dataframe.at[i, 'total_points']

    # Round to 2 decimal places
    curr_dataframe['total_points'] = curr_dataframe['total_points'].round(decimals=2)

    # Fill in position and team from the lookup dictionaries
    curr_dataframe['position'] = curr_dataframe["name"].map(names_dictionary)
    curr_dataframe['team'] = curr_dataframe["name"].map(teams_dictionary)

    # Drop NA positions
    weighted_avg_dataframe = curr_dataframe.dropna(subset=['position'])

    # Keep only useful columns
    weighted_avg_dataframe = weighted_avg_dataframe[['name', 'GW', 'position', 'team', 'total_points', 'value']]

    # Return a renamed frame rather than renaming a derived frame in place.
    return weighted_avg_dataframe.rename(columns={'name': 'Name', 'position': 'Position', 'team': 'Club', 'total_points': 'Total Points', 'value': 'Cost'})

In [None]:
# 10% current week, 90% history
weight = 0.1
weighted_avg_df = weight_df(
    curr_dataframe=curr_df,
    prev_dataframe=prev_df,
    curr_weight=weight,
    names_dictionary=names_dict,
    teams_dictionary=teams_dict,
)
weighted_avg_df.to_csv(path_or_buf=f'./clean_data/scenario_3/weighted_avg_{weight}.csv', index=True)

In [None]:
# 50% current week, 50% history
weight = 0.5
weighted_avg_df = weight_df(
    curr_dataframe=curr_df,
    prev_dataframe=prev_df,
    curr_weight=weight,
    names_dictionary=names_dict,
    teams_dictionary=teams_dict,
)
weighted_avg_df.to_csv(path_or_buf=f'./clean_data/scenario_3/weighted_avg_{weight}.csv', index=True)

## 2. Build Models
### Read files

In [None]:
# Load the two weighted datasets written earlier (current-week weight 0.1 and 0.5)
weighted_avg_df_10, weighted_avg_df_50 = (
    pandas.read_csv(filepath_or_buffer=f'./clean_data/scenario_3/weighted_avg_{weight_label}.csv')
    for weight_label in ('0.1', '0.5')
)

### Declare helper functions

In [None]:
# Points per player and game week from gw_21_22.csv, used to score the chosen
# line-ups; several rows for one player in one week are averaged.
evaluator_data = (
    pandas.read_csv(filepath_or_buffer='./raw_data/scenario_3/gw_21_22.csv')
    .loc[:, ['name', 'total_points', 'GW']]
    .groupby(by=['name', 'GW'])
    .mean()
    .reset_index()
)

In [None]:
def get_gw_data(data: pandas.DataFrame, game_week: int) -> pandas.DataFrame:
    """
    Selects the rows of one game week.
    :param data: Data containing all game weeks; must have a 'GW' column.
    :param game_week: Game week whose rows are wanted.
    :return: Rows of data whose 'GW' equals game_week, with their original labels.
    """
    in_requested_week = data['GW'].eq(game_week)
    return data.loc[in_requested_week]

def onehot_encode(data: pandas.Series) -> (numpy.ndarray, LabelEncoder):
    """
    One-hot encodes data, returns 2D array and encoder that
    can be used to reverse encoding to retrieve original data.

    Reference: https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/

    :param data: 1D data with categorical values.
    :returns encoded: 2D encoded values, one row per element of data.
    :returns label_encoder: Encoder, can be used to reverse encoding to retrieve original values.

    To reverse encoding: label_encoder.inverse_transform([argmax(encoded[0, :])])
    """
    # Categories -> integer codes, shaped as a single column for the one-hot encoder
    label_encoder = LabelEncoder()
    int_encoded = label_encoder.fit_transform(y=data).reshape(-1, 1)

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; try the new name first and fall back for older releases.
    try:
        onehot_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        onehot_encoder = OneHotEncoder(sparse=False)

    encoded = onehot_encoder.fit_transform(X=int_encoded)
    return encoded, label_encoder

def solve_gw(this_week_data: pandas.DataFrame, game_week_no: int, prev_gw_y_result: numpy.ndarray) -> (numpy.ndarray, numpy.float64, float):
    """
    Solves for the optimal players for the game week.

    Maximises the selected players' 'Total Points' minus a transfer penalty,
    subject to squad size, budget, per-club and per-position limits. From
    game week 2 onwards every transfer in beyond the first costs 4 points.

    :param this_week_data: This week's gw data. Needs the columns
        'Total Points', 'Cost', 'Position' and 'Club'.
    :param game_week_no: GW No. Week 1 has no transfer penalty.
    :param prev_gw_y_result: Previous week's 0/1 selection vector, in the same
        row order as this_week_data. Not used when game_week_no is 1.
    :return: (selection vector of exact 0.0/1.0 values, transfer penalty,
        objective value reported by the model).
    """
    is_first_week = game_week_no == 1

    # In 100,000
    price_budget = 1000
    x = this_week_data['Total Points'].to_numpy()
    p = this_week_data['Cost'].to_numpy()
    (pos_matrix, _) = onehot_encode(data=this_week_data['Position'])
    (t, _) = onehot_encode(data=this_week_data['Club'])
    max_players_per_team = 3
    max_players = 15

    # Position requirement: DF, FW, GK, MF
    # NOTE(review): the order must match the column order produced by
    # onehot_encode for the 'Position' labels - confirm against the data.
    position_req = numpy.array([5, 3, 2, 5])

    model = ro.Model(name="Game Week " + str(game_week_no))

    # Define binary decision variables - players to choose
    y = model.dvar(shape=len(this_week_data), vtype="B")

    # Define penalty variable (a plain 0 in week 1, where nothing is penalised)
    z = 0 if is_first_week else model.dvar(shape=1)

    # Additional dv for other gw - was player i transferred in?
    # Only created when the transfer constraints below actually use it.
    c = None if is_first_week else model.dvar(shape=len(this_week_data), vtype="B")

    # GW - penalty if transfer
    model.max(y @ x - z)

    model.st(y.sum() <= max_players)
    model.st(y @ p <= price_budget)
    model.st(y @ t <= max_players_per_team)
    model.st(y @ pos_matrix == position_req)

    if not is_first_week:
        # set C[i]=1 if player is transferred in (1-0), for calculation of penalty
        model.st(y - prev_gw_y_result <= c)

        # Penalty for transfers: z = max(0, 4 * sum(c) - 4)
        model.st(z >= 0, z >= 4 * c.sum() - 4)

    model.solve(solver=grb_solver, display=False)

    # A MIP solver may report binaries only up to its integrality tolerance
    # (e.g. 0.9999999); snap to exact 0/1 so callers can compare with == 1.
    chosen = (y.get() > 0.5).astype(float)
    penalty = numpy.float64(0) if is_first_week else z.get()[0]

    return chosen, penalty, model.get()

In [None]:
def run_all_gw(all_data: pandas.DataFrame, latest_data: pandas.DataFrame) -> None:
    """
    Runs our model for all game weeks, to select a lineup each week.

    For game weeks 1..38 the week's rows are solved with solve_gw, the chosen
    squad, transfers and penalty are printed, and two running totals are kept:
    the points according to all_data, and (for weeks present in latest_data)
    the points according to latest_data minus the transfer penalty.

    :param all_data: Dataframe of weighted player data. Contains all gameweeks of all players.
        Players must appear in the same row order in every game week, since
        selections are compared position by position across weeks.
    :param latest_data: Dataframe containing the latest year data to evaluate our model.
        Needs the columns 'name', 'GW' and 'total_points'.
    """
    curr_gw_y = None
    total_season_points = 0
    total_evaluated_season_points = 0
    # Last game week for which evaluation data exists (same for every iteration)
    last_evaluable_gw = latest_data['GW'].max()

    for i in range(1, 39):
        game_week_data = get_gw_data(data=all_data, game_week=i)
        prev_gw_y = curr_gw_y
        curr_gw_y, penalty, _ = solve_gw(this_week_data=game_week_data, game_week_no=i, prev_gw_y_result=prev_gw_y)
        # Solver output for binaries may be off by a tiny tolerance; snap to
        # exact 0/1 so the == 1 masks below cannot miss a selected player.
        curr_gw_y = (curr_gw_y > 0.5).astype(float)

        if i != 1:
            transfers_out = game_week_data[(prev_gw_y - curr_gw_y) == 1]['Name'].tolist()
            transfers_in = game_week_data[(curr_gw_y - prev_gw_y) == 1]['Name'].tolist()

            print("Transfer", len(transfers_out), "out:", transfers_out)
            print("Transfer", len(transfers_in), "in:", transfers_in)
            print("Penalty:", penalty)

        print("Game Week", i)

        selected_rows = game_week_data[curr_gw_y == 1]
        curr_team_df = selected_rows[['Name', 'Club', 'Position', 'Total Points', 'Cost']]

        print(curr_team_df)

        if i <= last_evaluable_gw:
            gw_data = latest_data[latest_data['GW'] == i]
            # Left merge: selected players missing from latest_data contribute nothing to the sum
            evaluated_total_points = selected_rows.merge(right=gw_data, how='left', left_on='Name', right_on='name')['total_points']
            evaluated_gw_points = evaluated_total_points.sum() - penalty
            total_evaluated_season_points += evaluated_gw_points
            print("Total points (evaluated):", evaluated_gw_points)

        week_points = sum(curr_team_df['Total Points'])
        print("Total points:", week_points)
        print("Total cost:", sum(curr_team_df['Cost']))

        total_season_points += week_points

        print()
        print()
    print("Total points earned in season:", total_season_points)
    print("Total evaluated points earned in season:", total_evaluated_season_points)

In [None]:
# Season simulation on the dataset built with current-week weight 0.1
run_all_gw(all_data=weighted_avg_df_10, latest_data=evaluator_data)

In [None]:
# Season simulation on the dataset built with current-week weight 0.5
run_all_gw(all_data=weighted_avg_df_50, latest_data=evaluator_data)