In [1]:
import pickle
import random
from math import sqrt

import pandas as pd

# Importing Datasets

In [2]:
# World Cup Matches
matches = pd.read_csv("../WorldCupMatches.csv")

# wwmatches
wwmatches = pd.read_csv("../wwmatches.csv")

# Player Stats
stats = pd.read_csv("../PlayersStats.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Load `team_players` binary

In [3]:
with open("../../site/server/match_predictor/ml_data/team_players.b", "rb") as f:
    team_players = pickle.load(f)

# Narrow the dataset to 2018 FIFA participants

In [4]:
matches = matches.append(wwmatches, sort=True)
matches = matches.reset_index().drop("index", axis=1)

In [5]:
new_matches = pd.DataFrame(columns=matches.columns)

for i in range(len(matches)):
    if matches["Home Team Name"][i] in team_players.keys() and matches["Away Team Name"][i] in team_players.keys():
        new_matches = new_matches.append(matches.iloc[i])
        
matches = new_matches

In [6]:
matches = matches.reset_index().drop("index", axis=1)

In [7]:
assert len(matches["Home Team Name"].unique()) == len(team_players.keys())

In [8]:
len(matches)

603

In [9]:
matches.columns

Index(['Away Team Goals', 'Away Team Name', 'Home Team Goals',
       'Home Team Name'],
      dtype='object')

# Match players with matches

# Add player's FIFA stats

### Define `final` DataFrame

In [10]:
columns = list(matches.columns)

for i in range(1, 12):
    columns.append("Home Player " + str(i) + " Overall")

for i in range(1, 12):
    columns.append("Away Player " + str(i) + " Overall")

final = pd.DataFrame(columns=columns)

### Preparing utility functions

In [11]:
stats = stats.sort_values(by="Name")
stats = stats.reset_index().drop("index", axis=1)

In [12]:
def clean(x):
    x = x.lower().strip().split(" ")

    if len(x) == 1:
        x = x[0]

    elif len(x) == 2:
        if "." in x[0]:
            x = x[1]
        elif "." in x[1]:
            x = x[0]
        else:
            x = x[0] + x[1]

    elif len(x) == 3:
        if "." in x[0]:
            x = x[1] + x[2]
        elif "." in x[1]:
            x = x[0] + x[2]
        elif "." in x[2]:
            x = x[0] + x[1]
        else:
            x = x[0] + x[1] + x[2]

    elif len(x) == 4:
        if "(" in x[3] or ")" in x[3]:
            if "." in x[0]:
                x = x[1] + x[2]
            elif "." in x[1]:
                x = x[0] + x[2]
            elif "." in x[2]:
                x = x[0] + x[1]
            else:
                x = x[0] + x[1] + x[2]
        else:
            if "." in x[0]:
                x = x[1] + x[2] + x[3]
            elif "." in x[1]:
                x = x[0] + x[2] + x[3]
            elif "." in x[2]:
                x = x[0] + x[1] + x[3]
            elif "." in x[3]:
                x = x[0] + x[1] + x[2]
            else:
                x = x[0] + x[1] + x[2] + x[3]
            
    else:
        pass
    
    return x

def find_start_index(name):
    name = clean(name)
    i = 0
    jump_step = int(sqrt(len(stats)))
    
    while i < len(stats) and name[0] > clean(stats.iloc[i]["Name"])[0]:
        i += jump_step
    
    if i >= jump_step:
        i -= jump_step

    return i

def find_end_index(name):
    name = clean(name)
    i = len(stats)-1
    jump_step = int(sqrt(len(stats)))
    
    while i >= 0 and name[0] < clean(stats.iloc[i]["Name"])[0]:
        i -= jump_step

    if i < len(stats)-jump_step:
        i += jump_step

    return i

def similarity_score(a, b):
    a = clean(a)
    b = clean(b)
    count = 0
    length = min(len(a), len(b))
    
    for i in range(length):
        if a[i] == b[i]:
            count+=1
            
    return count/length

def find_player_stats(name, debug=False):
    best_match_index = -1
    best_score = 0
            
    for i in range(find_start_index(name), find_end_index(name)):      
        current_score = similarity_score(name, stats.iloc[i]["Name"])
        if current_score > best_score:
            best_score = current_score
            best_match_index = i
            
    if best_match_index != -1:
        if debug:
            print("Search: ", name, " | ", "Best match: ", stats.iloc[best_match_index]["Name"])
        return stats.iloc[best_match_index]
        
    else:
        # Pick a random player's stat
        return stats.iloc[random.randint(0, len(stats)-1)]
    
def get_nbest_scores(team_name, n=11):
    players = team_players[team_name]
    scores = []

    # pprint(players)
    for i in range(len(players)):
        try:
            scores.append(int(players[i][1]))

        except:
            player_score = int(find_player_stats(players[i])["Overall"])
            scores.append(player_score)
            team_players[team_name][i] = [players[i], player_score]

    scores.sort()
    return scores[:n]

### Define `merging`

In [13]:
def run_merging(debug=False):
    global final
    
    for i in range(len(matches)):
        home_team_name = matches["Home Team Name"][i]
        away_team_name = matches["Away Team Name"][i]

        # # Add player's FIFA Overall scores # #
        home_players_stats = get_nbest_scores(home_team_name)
        away_players_stats = get_nbest_scores(away_team_name)

        # # Agregate data # #
        data = list(matches.iloc[i])
        data += home_players_stats
        data += away_players_stats

        # # Append current data # # 
        final = final.append(pd.Series(data, index=final.columns), ignore_index=True)

### Run `merging` (will take about an hour)

In [14]:
run_merging(debug=False)

In [15]:
final.head()

Unnamed: 0,Away Team Goals,Away Team Name,Home Team Goals,Home Team Name,Home Player 1 Overall,Home Player 2 Overall,Home Player 3 Overall,Home Player 4 Overall,Home Player 5 Overall,Home Player 6 Overall,...,Away Player 2 Overall,Away Player 3 Overall,Away Player 4 Overall,Away Player 5 Overall,Away Player 6 Overall,Away Player 7 Overall,Away Player 8 Overall,Away Player 9 Overall,Away Player 10 Overall,Away Player 11 Overall
0,0.0,France,0.0,Uruguay,59,60,62,66,66,66,...,56,56,60,60,62,62,63,63,64,64
1,0.0,Nigeria,1.0,Argentina,52,56,56,57,60,61,...,56,58,59,61,61,62,62,63,64,65
2,0.0,Australia,4.0,Germany,56,59,60,61,61,62,...,57,57,59,59,59,62,62,63,64,64
3,1.0,Switzerland,0.0,Spain,60,62,63,63,65,68,...,57,60,61,62,62,63,64,64,65,66
4,1.0,Korea Republic,4.0,Argentina,52,56,56,57,60,61,...,60,60,60,60,60,60,60,60,62,67


# Save `final` into CSV file and `team_players` into binary file

In [16]:
final.to_csv("../final.csv", index=0)

In [17]:
with open("../../site/server/match_predictor/ml_data/team_players.b", "wb") as f:
    pickle.dump(team_players, f)