In [1]:
import pandas as pd
from math import sqrt
import random

# Importing Datasets

In [2]:
# Read the four raw CSV inputs in a fixed order and bind each one to its
# module-level name: matches, players, cups and stats.
matches, players, cups, stats = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../WorldCupMatches.csv",  # World Cup Matches
        "../WorldCupPlayers.csv",  # World Cup Players
        "../WorldCups.csv",        # World Cups
        "../PlayersStats.csv",     # Player Stats
    )
]

  interactivity=interactivity, compiler=compiler, result=result)


# Match Players with Matches

# Add Player's overall FIFA score

### Define `final` DataFrame

In [3]:
# Layout of `final`: every column of `matches`, followed by — for the home
# side and then the away side — one coach-name column and, for each of the
# 11 player slots, one column per column of `stats`.
new_columns = []
for side in ("Home", "Away"):
    new_columns.append(side + " Coach Name")
    new_columns.extend(
        side + " Player " + str(slot) + " " + stat_name
        for slot in range(1, 12)
        for stat_name in stats.columns
    )

columns = list(matches.columns) + new_columns

# Empty frame that only carries the column layout; rows are added later.
final = pd.DataFrame(columns=columns)

In [4]:
# Sanity check: display the first 20 column labels of `final`.
final.columns[:20]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials',
       'Home Coach Name', 'Home Player 1 Name', 'Home Player 1 Age',
       'Home Player 1 Overall', 'Home Player 1 Potential',
       'Home Player 1 Acceleration', 'Home Player 1 Aggression',
       'Home Player 1 Agility', 'Home Player 1 Balance',
       'Home Player 1 Ball control'],
      dtype='object')

### Preparing utility functions

In [5]:
# Order the stats table by player name and renumber its rows 0..n-1, so that
# positional (`iloc`) access walks the table in name order.
# `reset_index(drop=True)` discards the old index directly; the previous
# `reset_index().drop("index", axis=1)` would have dropped the wrong column if
# `stats` ever contained a column literally named "index".
# NOTE(review): the sort key is the raw "Name" text, while the search helpers
# compare the first character of the *cleaned* name — confirm the two
# orderings agree closely enough for the jump search to be meaningful.
stats = stats.sort_values(by="Name").reset_index(drop=True)

In [6]:
def clean(x):
    """Normalise a person's name into one lowercase string used for matching.

    The name is lower-cased, stripped and split on single spaces. Then:

    * if there are four or more tokens and the last one contains "(" or ")",
      that trailing token is dropped;
    * if more than one token remains, the first token containing "." (an
      abbreviated initial such as "c.") is dropped;
    * the remaining tokens are concatenated without any separator.

    A single-token name is returned as is (after lower-casing/stripping).

    Names with one to four tokens give exactly the same result as the
    previous hand-written branches. Longer names used to fall through and
    return the raw token *list*; they now follow the same rules and always
    yield a string, which is what the callers index and compare.

    Parameters
    ----------
    x : str
        Raw name.

    Returns
    -------
    str
        The normalised name.
    """
    tokens = x.lower().strip().split(" ")

    # Drop a trailing parenthesised marker; only applied to names with at
    # least four tokens, as before.
    if len(tokens) >= 4 and ("(" in tokens[-1] or ")" in tokens[-1]):
        tokens = tokens[:-1]

    # Remove only the first dotted token, never the sole token of a name.
    if len(tokens) > 1:
        for position, token in enumerate(tokens):
            if "." in token:
                del tokens[position]
                break

    return "".join(tokens)

def find_start_index(name):
    """Return the inclusive lower row position of the search window for `name`.

    Walks the global `stats` table forward in blocks of about
    sqrt(len(stats)) rows until it reaches a row whose cleaned name does not
    start with a smaller character than the cleaned `name`, then steps back
    one block.

    The result is clamped at 0. Previously, when the very first row already
    stopped the scan, the function returned ``-jump_step``; used as an `iloc`
    position, a negative value silently wraps around to the end of the table.

    Parameters
    ----------
    name : str
        Raw name to look up.

    Returns
    -------
    int
        Row position in ``[0, len(stats))`` (0 when `stats` is empty).
    """
    name = clean(name)
    # At least 1, so the scan always advances on a non-empty table.
    jump_step = max(int(sqrt(len(stats))), 1)
    i = 0

    while i < len(stats) and name[0] > clean(stats.iloc[i]["Name"])[0]:
        i += jump_step

    # Step back one block to make sure the window is not started too late.
    return max(i - jump_step, 0)

def find_end_index(name):
    """Return the exclusive upper row position of the search window for `name`.

    Walks the global `stats` table backward from the last row in blocks of
    about sqrt(len(stats)) rows until it reaches a row whose cleaned name
    does not start with a larger character than the cleaned `name`, then
    steps forward one block.

    The result is clamped at ``len(stats)``. Previously, when the last row
    already stopped the scan, the function returned
    ``len(stats) - 1 + jump_step``; iterating up to that bound makes
    ``stats.iloc[i]`` raise IndexError.

    Parameters
    ----------
    name : str
        Raw name to look up.

    Returns
    -------
    int
        Row position in ``[0, len(stats)]``, suitable as a `range` stop.
    """
    name = clean(name)
    # At least 1, so the scan always advances on a non-empty table.
    jump_step = max(int(sqrt(len(stats))), 1)
    i = len(stats) - 1

    while i >= 0 and name[0] < clean(stats.iloc[i]["Name"])[0]:
        i -= jump_step

    # Step forward one block to make sure the window is not ended too early.
    return min(i + jump_step, len(stats))

def similarity_score(a, b):
    """Score how well two names agree, position by position.

    Both names are normalised with `clean`, then compared character by
    character over the length of the shorter one.

    Parameters
    ----------
    a, b : str
        Raw names.

    Returns
    -------
    float or int
        Fraction (0..1) of compared positions holding the same character.
        Returns 0 when either cleaned name is empty; this case used to raise
        ZeroDivisionError.
    """
    a = clean(a)
    b = clean(b)
    length = min(len(a), len(b))

    if length == 0:
        # Nothing to compare: treat as "no similarity" rather than dividing by zero.
        return 0

    # zip() stops at the shorter name, i.e. exactly `length` positions.
    hits = sum(1 for char_a, char_b in zip(a, b) if char_a == char_b)
    return hits / length

In [7]:
def find_team_members_names(team_initials):
    """Return a coach name followed by exactly 11 player names for a team.

    All rows of the global `players` table whose "Team Initials" equal
    `team_initials` are collected (the table is not filtered by match, and
    repeated names are not removed).

    * More than 11 players: a uniformly random subset of 11 is kept, in
      table order.
    * Fewer than 11 players: the list is padded with repeats of the team's
      own players, spread as evenly as possible.
    * No matching row at all: the placeholder "coach" is returned 12 times,
      as before, so callers still receive a full-length list.

    Changes from the previous implementation:

    * padding is drawn from the players only; it used to draw from the whole
      list, so the coach's name could be inserted as a player;
    * the shared, never-reset retry counter is gone;
    * the team is selected with one vectorised filter instead of three
      `iloc` lookups per row of `players`.

    Parameters
    ----------
    team_initials : str
        Value to match against the "Team Initials" column.

    Returns
    -------
    list
        ``[coach_name, player_1, ..., player_11]`` (always 12 items).
    """
    squad_size = 11

    team_rows = players[players["Team Initials"] == team_initials]
    if team_rows.empty:
        return ["coach"] * (squad_size + 1)

    # The first matching row supplies the coach, as before.
    coach_name = team_rows["Coach Name"].iloc[0]
    player_names = team_rows["Player Name"].tolist()

    if len(player_names) > squad_size:
        # Same distribution as deleting random entries one at a time:
        # a uniform subset whose relative order is preserved.
        kept_positions = sorted(random.sample(range(len(player_names)), squad_size))
        player_names = [player_names[position] for position in kept_positions]
    elif len(player_names) < squad_size:
        # Cycle through a shuffled copy so that no player is repeated a
        # second time before every player has been repeated once.
        pool = list(player_names)
        random.shuffle(pool)
        missing = squad_size - len(player_names)
        player_names += [pool[extra % len(pool)] for extra in range(missing)]

    return [coach_name] + player_names

In [10]:
def find_player_stats(name, debug=False):
    """Return the `stats` row whose "Name" best matches `name`.

    The candidates are the rows between `find_start_index(name)` and
    `find_end_index(name)`. Each is scored with `similarity_score`; the first
    row reaching the highest strictly positive score wins. If no candidate
    scores above 0, a uniformly random row of `stats` is returned instead.

    The window is clamped to ``[0, len(stats)]`` before iterating. An
    unclamped negative start makes `iloc` wrap around to the end of the
    table, and an end past the last row raises IndexError.

    Parameters
    ----------
    name : str
        Raw player name to search for.
    debug : bool, default False
        When true, print the searched name and the matched name for every
        successful lookup.

    Returns
    -------
    pandas.Series
        One row of the global `stats` table.
    """
    first = max(find_start_index(name), 0)
    last = min(find_end_index(name), len(stats))

    # Fetch the column once instead of materialising a full row per candidate.
    candidate_names = stats["Name"]

    best_match_index = -1
    best_score = 0
    for i in range(first, last):
        current_score = similarity_score(name, candidate_names.iloc[i])
        if current_score > best_score:
            best_score = current_score
            best_match_index = i

    if best_match_index == -1:
        # No usable match: pick a random player's stats.
        return stats.iloc[random.randint(0, len(stats) - 1)]

    if debug:
        print("Search: ", name, " | ", "Best match: ", stats.iloc[best_match_index]["Name"])
    return stats.iloc[best_match_index]

### Define `run_merging`

In [14]:
def run_merging(debug=False):
    """Append one merged row per match to the global `final` frame.

    For every row of `matches`, the row's own values are followed by, for the
    home team and then the away team, the coach's name and the full `stats`
    row of each of the 11 names returned by `find_team_members_names`.

    Rows are collected in a list and added to `final` in a single step.
    Growing the frame with `DataFrame.append` inside the loop copies the
    whole frame on every iteration, and that method no longer exists in
    pandas 2.x. The flush runs in a `finally` block, so an interrupted run
    (e.g. KeyboardInterrupt) still keeps the rows built so far.

    Parameters
    ----------
    debug : bool, default False
        Forwarded to `find_player_stats`, which prints each successful match.

    Raises
    ------
    ValueError
        If a built row does not have exactly one value per column of `final`.
    """
    global final

    merged_rows = []
    try:
        for i in range(len(matches)):
            match_row = matches.iloc[i]

            # # Add players' (and coach's) names # #
            home_team_names = find_team_members_names(match_row["Home Team Initials"])
            away_team_names = find_team_members_names(match_row["Away Team Initials"])

            # # Aggregate data: match values, then home side, then away side # #
            row = list(match_row)
            for team_names in (home_team_names, away_team_names):
                row.append(team_names[0])  # coach's name
                for player_name in team_names[1:]:
                    # # Add player's FIFA stats # #
                    row += list(find_player_stats(player_name, debug=debug))

            # Fail on the offending match instead of at the end of a long run.
            if len(row) != len(final.columns):
                raise ValueError(
                    "merged row for match %d has %d values but `final` has %d columns"
                    % (i, len(row), len(final.columns))
                )

            merged_rows.append(row)
    finally:
        # # Append collected data # #
        if merged_rows:
            new_rows = pd.DataFrame(merged_rows, columns=final.columns)
            if final.empty:
                final = new_rows
            else:
                final = pd.concat([final, new_rows], ignore_index=True)

### Run `run_merging` (takes about an hour)

In [16]:
# Fill `final` with one merged row per match; debug=True prints every
# searched player name together with the name it was matched to.
run_merging(debug=True)

Search:  Lucas RADEBE  |  Best match:  Lucas
Search:  RADEBE  |  Best match:  T. Hadebe
Search:  KHUNE  |  Best match:  I. Khune


KeyboardInterrupt: 

In [None]:
# Preview the first rows of the merged table.
final.head()

# Save `final` to a CSV file

In [None]:
# Write the merged table to disk without the row index.
final.to_csv("../final.csv", index=False)