In [1]:
import pandas as pd
from unidecode import unidecode
from helper.df_handler import kor_get_last, kor_update_last, kor_update_first, kor_handle_u, manual_check, eng_remove_jr, manual_merge, eng_handle_rare_last_name, recover_dash, eng_check_possible_match, eng_update_birthday

In [2]:
# Input tables: one player list per source (statiz -> df_st, baseball reference -> df_br).
ST_PLAYER_CSV = '../data/st_player.csv'
BR_PLAYER_CSV = '../data/br_player.csv'

df_st = pd.read_csv(ST_PLAYER_CSV)
df_br = pd.read_csv(BR_PLAYER_CSV)

In [3]:
# data generalization (statiz)
# Builds 'name_new', the normalized name used as a merge key, from 'name_eng'.

# 1) Tidy the displayed English name: trim surrounding whitespace, turn '--'
#    into '-', then let recover_dash add the '-' for names that are missing it.
st_names = df_st['name_eng'].str.strip().str.replace('--', '-').apply(recover_dash)
df_st['name_eng'] = st_names

# 2) Derive the key: lower-case, '-' replaced by a space, accents stripped
#    via unidecode.
df_st['name_new'] = st_names.str.lower().str.replace('-', ' ').apply(unidecode)

# 3) Row-wise manual overrides for unconventional cases.
#    Order matters: this must run after step 2 and before the steps below.
df_st = df_st.apply(manual_check, axis=1)

# 4) Unify the 'young' / 'yung' spellings to 'yeong'. This is a plain substring
#    replacement applied to every name. Then replace each double space with a
#    single one.
st_key = df_st['name_new']
for variant in ('young', 'yung'):
    st_key = st_key.str.replace(variant, 'yeong')
st_key = st_key.str.replace('  ', ' ')

# 5) Helper passes, in order: generalize the last name, generalize the first
#    name, remove a trailing 'Jr'.
for normalizer in (kor_update_last, kor_update_first, eng_remove_jr):
    st_key = st_key.apply(normalizer)
df_st['name_new'] = st_key

# 6) Row-wise birthday correction (applied to the statiz table only).
df_st = df_st.apply(eng_update_birthday, axis=1)

In [4]:
# Micah Bowie in BR never played in KBO. It should have been Jesus Sanchez.
# The replacement values are assigned positionally across every column of the
# matching row(s).
# NOTE(review): this presumes df_br has exactly three columns in the order
# (name, date of birth, player URL) -- confirm against br_player.csv.
correct_data = (
    "Jesus Sanchez",
    "1974-10-11",
    "https://www.baseball-reference.com/players/s/sanchje01.shtml",
)
is_micah_bowie = df_br['name_eng'] == "Micah Bowie"
df_br.loc[is_micah_bowie, :] = correct_data

In [5]:
# data generalization (baseball reference)
# Builds 'name_new', the normalized name used as a merge key, from 'name_eng'.

# 1) Tidy the displayed English name: trim surrounding whitespace, turn '--'
#    into '-', then let recover_dash add the '-' for names that are missing it.
br_names = df_br['name_eng'].str.strip().str.replace('--', '-').apply(recover_dash)
df_br['name_eng'] = br_names

# 2) Derive the key: lower-case, '-' replaced by a space, accents stripped
#    via unidecode.
df_br['name_new'] = br_names.str.lower().str.replace('-', ' ').apply(unidecode)

# 3) Row-wise manual overrides for unconventional cases.
#    Order matters: this must run after step 2 and before the steps below.
df_br = df_br.apply(manual_check, axis=1)

# 4) Unify the 'young' / 'yung' spellings to 'yeong'. This is a plain substring
#    replacement applied to every name. Then replace each double space with a
#    single one.
br_key = df_br['name_new']
for variant in ('young', 'yung'):
    br_key = br_key.str.replace(variant, 'yeong')
br_key = br_key.str.replace('  ', ' ')

# 5) Helper passes, in order: generalize the Korean last name, generalize the
#    Korean first name, remove a trailing 'Jr'.
for normalizer in (kor_update_last, kor_update_first, eng_remove_jr):
    br_key = br_key.apply(normalizer)
df_br['name_new'] = br_key

In [6]:
# first merge after generalization
# Players are matched on the normalized name plus date of birth.
merge_keys = ['name_new', 'date_of_birth']
inner_join = pd.merge(df_br, df_st, on=merge_keys, how='inner')

# outer_join is only to pick up the leftovers, i.e. rows present in just one of
# the two sources. Leftovers are identified by merge provenance (indicator=True
# adds a '_merge' column valued 'left_only' / 'right_only' / 'both') rather than
# by "has any NaN": a row that matched on both sides but carries a missing value
# in some column (including a NaN date_of_birth key, which pandas matches
# against other NaN keys) is already in inner_join and must not be reported as
# left over as well.
outer_join = pd.merge(df_br, df_st, on=merge_keys, how='outer', indicator=True)
outer_join = outer_join[outer_join['_merge'] != 'both'].drop(columns='_merge')

In [7]:
# manual merge process
# Each pass hands the current matched / leftover frames to manual_merge together
# with one handler and takes back the updated pair. The handlers run in this
# exact order; eng_handle_rare_last_name runs both before and after
# eng_check_possible_match.
merge_handlers = (
    kor_handle_u,
    eng_handle_rare_last_name,
    eng_check_possible_match,
    eng_handle_rare_last_name,
)
for handler in merge_handlers:
    inner_join, outer_join = manual_merge(inner_join, outer_join, handler)

In [8]:
# Persist the results without the index column: matched players go to
# merged.csv, unmatched leftovers go to left_off.csv.
for frame, csv_path in (
    (inner_join, '../data/merged.csv'),
    (outer_join, '../data/left_off.csv'),
):
    frame.to_csv(csv_path, index=False)