In [1]:
import pandas as pd
from unidecode import unidecode
from helper.df_handler import kor_get_last, eng_get_first_and_last, kor_update_last, kor_update_first, kor_handle_u, manual_check, eng_remove_jr, manual_merge

In [2]:
# Load both player tables in one pass: Statiz (st) first, then Baseball Reference (br).
df_st, df_br = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/st_player.csv', '../data/br_player.csv')
)

In [3]:
# data normalization (statiz)
# Derive the join key `name_new` from `name_eng` in a single chain.
# The steps run strictly top to bottom; per the original note, manual_check
# has to stay exactly where it is (after the ASCII cleanup, before the
# Korean-name generalization).
df_st['name_new'] = (
    df_st['name_eng']
    .str.lower()               # lower all characters
    .str.replace('-', ' ')     # turn '-' into a space ("Il-Kwon" -> "il kwon")
    .apply(unidecode)          # transliterate to plain ASCII, which drops accents
    .apply(manual_check)       # manual fixes for unconventional cases (order matters!)
    .apply(kor_update_last)    # generalize the last name
    .apply(kor_update_first)   # generalize the first name
    .apply(eng_remove_jr)      # remove 'Jr' at the end
)

# Disabled for now (might need to revisit): extra normalization that keeps only
# the first and last name for non-Korean players, i.e. names without a '-'.
# df_st.loc[df_st['name_eng'].str.contains('-') == False, 'name_new'] = df_st.loc[df_st['name_eng'].str.contains('-') == False, 'name_new'].apply(eng_get_first_and_last)

print(df_st.head(5))

  name_kor        name_eng        name_new
0      김일권     Il-Kwon Kim     il kwon kim
1      김경훈  Kyung-Hoon Kim  kyung hoon kim
2      조충열  Chung-Yeol Cho  chung yeol cho
3      차영화   Young-Hwa Cha   yeong hwa cha
4      김종모     Jong-Mo Kim     jong mo kim


In [4]:
# data normalization (baseball reference)
# Derive the join key `name_new` from `name_eng` in a single chain.
# The steps run strictly top to bottom; per the original note, manual_check
# has to stay exactly where it is (after the ASCII cleanup, before the
# Korean-name generalization).
df_br['name_new'] = (
    df_br['name_eng']
    .str.lower()               # lower all characters
    .str.replace('-', ' ')     # turn '-' into a space
    .apply(unidecode)          # transliterate to plain ASCII ("Raúl Alcántara" -> "raul alcantara")
    .apply(manual_check)       # manual fixes for unconventional cases (order matters!)
    .apply(kor_update_last)    # generalize the Korean last name
    .apply(kor_update_first)   # generalize the Korean first name
    .apply(eng_remove_jr)      # remove 'Jr' at the end
)

# Disabled: extra normalization that keeps only the first and last name for
# non-Korean players. It cannot be used here because br does not always write
# Korean names with a '-', so "no hyphen" does not identify non-Korean players.
# df_br.loc[df_br['name_eng'].str.contains('-') == False, 'name_new'] = df_br.loc[df_br['name_eng'].str.contains('-') == False, 'name_new'].apply(eng_get_first_and_last)

print(df_br.head(5))

           name_eng                                                url  \
0     Seung Han Ahn  https://www.baseball-reference.com/register/pl...   
1    Raúl Alcántara  https://www.baseball-reference.com/register/pl...   
2  Jordan Balazovic  https://www.baseball-reference.com/register/pl...   
3     Kyu Bin Chang  https://www.baseball-reference.com/register/pl...   
4      Ji Kang Choi  https://www.baseball-reference.com/register/pl...   

           name_new  
0     seung han ahn  
1    raul alcantara  
2  jordan balazovic  
3      kyu bin jang  
4      ji kang choi  


In [5]:
# Players found in both sources: match Baseball Reference rows (left) to
# Statiz rows (right) on the normalized name. Both tables carry a `name_eng`
# column, so the result has `name_eng_x` (br) and `name_eng_y` (statiz).
inner_join = df_br.merge(df_st, how='inner', on='name_new')

# Preview the first matches, then report how many rows matched.
print(inner_join.head(5))
print(inner_join.shape[0])

         name_eng_x                                                url  \
0     Seung Han Ahn  https://www.baseball-reference.com/register/pl...   
1    Raúl Alcántara  https://www.baseball-reference.com/register/pl...   
2  Jordan Balazovic  https://www.baseball-reference.com/register/pl...   
3     Kyu Bin Chang  https://www.baseball-reference.com/register/pl...   
4      Ji Kang Choi  https://www.baseball-reference.com/register/pl...   

           name_new name_kor        name_eng_y  
0     seung han ahn      안승한     Seung-Han Ahn  
1    raul alcantara     알칸타라    Raul Alcántara  
2  jordan balazovic     발라조빅  Jordan Balazovic  
3      kyu bin jang      장규빈      Gyu-Bin Jang  
4      ji kang choi      최지강      Ji-Kang Choi  
2234


In [6]:
# outer_join is only used to pick up the leftovers: rows whose normalized name
# appears in exactly one of the two sources.
#
# `indicator=True` makes pandas add a `_merge` column with the values
# 'left_only' / 'right_only' / 'both'. Selecting on it keeps precisely the
# unmatched rows. The previous filter, "row has a NaN in any column", gives the
# same answer only while neither source table contains missing values; a matched
# player with e.g. an empty url would otherwise be reported as a leftover.
outer_join = pd.merge(df_br, df_st, on='name_new', how='outer', indicator=True)
# outer_join = outer_join.drop(columns=['url'])
# Keep the one-sided rows and drop the helper column so the schema is unchanged.
outer_join = outer_join[outer_join['_merge'] != 'both'].drop(columns='_merge')
print(outer_join.head(5))
print(len(outer_join))

     name_eng_x                                                url  \
2  Aaron Brooks  https://www.baseball-reference.com/register/pl...   
3           NaN                                                NaN   
5           NaN                                                NaN   
6   Adam Plutko  https://www.baseball-reference.com/register/pl...   
7           NaN                                                NaN   

              name_new name_kor           name_eng_y  
2         aaron brooks      NaN                  NaN  
3     aaron lee brooks      브룩스     Aaron Lee Brooks  
5  adam gregory plutko      플럿코  Adam Gregory Plutko  
6          adam plutko      NaN                  NaN  
7     adam robert wilk       아담     Adam Robert Wilk  
3184


In [7]:
# Hand the matched rows, the leftovers and the kor_handle_u helper to
# manual_merge, and rebind both names to the two frames it returns.
# NOTE(review): presumably this moves extra matches out of the leftovers by
# trying alternative spellings from kor_handle_u (the log lines below mention
# 'u' characters) — confirm in helper/df_handler.
# Caution: the inputs and outputs share the same names, so running this cell a
# second time feeds already-processed frames back in. Re-run the two join cells
# above before re-running this one.
inner_join, outer_join = manual_merge(inner_join, outer_join, kor_handle_u)

given name: amaury telemaco -> neither exists
given name: arquimedez pozo -> neither exists
given name: austin dean -> neither exists
given name: beau sulser -> neither exists
given name: beom jun park -> neither exists
given name: beom su jeong -> neither exists
given name: bubba carpenter -> neither exists
given name: bum hyun cho -> at least than 2 charcters, all 'u' (count: 2)
given name: burch smith -> neither exists
given name: byeong hun kam -> neither exists
given name: byeong hun lee -> neither exists
given name: byeong jun ye -> neither exists
given name: byeong kyu kang -> neither exists
given name: byeong uk cho -> neither exists
given name: byoung gon jeong -> neither exists
given name: byung hee kim -> neither exists
given name: byung hwee lee -> neither exists
given name: byung hyun kim -> at least than 2 charcters, all 'u' (count: 2)
given name: byung ui kang -> at least than 2 charcters, all 'u' (count: 2)
given name: chan gun lee -> neither exists
given name: chan uk 

In [8]:
# Persist the results without the index column: matched players go to the
# "done" file, the remaining unmatched rows to the "to do" file.
for frame, target in (
    (inner_join, '../data/done_proto.csv'),
    (outer_join, '../data/to_do_proto.csv'),
):
    frame.to_csv(target, index=False)