In [1]:
import pandas as pd
from unidecode import unidecode
from helper.utils import kor_get_last, eng_get_first_and_last, kor_update_last, kor_update_first, kor_handle_u, manual_check

In [2]:
# Load the raw player tables: Statiz (df_st) first, then Baseball Reference (df_br).
df_st, df_br = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/st_player.csv', '../data/br_player.csv')
)

In [3]:
# Name normalization (Statiz): derive the join key `name_new` from `name_eng`.
# The steps run in this exact order; manual_check in particular must sit
# between the generic clean-up and the Korean-name generalization.
df_st['name_new'] = (
    df_st['name_eng']
    .str.lower()                          # lower-case every character
    .str.replace('-', ' ', regex=False)   # hyphen -> space; literal match, independent of the pandas regex default
    .apply(unidecode)                     # transliterate to ASCII, which strips accents
    .apply(manual_check)                  # manual fixes for unconventional cases (order matters! must be here)
    .apply(kor_update_last)               # generalize the last name
    .apply(kor_update_first)              # generalize the first name
)

# Extra normalization for non-Korean players: keep only the first and last name
# (might need to be revisited in the future). A name is treated as non-Korean
# when the original `name_eng` contains no hyphen. The mask is computed once
# and reused on both sides of the assignment.
is_hyphenated = df_st['name_eng'].str.contains('-', regex=False)
df_st.loc[~is_hyphenated, 'name_new'] = df_st.loc[~is_hyphenated, 'name_new'].apply(eng_get_first_and_last)

print(df_st.head(5))

  name_kor        name_eng        name_new
0      김일권     Il-Kwon Kim     il kwon kim
1      김경훈  Kyung-Hoon Kim  kyung hoon kim
2      조충열  Chung-Yeol Cho  chung yeol cho
3      차영화   Young-Hwa Cha   yeong hwa cha
4      김종모     Jong-Mo Kim     jong mo kim


In [4]:
# Name normalization (Baseball Reference): derive the join key `name_new` from
# `name_eng`, using the same ordered steps as the Statiz table so that the two
# keys are comparable.
df_br['name_new'] = (
    df_br['name_eng']
    .str.lower()                          # lower-case every character
    .str.replace('-', ' ', regex=False)   # hyphen -> space; literal match, independent of the pandas regex default
    .apply(unidecode)                     # transliterate to ASCII, which strips accents
    .apply(manual_check)                  # manual fixes for unconventional cases (order matters! must be here)
    .apply(kor_update_last)               # generalize the last name
    .apply(kor_update_first)              # generalize the first name
)

# NOTE: the "keep only first and last name" step (eng_get_first_and_last on
# names without a hyphen) is deliberately NOT applied here. Baseball Reference
# does not always use '-' in Korean players' names, so the absence of a hyphen
# cannot be used to identify non-Korean players in this table.

print(df_br.head(5))

           name_eng                                                url  \
0     Seung Han Ahn  https://www.baseball-reference.com/register/pl...   
1    Raúl Alcántara  https://www.baseball-reference.com/register/pl...   
2  Jordan Balazovic  https://www.baseball-reference.com/register/pl...   
3     Kyu Bin Chang  https://www.baseball-reference.com/register/pl...   
4      Ji Kang Choi  https://www.baseball-reference.com/register/pl...   

           name_new  
0     seung han ahn  
1    raul alcantara  
2  jordan balazovic  
3      kyu bin jang  
4      ji kang choi  


In [5]:
# First pass: keep the rows whose normalized name is identical in both tables.
# Columns shared by both inputs other than the key get the default _x (df_br)
# and _y (df_st) suffixes.
inner_join = df_br.merge(df_st, how='inner', on='name_new')
print(inner_join.head(5))
print(len(inner_join))

         name_eng_x                                                url  \
0     Seung Han Ahn  https://www.baseball-reference.com/register/pl...   
1    Raúl Alcántara  https://www.baseball-reference.com/register/pl...   
2  Jordan Balazovic  https://www.baseball-reference.com/register/pl...   
3     Kyu Bin Chang  https://www.baseball-reference.com/register/pl...   
4      Ji Kang Choi  https://www.baseball-reference.com/register/pl...   

           name_new name_kor        name_eng_y  
0     seung han ahn      안승한     Seung-Han Ahn  
1    raul alcantara     알칸타라    Raul Alcántara  
2  jordan balazovic     발라조빅  Jordan Balazovic  
3      kyu bin jang      장규빈      Gyu-Bin Jang  
4      ji kang choi      최지강      Ji-Kang Choi  
2357


In [6]:
# outer_join only picks up the leftovers: names present in exactly one table.
# The merge indicator is used to find them instead of "any NaN in the row",
# so a matched player with an incidental missing value (e.g. no url) is not
# mistaken for an unmatched one. The helper `_merge` column is dropped again
# so the resulting columns are the same as those of a plain outer merge.
outer_join = pd.merge(df_br, df_st, on='name_new', how='outer', indicator=True)
outer_join = outer_join[outer_join['_merge'] != 'both'].drop(columns='_merge')
print(outer_join.head(5))
print(len(outer_join))

       name_eng_x                                                url  \
12      Al Martin  https://www.baseball-reference.com/register/pl...   
13            NaN                                                NaN   
15            NaN                                                NaN   
16   Alex Maestri  https://www.baseball-reference.com/register/pl...   
19  Allen Webster  https://www.baseball-reference.com/register/pl...   

              name_new name_kor          name_eng_y  
12           al martin      NaN                 NaN  
13       albert martin       마틴       Albert Martin  
15  alessandro maestri    마에스트리  Alessandro Maestri  
16        alex maestri      NaN                 NaN  
19       allen webster      NaN                 NaN  
2939


In [7]:
# Split the leftovers back into one frame per source.
# NOTE: this rebinds df_br / df_st to the *unmatched* rows only; the frames
# loaded from CSV are no longer reachable under these names after this cell.
#
# A row came only from Baseball Reference when the Statiz-side name
# (name_eng_y) is missing, and vice versa. how='all' removes just the columns
# that are entirely empty (those of the other source); the default how='any'
# would drop a whole column as soon as a single value in it is missing.
# .copy() makes the frames independent of outer_join before they are modified.
df_br = outer_join[outer_join['name_eng_y'].isna()].dropna(axis=1, how='all').copy()
df_st = outer_join[outer_join['name_eng_x'].isna()].dropna(axis=1, how='all').copy()

# Snapshot of the unmatched normalized names on each side, for membership tests.
df_br_set = set(df_br["name_new"])
df_st_set = set(df_st["name_new"])
print(df_br.head(3))
print(df_st.head(3))

       name_eng_x                                                url  \
12      Al Martin  https://www.baseball-reference.com/register/pl...   
16   Alex Maestri  https://www.baseball-reference.com/register/pl...   
19  Allen Webster  https://www.baseball-reference.com/register/pl...   

         name_new  
12      al martin  
16   alex maestri  
19  allen webster  
              name_new name_kor          name_eng_y
13       albert martin       마틴       Albert Martin
15  alessandro maestri    마에스트리  Alessandro Maestri
22       an seok jeong      정안석       An-seok Jeong


In [8]:
# Second normalization pass over the unmatched rows. kor_handle_u is called
# once per row with the row and the set of unmatched names from the *other*
# source; whatever it returns becomes the row's new `name_new`.
# NOTE(review): presumably this reconciles 'u' romanization variants against
# the other side's names -- confirm in helper.utils.
# Both passes use the name sets captured before this cell, so the second pass
# does not see the rewrites made by the first.
df_br['name_new'] = df_br.apply(lambda row: kor_handle_u(row, df_st_set), axis=1)
df_st['name_new'] = df_st.apply(lambda row: kor_handle_u(row, df_br_set), axis=1)

given name: arquimedez pozo -> neither exists
given name: beom jun park -> neither exists
given name: beom su jeong -> neither exists
given name: bubba carpenter -> neither exists
given name: bubba smith -> neither exists
given name: bum hyun cho -> at least than 2 charcters, all 'u' (count: 2)
given name: byeong hun kam -> neither exists
given name: byeong hun lee -> neither exists
given name: byeong jun ye -> neither exists
given name: byeong kyu kang -> neither exists
given name: byeong uk cho -> neither exists
given name: byoung gon jeong -> neither exists
given name: byung hee kim -> neither exists
given name: byung hwee lee -> neither exists
given name: byung hyun kim -> at least than 2 charcters, all 'u' (count: 2)
given name: byung ui kang -> at least than 2 charcters, all 'u' (count: 2)
given name: byungho park -> neither exists
given name: chan gun lee -> neither exists
given name: chan uk choi -> neither exists
given name: chan yub noh -> neither exists
given name: chang hun

In [9]:
# Second pass: match the previously unmatched rows on their updated `name_new`.
inner_join_2 = df_br.merge(df_st, how='inner', on='name_new')
print(inner_join_2.head(5))
print(len(inner_join_2))

      name_eng_x                                                url  \
0    Beom Su Kim  https://www.baseball-reference.com/register/pl...   
1   Bo-hyeon Pak  https://www.baseball-reference.com/register/pl...   
2  Bo-hyeong Lee  https://www.baseball-reference.com/register/pl...   
3    Bo-seon Kim  https://www.baseball-reference.com/register/pl...   
4  Bo-seong Chen  https://www.baseball-reference.com/register/pl...   

         name_new name_kor    name_eng_y  
0    beom soo kim      김범수  Beom-Soo Kim  
1   bo hyeon park      박보현  Bo-Hyun Park  
2   bo hyeong lee      이보형  Bo-Hyung Lee  
3     bo seon kim      김보선    Bo-Sun Kim  
4  bo seong cheon      천보성  Bo-sung Chun  
495


In [10]:
# outer_join_2 only picks up what is still unmatched after the second pass.
# The merge indicator identifies rows present in exactly one frame, so an
# incidental missing value in a matched row cannot make it look unmatched.
# `url` is not needed in the to-do list and `_merge` is only a helper column,
# so both are dropped.
outer_join_2 = pd.merge(df_br, df_st, on='name_new', how='outer', indicator=True)
outer_join_2 = outer_join_2[outer_join_2['_merge'] != 'both'].drop(columns=['url', '_merge'])
print(outer_join_2.head(5))
print(len(outer_join_2))

      name_eng_x            name_new name_kor          name_eng_y
0      Al Martin           al martin      NaN                 NaN
1            NaN       albert martin       마틴       Albert Martin
2            NaN  alessandro maestri    마에스트리  Alessandro Maestri
3   Alex Maestri        alex maestri      NaN                 NaN
4  Allen Webster       allen webster      NaN                 NaN
1980


In [11]:
# Stack the matches from both passes into one table. ignore_index=True gives
# the result a fresh 0..n-1 index; otherwise the row labels of the two inputs
# would be kept and repeat in the combined frame.
inner_join_final = pd.concat([inner_join, inner_join_2], axis=0, ignore_index=True)

In [12]:
# Persist the results without the index column: the matched pairs first,
# then the rows that still have no counterpart.
for frame, csv_path in (
    (inner_join_final, '../data/done_proto.csv'),
    (outer_join_2, '../data/to_do_proto.csv'),
):
    frame.to_csv(csv_path, index=False)