In [1]:
import pandas as pd
from unidecode import unidecode
from helper.utils import kor_get_last, eng_get_first_and_last, kor_update_last

In [2]:
# Read the two player tables that the rest of the notebook normalizes and joins.
# Both paths are relative to the notebook's working directory.
df_st = pd.read_csv(filepath_or_buffer='../data/st_player.csv')
df_br = pd.read_csv(filepath_or_buffer='../data/br_player.csv')

In [3]:
# Build the normalized join key `name_new` from `name_eng`:
# 1. lower-case every character.
# 2. replace '-' with a space (e.g. "Il-Kwon Kim" -> "il kwon kim").
# 3. pass the name through unidecode to strip accents.
# 4. apply kor_update_last, which is intended to give every Korean last name a
#    single, consistent spelling.
# regex=False pins '-' as a literal string: the default value of `regex` in
# Series.str.replace is not the same across pandas versions.
df_st['name_new'] = (
    df_st['name_eng']
    .str.lower()
    .str.replace("-", " ", regex=False)
    .apply(unidecode)
    .apply(kor_update_last)
)

# Extra normalization for non-Korean players: keep only the first and last name.
# A row is treated as non-Korean when the ORIGINAL name_eng contains no hyphen.
# The mask is computed once and reused on both sides of the assignment;
# na=True keeps rows with a missing name_eng unselected, exactly as the earlier
# `== False` comparison did.
st_no_hyphen = ~df_st['name_eng'].str.contains('-', regex=False, na=True)
df_st.loc[st_no_hyphen, 'name_new'] = df_st.loc[st_no_hyphen, 'name_new'].apply(eng_get_first_and_last)

print(df_st.head(5))

  name_kor        name_eng        name_new
0      김일권     Il-Kwon Kim     il kwon kim
1      김경훈  Kyung-Hoon Kim  kyung hoon kim
2      조충열  Chung-Yeol Cho  chung yeol cho
3      차영화   Young-Hwa Cha   young hwa cha
4      김종모     Jong-Mo Kim     jong mo kim


In [5]:
# Build the normalized join key `name_new` from `name_eng`:
# 1. lower-case every character.
# 2. replace '-' with a space.
# 3. pass the name through unidecode to strip accents
#    (e.g. "Raúl Alcántara" -> "raul alcantara").
# 4. apply kor_update_last, which is intended to give every Korean last name a
#    single, consistent spelling (e.g. "Kyu Bin Chang" -> "kyu bin jang").
# regex=False pins '-' as a literal string: the default value of `regex` in
# Series.str.replace is not the same across pandas versions.
df_br['name_new'] = (
    df_br['name_eng']
    .str.lower()
    .str.replace("-", " ", regex=False)
    .apply(unidecode)
    .apply(kor_update_last)
)

# Extra normalization for non-Korean players: keep only the first and last name.
# A row is treated as non-Korean when the ORIGINAL name_eng contains no hyphen.
# The mask is computed once and reused on both sides of the assignment;
# na=True keeps rows with a missing name_eng unselected, exactly as the earlier
# `== False` comparison did.
br_no_hyphen = ~df_br['name_eng'].str.contains('-', regex=False, na=True)
df_br.loc[br_no_hyphen, 'name_new'] = df_br.loc[br_no_hyphen, 'name_new'].apply(eng_get_first_and_last)

print(df_br.head(5))

           name_eng                                                url  \
0     Seung Han Ahn  https://www.baseball-reference.com/register/pl...   
1    Raúl Alcántara  https://www.baseball-reference.com/register/pl...   
2  Jordan Balazovic  https://www.baseball-reference.com/register/pl...   
3     Kyu Bin Chang  https://www.baseball-reference.com/register/pl...   
4      Ji Kang Choi  https://www.baseball-reference.com/register/pl...   

           name_new  
0     seung han ahn  
1    raul alcantara  
2  jordan balazovic  
3      kyu bin jang  
4      ji kang choi  


In [6]:
# Rows whose normalized name occurs in both tables, joined on the shared key.
# Columns present in both inputs (name_eng) get the default _x / _y suffixes.
inner_join = df_br.merge(df_st, how='inner', on='name_new')
# Preview the first matches, then report the number of joined rows.
print(inner_join.head(5))
print(inner_join.shape[0])

         name_eng_x                                                url  \
0     Seung Han Ahn  https://www.baseball-reference.com/register/pl...   
1    Raúl Alcántara  https://www.baseball-reference.com/register/pl...   
2  Jordan Balazovic  https://www.baseball-reference.com/register/pl...   
3      Ji Kang Choi  https://www.baseball-reference.com/register/pl...   
4      Jong In Choi  https://www.baseball-reference.com/register/pl...   

           name_new name_kor        name_eng_y  
0     seung han ahn      안승한     Seung-Han Ahn  
1    raul alcantara     알칸타라    Raul Alcántara  
2  jordan balazovic     발라조빅  Jordan Balazovic  
3      ji kang choi      최지강      Ji-Kang Choi  
4      jong in choi      최종인      Jong-In Choi  
1948


In [10]:
# The outer join is used only to pick up the left-overs: rows whose normalized
# name appears in just one of the two tables.
# indicator=True adds a `_merge` column ('left_only' / 'right_only' / 'both'), so
# unmatched rows are identified by the join itself instead of by scanning for NaN,
# which would also flag matched rows that simply have a missing value in the
# source data.
outer_join = pd.merge(df_br, df_st, on='name_new', how='outer', indicator=True)
outer_join = outer_join[outer_join['_merge'] != 'both']
# url is dropped as before; _merge was only needed for the filter above, so the
# resulting columns are the same as they were without the indicator.
outer_join = outer_join.drop(columns=['url', '_merge'])
print(outer_join.head(5))
print(len(outer_join))

       name_eng_x       name_new name_kor     name_eng_y
1   Aaron Altherr  aaron altherr      NaN            NaN
4             NaN  aarona ltherr      알테어  AaronA LTHERR
12    Ah-seop Son    ah seop son      NaN            NaN
13            NaN     ah sub son      손아섭     Ah-Sub Son
14      Al Martin      al martin      NaN            NaN
3664


In [9]:
# Persist both results without the DataFrame index:
# matched rows go to done_proto.csv, left-over rows to to_do_proto.csv.
inner_join.to_csv(path_or_buf='../data/done_proto.csv', index=False)
outer_join.to_csv(path_or_buf='../data/to_do_proto.csv', index=False)