In [1]:
import pandas as pd
import numpy as np
import time
import warnings
import duckdb
from bs4 import BeautifulSoup
import requests
import unicodedata
import os

cwd = os.path.abspath(os.getcwd()).replace("\\", "/")
if cwd.startswith("C:/Users/Rodolfo/"):
    RUN_LOCATION = "local"
else:
    RUN_LOCATION = "cloud"
    import cloudscraper

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [2]:
%run ./common_utils.ipynb

In [3]:
def remove_accents(text):
    if not isinstance(text, str):
        return text
    # Normalize to NFKD (decomposes characters)
    text = unicodedata.normalize('NFKD', text)
    # Encode to ASCII bytes, ignore non-ASCII characters, then decode back to str
    return text.encode('ascii', 'ignore').decode('ascii')

df_teams = pd.read_csv("../src/team_info_xref.csv")
teams = df_teams.ABV.tolist()

df = pd.DataFrame()

for team in teams:
    time.sleep(6)
    url = f"https://www.basketball-reference.com/teams/{team}/{YEAR + 1}.html"
    if RUN_LOCATION == 'local':
        html = requests.get(url).text
    else:
        scraper = cloudscraper.create_scraper()
        html = scraper.get(url).text
    soup = BeautifulSoup(html, "lxml")
    
    target_ids = ["roster"]
    target_tables = [soup.find("table", {"id": tid}) for tid in target_ids]
    df_temp = [pd.read_html(str(tbl))[0] for tbl in target_tables][0]
    df_temp['Team'] = team
    display(df_temp)
    df = pd.concat([df, df_temp])
    break
df = df[['Team', 'Player', 'Pos']]
df["Player"] = df["Player"].apply(lambda x: x.encode("latin1").decode("utf-8"))
df['Player'] = df['Player'].apply(remove_accents)
df['Player'] = df.Player.str.replace(" (TW)", "")
df.to_csv(f'../tables/{YEAR}/plyr_pos_xref.csv', index=False)

Unnamed: 0,No.,Player,Pos,Ht,Wt,Birth Date,Birth,Exp,College,Team
0,5.0,Dyson Daniels,SG,6-7,199,"March 17, 2003",au AU,3,,ATL
1,7.0,Nickeil Alexander-Walker,SG,6-5,205,"September 2, 1998",ca CA,6,Virginia Tech,ATL
2,17.0,Onyeka Okongwu,C,6-10,240,"December 11, 2000",us US,5,USC,ATL
3,18.0,Mouhamed Gueye,PF,6-11,210,"November 9, 2002",sn SN,2,Washington State,ATL
4,1.0,Jalen Johnson,SF,6-8,219,"December 18, 2001",us US,4,Duke,ATL
5,10.0,Zaccharie Risacher,SF,6-8,200,"April 8, 2005",es ES,1,,ATL
6,27.0,Vit Krejci,PG,6-8,195,"June 19, 2000",cz CZ,4,,ATL
7,3.0,Luke Kennard,SG,6-5,206,"June 24, 1996",us US,8,Duke,ATL
8,14.0,Asa Newell,PF,6-10,220,"October 5, 2005",us US,R,Georgia,ATL
9,8.0,Kristaps PorziÅÄ£is,C,7-2,240,"August 2, 1995",lv LV,9,,ATL


# Update NULL Positions

In [9]:
df = pd.read_csv(f"../tables/{YEAR}/season_gamelogs.csv")
print('Null Pos Count:', df[(df.Pos.isnull())].shape[0])
df2 = pd.read_csv(f'../tables/{YEAR}/plyr_pos_xref.csv')
orig_cols = df.columns
df = pd.merge(df, df2, how = 'left', on = ['Team', 'Player'], suffixes=('', '_y'))
df['Pos'] = df.Pos.fillna(df.Pos_y)
print('Null Pos Count:', df[(df.Pos.isnull())].shape[0])
df = df[orig_cols]
df.to_csv(f"../tables/{YEAR}/season_gamelogs.csv", index=False)

Null Pos Count: 1579
Null Pos Count: 209
