In [32]:
import pandas as pd
import numpy as np
import requests

from datetime import datetime
from functools import partial
from multiprocessing import Pool
from nba_api.stats.endpoints import LeagueGameLog
from nba_api.stats.endpoints import BoxScoreAdvancedV3
from requests.exceptions import RequestException

season_types = [
    "Regular Season",
    "Pre Season",
    "Playoffs",
    "Preseason",
]


In [2]:
def check_proxy(proxy):
    try:
        res = requests.get("http://example.com", proxies={"http": proxy}, timeout=3)
        if res.ok:
            return proxy
    except IOError:
        return None
    else:
        return None


def get_proxies():
    """retrieves list of proxy addresses using the proxyscrape library

    Returns:
        list[str]: list of proxies of the form port:host
    """
    print("Retrieving proxies...")
    proxies = pd.read_csv(
        "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
        header=None,
    )
    df = (
        pd.read_csv(
            "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt",
            sep="|",
            header=None,
        )
        .iloc[:, 0]
        .reset_index(drop=True)
    )
    proxies = (
        pd.concat([proxies, df])
        .drop_duplicates()
        .reset_index(drop=True)
        .values.tolist()
    )
    proxies = [p for sublist in proxies for p in sublist]
    print(f"Found {len(proxies)} proxies. Checking proxies...")
    with Pool(250) as p:
        proxies = p.map(check_proxy, proxies)
    proxies = pd.Series(proxies).dropna().tolist()
    print(f"Found {len(proxies)} valid proxies. Returning proxies...")
    return proxies

In [54]:
def get_league_game_log_from_season(season, proxies):
    print(f"Retrieving league game log from {season}...")
    dfs = []
    for season_type in season_types:
        while True:
            try:
                df = LeagueGameLog(
                    season=season,
                    #proxy=np.random.choice(proxies),
                    season_type_all_star=season_type,
                    timeout=3,
                ).get_data_frames()[0]
                df.columns = df.columns.to_series().apply(lambda x: x.lower())
                df = pd.merge(
                    df,
                    df,
                    on=["season_id", "game_id", "game_date", "min"],
                    suffixes=["_home", "_away"],
                )
                df = df[
                    (df["matchup_home"].str.contains("vs."))
                    & (df["team_name_home"] != df["team_name_away"])
                ]
                df["season_type"] = season_type
                dfs.append(df)
                break
            except RequestException:
                continue
            except ValueError:
                return None
            except KeyError:
                break
    df = pd.concat(dfs, ignore_index=True)
    return df

In [45]:
#proxies = get_proxies()
proxies = None
this_year = datetime.now().year
years = list(range(2021, this_year + 1))

In [55]:
with Pool(len(years)) as p:
    dfs = p.map(partial(get_league_game_log_from_season, proxies=proxies), years)
dfs = [df for df in dfs if df is not None]
df = pd.concat(dfs, ignore_index=True).reset_index(drop=True)

Retrieving league game log from 2022...Retrieving league game log from 2021...Retrieving league game log from 2024...Retrieving league game log from 2023...





  df = pd.concat(dfs, ignore_index=True)
  df = pd.concat(dfs, ignore_index=True).reset_index(drop=True)


In [57]:
df.loc[df['season_type'] == 'Playoffs']

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,team_name_home,game_id,game_date,matchup_home,wl_home,min,fgm_home,...,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,video_available_away,season_type
1296,42021,1610612742,DAL,Dallas Mavericks,0042100171,2022-04-16,DAL vs. UTA,L,240,29,...,53,15,3,5,14,25,99,6,1,Playoffs
1297,42021,1610612744,GSW,Golden State Warriors,0042100161,2022-04-16,GSW vs. DEN,W,240,43,...,35,26,8,3,11,22,107,-16,1,Playoffs
1298,42021,1610612755,PHI,Philadelphia 76ers,0042100131,2022-04-16,PHI vs. TOR,W,240,43,...,35,27,1,8,8,26,111,-20,1,Playoffs
1299,42021,1610612763,MEM,Memphis Grizzlies,0042100151,2022-04-16,MEM vs. MIN,L,240,39,...,46,32,5,7,19,32,130,13,1,Playoffs
1300,42021,1610612748,MIA,Miami Heat,0042100101,2022-04-17,MIA vs. ATL,W,240,43,...,38,16,8,2,18,19,91,-24,1,Playoffs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2762,42022,1610612743,DEN,Denver Nuggets,0042200401,2023-06-01,DEN vs. MIA,W,240,40,...,43,26,5,4,8,15,93,-11,1,Playoffs
2763,42022,1610612743,DEN,Denver Nuggets,0042200402,2023-06-04,DEN vs. MIA,L,240,39,...,31,28,5,4,11,22,111,3,1,Playoffs
2764,42022,1610612748,MIA,Miami Heat,0042200403,2023-06-07,MIA vs. DEN,L,240,34,...,58,28,3,5,14,18,109,15,1,Playoffs
2765,42022,1610612748,MIA,Miami Heat,0042200404,2023-06-09,MIA vs. DEN,L,240,35,...,34,26,11,7,8,18,108,13,1,Playoffs
