In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import pandas as pd
from io import StringIO
import os

In [2]:
# setting the years needed and url to look at player
year = 2025
url_start = 'https://www.basketball-reference.com/players/j/{}01/gamelog/2025'

In [3]:
# path to create the GameLog folder if it didn't exist
main_folder = "GameLogs" 
if not os.path.exists(main_folder):
    os.makedir(main_folder)

In [4]:
# get full name from input
full_name = input("Enter players name (firstname lastname): ").strip()
# split the name into first name last name
split_name = full_name.split()
if len(split_name) != 2: # raise Error if theres less than 1 name or more than 2
    raise ValueError("Please enter both first and last name. Make sure to include any '-' !")

first_name = split_name[0] 
last_name = split_name[1]
if len(last_name) >= 5:
    last_name = last_name[:5]
    
player_name = last_name.lower() + first_name[:2].lower() # formats the name so the website can use it. Lebron James -> jamesle

Enter players name (firstname lastname):  paul george


In [8]:
# looks up the corresponding year and creates/overwrites the html file in my JokicGameLogs folder
url = url_start.format(player_name)
try: 
    response = requests.get(url, timeout=20)
    response.raise_for_status()
except requests.exceptions.Timeout as e:
    print("Request timed out:", e)
except requests.exceptions.RequestException as e:
    print("An error occurred:", e)
# to write a file for each 'x'
file_path = f"GameLogs/{player_name}GameLog.html"
with open(file_path, "w+", encoding="utf-8") as f:
    f.write(response.text)

An error occurred: 503 Server Error: Service Unavailable for url: https://www.basketball-reference.com/players/j/georgpa01/gamelog/2025


In [14]:
with open(f"GameLogs/{player_name}GameLog.html", encoding="utf-8") as f:
    page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    stats_table = soup.find(id="pgl_basic")
    stats_df = pd.read_html(StringIO(str(stats_table)))[0]
    df = stats_df

ImportError: Missing optional dependency 'html5lib'.  Use pip or conda to install html5lib.

In [7]:
# Drop rows of fluff where the index + 1 is divisible by 20 (21, 41, 61, ...)
if 20 in df.index:
    df = df.drop(20)
if 41 in df.index:
    df = df.drop(41)
if 62 in df.index:
    df = df.drop(62)
if 83 in df.index:
    df = df.drop(83)

##### DATAFRAME IS READY
Now lets clean up column names and nulls

In [9]:
# drop the 'Rk' column so we can use our index col instead
if 'Rk' in df.columns:
    df = df.drop(columns=['Rk'])

# drop null rows for subset df
game_only_df = df.dropna(subset=['G'])
# fill null Game played values with "DNP"
df.fillna({"G":"DNP"}, inplace=True)

In [10]:
# name all colums correctly
df.rename(columns={'Unnamed: 5': 'Location'}, inplace=True)
df.rename(columns={'Unnamed: 7': 'WLSpread'}, inplace=True)
df.fillna({'Location':'Home'}, inplace=True)
df.replace({'@': 'Away'}, inplace=True)

In [63]:
dfs = {}
dfs[player_name] = df

In [11]:
df

Unnamed: 0,G,Date,Age,Tm,Location,Opp,WLSpread,GS,MP,FG,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,2024-10-22,39-297,LAL,Home,MIN,W (+7),1,34:39,7,...,5,5,4,0,2,2,3,16,10.1,-6
1,2,2024-10-25,39-300,LAL,Home,PHO,W (+7),1,34:42,7,...,4,4,8,0,0,2,1,21,17.9,+14
2,3,2024-10-26,39-301,LAL,Home,SAC,W (+4),1,33:46,12,...,14,14,10,0,1,5,3,32,27.1,+13
3,4,2024-10-28,39-303,LAL,Away,PHO,L (-4),1,35:48,3,...,5,5,8,1,0,2,1,11,6.9,-17
4,5,2024-10-30,39-305,LAL,Away,CLE,L (-24),1,28:58,9,...,5,6,3,0,0,6,2,26,18.0,-17
5,6,2024-11-01,39-307,LAL,Away,TOR,W (+6),1,36:15,9,...,6,6,10,1,0,0,1,27,27.3,+11
6,7,2024-11-04,39-310,LAL,Away,DET,L (-12),1,40:00,7,...,7,8,11,2,0,1,1,20,22.7,-16
7,8,2024-11-06,39-312,LAL,Away,MEM,L (-17),1,35:25,15,...,7,7,6,1,0,5,2,39,29.3,-21
8,9,2024-11-08,39-314,LAL,Home,PHI,W (+10),1,33:48,9,...,12,12,13,0,3,2,0,21,27.6,+12
9,10,2024-11-10,39-316,LAL,Home,TOR,W (+20),1,34:47,6,...,8,10,16,1,0,3,0,19,24.6,+13
