In [4]:
import requests
from bs4 import BeautifulSoup
from IPython.core.display import HTML
import pandas as pd
import io
import numpy as np
import time
from rapidfuzz import process, fuzz
from unidecode import unidecode
from tqdm import tqdm
from collections import Counter
import re

In [5]:
nba_team_abbreviations = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Buffalo Braves": "BUF",
    "Charlotte Hornets": "CHO",
    "Charlotte Bobcats": "CHA",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Detroit Pistons": "DET",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "Indiana Pacers": "IND",
    "Kansas City Kings": "KCK",
    "Los Angeles Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Memphis Grizzlies": "MEM",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "New Jersey Nets": "NJN",
    "New Orleans/Oklahoma City Hornets": "NOK",
    "New Orleans Pelicans": "NOP",
    "New Orleans Hornets": "NOH",
    "New Orleans Jazz": "NOJ",
    "New York Knicks": "NYK",
    "New York Nets": "NYN",
    "Oklahoma City Thunder": "OKC",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHO",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "San Diego Clippers": "SDC",
    "Seattle SuperSonics": "SEA",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Vancouver Grizzlies": "VAN",
    "Washington Bullets": "WSB",
    "Washington Wizards": "WAS"
    }


nba_team_names = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "BUF": "Buffalo Braves",
    "CHO": "Charlotte Hornets",
    "CHA": "Charlotte Bobcats",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "KCK": "Kansas City Kings",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NJN": "New Jersey Nets",
    "NOK": "New Orleans/Oklahoma City Hornets",
    "NOP": "New Orleans Pelicans",
    "NOH": "New Orleans Hornets",
    "NOJ": "New Orleans Jazz",
    "NYK": "New York Knicks",
    "NYN": "New York Nets",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "SDC": "San Diego Clippers",
    "SEA": "Seattle SuperSonics",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "VAN": "Vancouver Grizzlies",
    "WSB": "Washington Bullets",
    "WAS": "Washington Wizards"
}

nba_playoff_start_dates = {
    2025: "April 19",
    2024: "April 20",
    2023: "April 15",
    2022: "April 16",
    2021: "May 22",
    2020: "August 17",
    2019: "April 13",
    2018: "April 14",
    2017: "April 15",
    2016: "April 16",
    2015: "April 18",
    2014: "April 19",
    2013: "April 20",
    2012: "April 28",
    2011: "April 16",
    2010: "April 17",
    2009: "April 18",
    2008: "April 19",
    2007: "April 21",
    2006: "April 22",
    2005: "April 23",
    2004: "April 17",
    2003: "April 19",
    2002: "April 20",
    2001: "April 21",
    2000: "April 22",
    1999: "May 8",
    1998: "April 24",
    1997: "April 24",
    1996: "April 25",
    1995: "April 27"
}

nba_playoff_start_dates_dt = {
    year: pd.to_datetime(f"{date} {year}") for year, date in nba_playoff_start_dates.items()
}

In [6]:
def get_positions_link(link, testing = False):
    response = requests.get(link)
    if response.status_code != 200:
        positions = ['PG','SG','SF','PF','C']
        return positions
    else:
        soup = BeautifulSoup(response.text, features="html.parser")
        inactive_section = soup.find_all('p')
        for i in range(10):
            try:
                if testing:
                    print(f'Trying {i}')
                
                positions = inactive_section[i]
                
                strong_tag = positions.find('strong', string=lambda t: t and "Position:" in t)
                position_text = strong_tag.next_sibling.strip()
                break
            except:
                pass
            
            
            
        positions = position_text.strip()
        only_positions = positions.split('\n')[0]
        
        if ',' in only_positions:
            only_positions = only_positions.split(',')
        else:
            if 'and' in only_positions:
                only_positions = only_positions.split('and')
            else:
                only_positions = [only_positions]
        if testing:
            print(f'This is only positions: {only_positions}')
            
            
        final_positions = []
        position_dict = {
            'Point Guard':"PG",
            'Shooting Guard' : "SG",
            'Small Forward' : "SF",
            'Power Forward' : "PF",
            'Center' : "C",
            'Forward' : "SF",
            'Guard' : 'PG'
        }
        for i in range(len(only_positions)):
            cleaned_string = only_positions[i].replace("and", "")
            if testing:
                print(f'This is cleaned_string: {cleaned_string}')
            cleaned_string = cleaned_string.strip()
            if testing:
                print(f'This is cleaned_string: {cleaned_string}')
            
            cleaned_string = position_dict[cleaned_string]
            
            final_positions.append(cleaned_string)
            
        
        return final_positions

In [7]:
def name_to_positions(name, testing = False):
    link = player_link(name)
    positions = get_positions_link(link[0], testing = testing)
    positionsandlink = []
    positionsandlink.append(positions)
    positionsandlink.append(link[1])
    return positionsandlink

In [8]:
def get_positions(team_abv,year):
    link = team_link(team_abv,year)
    list_players = team_players(link)
    
    
    
    names_list = []
    pos1_list = []
    pos2_list = []
    pos3_list = []
    pos4_list = []
    pos5_list = []
    picture_link = []
    for name in list_players:
        time.sleep(8)
        
        positionsAll = name_to_positions(name, False)
        positions = positionsAll[0]
        
        names_list.append(name)
        
        for i in range(5):
            if i == 0:
                try:
                    pos1_list.append(positions[i])
                except:
                    pos1_list.append(np.nan)
            if i == 1:
                try:
                    pos2_list.append(positions[i])
                except:
                    pos2_list.append(np.nan)
            if i == 2:
                try:
                    pos3_list.append(positions[i])
                except:
                    pos3_list.append(np.nan)
            if i == 3:
                try:
                    pos4_list.append(positions[i])
                except:
                    pos4_list.append(np.nan)
            if i == 4:
                try:
                    pos5_list.append(positions[i])
                except:
                    pos5_list.append(np.nan)
        
        picture_link.append(positionsAll[1])
        print(names_list)
        #print(pos1_list)
        #print(pos2_list)
        #print(pos3_list)
        #print(pos4_list)
        #print(pos5_list)
    
    
    df = pd.DataFrame(
        {
            'Players':names_list,
            'Pos1':pos1_list,
            'Pos2':pos2_list,
            'Pos3':pos3_list,
            'Pos4':pos4_list,
            'Pos5':pos5_list,
            'PictureLink':picture_link
            
        }
    )
    
    return df
        
    

In [134]:
data2025 = pd.read_csv("../data/full_nba_data2025.csv")
denver = data2025[data2025['TeamAbv'] == 'DEN']
data2025

Unnamed: 0.1,Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,MP,...,Start(ET),Overtime,Attend.,InSeasonTournament,GameID,OpponentTeam,OpponentTeamAbv,InjTeamateCount,Starting,Top7InTeam
0,0,2024-10-22,Mikal Bridges,New York Knicks,False,False,False,10.0,0 days 00:34:37,34:37,...,7:30p,False,19156.0,False,20242210BOS,Boston Celtics,BOS,3.0,True,True
1,1,2024-10-22,OG Anunoby,New York Knicks,False,False,False,12.0,0 days 00:34:10,34:10,...,7:30p,False,19156.0,False,20242210BOS,Boston Celtics,BOS,3.0,True,True
2,2,2024-10-22,Jalen Brunson,New York Knicks,False,False,False,10.5,0 days 00:24:30,24:30,...,7:30p,False,19156.0,False,20242210BOS,Boston Celtics,BOS,3.0,True,True
3,3,2024-10-22,Josh Hart,New York Knicks,False,False,False,12.0,0 days 00:24:30,24:30,...,7:30p,False,19156.0,False,20242210BOS,Boston Celtics,BOS,3.0,True,True
4,4,2024-10-22,Karl-Anthony Towns,New York Knicks,False,False,False,16.5,0 days 00:23:37,23:37,...,7:30p,False,19156.0,False,20242210BOS,Boston Celtics,BOS,3.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44313,44313,2025-04-25,Leonard Miller,Minnesota Timberwolves,True,False,True,0.0,0 days 00:00:00,0,...,9:30p,False,19312.0,False,2025254MIN,Los Angeles Lakers,LAL,1.0,False,False
44314,44314,2025-04-25,Josh Minott,Minnesota Timberwolves,True,False,True,0.0,0 days 00:00:00,0,...,9:30p,False,19312.0,False,2025254MIN,Los Angeles Lakers,LAL,1.0,False,False
44315,44315,2025-04-25,Terrence Shannon Jr.,Minnesota Timberwolves,True,False,True,0.0,0 days 00:00:00,0,...,9:30p,False,19312.0,False,2025254MIN,Los Angeles Lakers,LAL,1.0,False,False
44316,44316,2025-04-25,Maxi Kleber,Los Angeles Lakers,False,True,True,0.0,0 days 00:00:00,0,...,9:30p,False,19312.0,False,2025254MIN,Minnesota Timberwolves,MIN,1.0,False,False


In [18]:
%pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.


In [52]:
# letters = [
#     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
#     'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z'
# ]

# all_names = pd.DataFrame()
# for letter in letters:
#     time.sleep(10)
#     print(letter)
#     url = f'https://www.basketball-reference.com/players/{letter}/'
#     response = requests.get(url)
#     response.encoding = 'utf-8'
#     soup = BeautifulSoup(response.text, 'html.parser')

#     table = soup.find('table', {'id': 'players'})
#     table_html = str(table)
#     df = pd.read_html(io.StringIO(table_html))[0]
#     df['Player'] = df['Player'].str.replace('*', '', regex=False)


#     rows = table.tbody.find_all('tr')
#     base_url = 'https://www.basketball-reference.com'
#     links = []

#     for row in rows:
#         if row.get('class') == ['thead']:
#             continue  # Skip subheaders
#         name_cell = row.find('th', {'data-stat': 'player'})
#         link_tag = name_cell.find('a')
#         if link_tag:
#             name = link_tag.text.strip().replace('*', '')
#             url = base_url + link_tag['href']
#             links.append({'Player': name, 'URL': url})


#     link_df = pd.DataFrame(links)
#     merged_df = pd.merge(df, link_df, on='Player', how='left')
#     all_names = pd.concat([all_names,merged_df])

    


a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
y
z


In [11]:
# all_names.to_csv(f'../data/clean_name.csv')

NameError: name 'all_names' is not defined

In [6]:
all_name_data = pd.read_csv("../data/clean_name.csv")
all_name_data

Unnamed: 0.1,Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,URL
0,0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke,https://www.basketball-reference.com/players/a...
1,1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State,https://www.basketball-reference.com/players/a...
2,2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947",UCLA,https://www.basketball-reference.com/players/a...
3,3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",LSU,https://www.basketball-reference.com/players/a...
4,4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974","Michigan, San Jose State",https://www.basketball-reference.com/players/a...
...,...,...,...,...,...,...,...,...,...,...
5408,15,Ante Žižić,2018,2020,F-C,6-10,266.0,"January 4, 1997",,https://www.basketball-reference.com/players/z...
5409,16,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University,https://www.basketball-reference.com/players/z...
5410,17,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne,https://www.basketball-reference.com/players/z...
5411,18,Ivica Zubac,2017,2025,C,7-0,240.0,"March 18, 1997",,https://www.basketball-reference.com/players/z...


##LLLLLLLLLLLLLLLLLLL
##LLLLLLLLLLLLLLLLLLL
##LLLLLLLLLLLLLLLLLLL
##LLLLLLLLLLLLLLLLLLL


In [8]:
def player_year_base_data(all_names):
    
    
    all_names['Birth Date'] = pd.to_datetime(all_names['Birth Date'])

    def convert_height(ht_str):
        
        try:
            feet, inches = ht_str.split('-')
            return int(feet) + int(inches) / 12
        except:
            return None

    all_names['Ht'] = all_names['Ht'].apply(convert_height)
    all_names['PlayerID'] = all_names['URL'].apply(lambda x: x.split('/')[-1].replace('.html', '') if isinstance(x, str) else None)
    all_names = all_names[['Player','From','To','Pos','Ht','Wt','Birth Date','Colleges','URL','PlayerID']]
    return all_names

    
player_data = player_year_base_data(all_name_data)

In [20]:

player_data_post1980 = player_data[player_data['To'] >= 1982]
player_data_post1980

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,URL,PlayerID
0,Alaa Abdelnaby,1991,1995,F-C,6.833333,240.0,1968-06-24,Duke,https://www.basketball-reference.com/players/a...,abdelal01
2,Kareem Abdul-Jabbar,1970,1989,C,7.166667,225.0,1947-04-16,UCLA,https://www.basketball-reference.com/players/a...,abdulka01
3,Mahmoud Abdul-Rauf,1991,2001,G,6.083333,162.0,1969-03-09,LSU,https://www.basketball-reference.com/players/a...,abdulma02
4,Tariq Abdul-Wahad,1998,2003,F,6.500000,223.0,1974-11-03,"Michigan, San Jose State",https://www.basketball-reference.com/players/a...,abdulta01
5,Shareef Abdur-Rahim,1997,2008,F,6.750000,225.0,1976-12-11,California,https://www.basketball-reference.com/players/a...,abdursh01
...,...,...,...,...,...,...,...,...,...,...
5406,Stephen Zimmerman,2017,2017,C,7.000000,240.0,1996-09-09,UNLV,https://www.basketball-reference.com/players/z...,zimmest01
5407,Paul Zipser,2017,2018,G-F,6.666667,215.0,1994-02-18,,https://www.basketball-reference.com/players/z...,zipsepa01
5408,Ante Žižić,2018,2020,F-C,6.833333,266.0,1997-01-04,,https://www.basketball-reference.com/players/z...,zizican01
5409,Jim Zoet,1983,1983,C,7.083333,240.0,1953-12-20,Kent State University,https://www.basketball-reference.com/players/z...,zoetji01


In [26]:

response = requests.get("https://www.basketball-reference.com/players/b/birdla01.html")
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', id='per_game_stats')
table_html1 = str(table)
df1 = pd.read_html(io.StringIO(table_html1))[0]
df1['G'] = pd.to_numeric(df1['G'], errors='coerce')
df1['Age'] = pd.to_numeric(df1['Age'], errors='coerce')
df1 = df1[(df1['G'] > 0) & df1['Age'] > 0]
df1

Unnamed: 0,Season,Age,Team,Lg,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
1,1979-80,23.0,BOS,NBA,PF,82.0,82,36.0,8.5,17.8,...,2.6,7.8,10.4,4.5,1.7,0.6,3.2,3.4,21.3,"MVP-4,ROY-1,AS,NBA1"
2,1980-81,24.0,BOS,NBA,PF,82.0,82,39.5,8.8,18.3,...,2.3,8.6,10.9,5.5,2.0,0.8,3.5,2.9,21.2,"MVP-2,AS,NBA1"
3,1981-82,25.0,BOS,NBA,PF,77.0,58,38.0,9.2,18.4,...,2.6,8.3,10.9,5.8,1.9,0.9,3.3,3.2,22.9,"MVP-2,AS,NBA1,DEF2"
4,1982-83,26.0,BOS,NBA,PF,79.0,79,37.7,9.5,18.7,...,2.4,8.6,11.0,5.8,1.9,0.9,3.0,2.5,23.6,"MVP-2,DPOY-3,AS,NBA1,DEF2"
5,1983-84,27.0,BOS,NBA,PF,79.0,77,38.3,9.6,19.5,...,2.3,7.8,10.1,6.6,1.8,0.9,3.0,2.5,24.2,"MVP-1,DPOY-12,AS,NBA1,DEF2"
6,1984-85,28.0,BOS,NBA,SF,80.0,77,39.5,11.5,22.0,...,2.1,8.5,10.5,6.6,1.6,1.2,3.1,2.6,28.7,"MVP-1,AS,NBA1"
7,1985-86,29.0,BOS,NBA,SF,82.0,81,38.0,9.7,19.6,...,2.3,7.5,9.8,6.8,2.0,0.6,3.2,2.2,25.8,"MVP-1,AS,NBA1"
8,1986-87,30.0,BOS,NBA,SF,74.0,73,40.6,10.6,20.2,...,1.7,7.5,9.2,7.6,1.8,0.9,3.2,2.5,28.1,"MVP-3,AS,NBA1"
9,1987-88,31.0,BOS,NBA,SF,76.0,75,39.0,11.6,22.0,...,1.4,7.8,9.3,6.1,1.6,0.8,2.8,2.1,29.9,"MVP-2,AS,NBA1"
10,1988-89,32.0,BOS,NBA,SF,6.0,6,31.5,8.2,17.3,...,0.2,6.0,6.2,4.8,1.0,0.8,1.8,3.0,19.3,


In [19]:
test1 = player_data.iloc[[0]]
firsttest = get_extra_data(test1)
firsttest

Fetching: https://www.basketball-reference.com/players/a/abdelal01.html
1
PF
Found Link: https://www.sports-reference.com/cbb/players/alaa-abdelnaby-1.html?utm_medium=sr_xsite&utm_source=bbr&utm_campaign=2023_02_tbl_player_college_stats&utm_content=lnk_mcbb&utm_id=abdelal01
    Season    Team    Conf   Class  Pos    G  GS    MP   FG  FGA  ...  FTA  \
0  1986-87    Duke     ACC      FR    F   29   0   6.6  1.6  2.8  ...  0.8   
1  1987-88    Duke     ACC      SO    F   34   1   9.4  1.8  3.6  ...  1.9   
2  1988-89    Duke     ACC      JR    F   33  19  16.1  3.7  5.9  ...  2.0   
3  1989-90    Duke     ACC      SR    F   38  37  24.9  5.7  9.2  ...  4.7   
4   Career  Career  Career  Career  NaN  134  57  14.8  3.3  5.6  ...  2.5   

     FT%  TRB  AST  STL  BLK  TOV   PF   PTS  Awards  
0  0.522  1.7  0.2  0.0  0.1  0.4  1.1   3.7     NaN  
1  0.698  2.0  0.1  0.3  0.1  1.0  1.9   4.9     NaN  
2  0.701  3.8  0.3  0.4  0.5  1.2  2.4   8.9     NaN  
3  0.775  6.6  0.7  0.7  1.3  1.4  2

Unnamed: 0,Player,Ht,Wt,Birth Date,Colleges,URL,PlayerID,Pos1,Pos2,Pos3,...,FGA,FG%,PTS,TRB,AST,STL,BLK,TOV,PF,YearsPlayed
0,Alaa Abdelnaby,6.833333,240.0,1968-06-24,Duke,https://www.basketball-reference.com/players/a...,abdelal01,PF,,,...,5.6,0.599,8.5,3.7,0.4,0.4,0.5,1.0,2.1,4


In [17]:
def get_extra_data(small_df):
    url = small_df['URL'].iloc[0]
    print(f"Fetching: {url}")

    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    data = {
        'Position': None,
        'Shoots': None,
        'Recruiting Rank': None,
        'Draft': None
    }
    
    
    table = soup.find('table', id='per_game_stats')
    table_html1 = str(table)
    df1 = pd.read_html(io.StringIO(table_html1))[0]
    df1['G'] = pd.to_numeric(df1['G'], errors='coerce')
    df1['Age'] = pd.to_numeric(df1['Age'], errors='coerce')
    df1 = df1[(df1['G'] > 0) & df1['Age'] > 0]
    df1




    for p in soup.find_all('p'):
        for strong in p.find_all('strong'):
            label = strong.get_text(strip=True)

            if label == 'Position:':
                data['Position'] = strong.next_sibling.strip() if strong.next_sibling else None
                data['Position'] = data['Position'].split('\n')[0]

            elif label == 'Shoots:':
                data['Shoots'] = strong.next_sibling.strip() if strong.next_sibling else None

            elif label == 'Recruiting Rank:':
                a_tag = strong.find_next_sibling('a')
                extra_text = a_tag.next_sibling.strip() if a_tag and a_tag.next_sibling else ''
                if a_tag:
                    data['Recruiting Rank'] = a_tag.get_text(strip=True) + ' ' + extra_text
            
            elif label == 'Draft:':
                a_tag = strong.find_next_sibling('a')
                extra_text = a_tag.next_sibling.strip() if a_tag and a_tag.next_sibling else ''
                if a_tag:
                    data['Draft'] = a_tag.get_text(strip=True) + ' ' + extra_text

    poslen = len(data['Position'].split(','))
    print(poslen)
    
    try:
        Pos1 = data['Position'].split(',')[0]
        try:
            Pos1 = Pos1.split('and ')[1]
        except:
            pass
        Pos1 = Pos1.strip()
    except:
        Pos1 = np.NaN
    try:
        Pos2 = data['Position'].split(',')[1]
        try:
            Pos2 = Pos2.split('and ')[1]
        except:
            pass
        Pos2 = Pos2.strip()
    except:
        Pos2 = np.NaN
    try:
        Pos3 = data['Position'].split(',')[2]
        try:
            Pos3 = Pos3.split('and ')[1]
        except:
            pass
        Pos3 = Pos3.strip()
    except:
        Pos3 = np.NaN
    try:
        Pos4 = data['Position'].split(',')[3]
        try:
            Pos4 = Pos4.split('and ')[1]
        except:
            pass
        Pos4 = Pos4.strip()
    except:
        Pos4 = np.NaN
    try:
        Pos5 = data['Position'].split(',')[4]
        try:
            Pos5 = Pos5.split('and ')[1]
        except:
            pass
        Pos5 = Pos5.strip()
    except:
        Pos5 = np.NaN
    try:   
        DraftedCalYear = data['Recruiting Rank'].split(" ")[0]
    except:
        DraftedCalYear = None
    try:   
        RecruitingRank = data['Recruiting Rank'].split(" ")[1]
    except:
        RecruitingRank = None
    try:
        TeamDrafted = data['Draft'].split(",")[0].strip()
    except:
        TeamDrafted = None
    try:
        PickDrafted = data['Draft'].split(",")[1].split(",")[0] 
    except:
        PickDrafted = None

        
    position_dict = {
        'Point Guard':"PG",
        'Shooting Guard' : "SG",
        'Small Forward' : "SF",
        'Power Forward' : "PF",
        'Center' : "C",
        'Forward' : "SF",
        'Guard' : 'PG',
        np.NaN : np.NaN
    }
    
    Pos1 = position_dict[Pos1]
    Pos2 = position_dict[Pos2]
    Pos3 = position_dict[Pos3]
    Pos4 = position_dict[Pos4]
    Pos5 = position_dict[Pos5]
    
    
    new_df = pd.DataFrame({
        'Pos1': [Pos1],
        'Pos2': [Pos2],
        'Pos3': [Pos3],
        'Pos4': [Pos4],
        'Pos5': [Pos5],
        'DraftedCalYear': [DraftedCalYear],
        'RecuitRank': [RecruitingRank],
        'TeamDrafted': [TeamDrafted],
        'PickDrafted': [PickDrafted]
    })
    
      
    print(Pos1)  
    small_df = small_df.reset_index()
    new_df = new_df.reset_index()
    combined_df = pd.concat([small_df, new_df], axis=1)
    combined_df = combined_df.reset_index()
    
    college_link = get_college_link(url)
    
    if college_link == None:
        college_titles = ['G','GS','MP','FG','FGA','FG%','PTS','TRB','AST','STL','BLK','TOV','PF','YearsPlayed']
        college_data = pd.DataFrame([[0] * len(college_titles)], columns=college_titles)
    else:
        college_data = college_stats_website(college_link)
        college_data = college_data.reset_index()
        print(college_data)
        
        
    combined_df = pd.concat([combined_df, college_data], axis = 1)
    
    combined_df = combined_df[['Player','Ht','Wt','Birth Date','Colleges','URL','PlayerID',
                              'Pos1','Pos2','Pos3','Pos4','Pos5','DraftedCalYear','RecuitRank','TeamDrafted','PickDrafted',
                              'G','GS','MP','FG','FGA','FG%','PTS','TRB','AST','STL','BLK','TOV','PF','YearsPlayed']]
    
    combined_df['Birth Date'] = pd.to_datetime(combined_df['Birth Date'], format = 'mixed')
    
    return combined_df
    
#reaves = get_extra_data(testing)

In [16]:
def get_college_link(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the <a> tag with the link text that includes "More College Stats on SR/CBB"
    more_stats_link = soup.find('a', string="More College Stats on SR/CBB")

    # Check if the link is found
    if more_stats_link:
        link = more_stats_link.get('href')  # Get the href attribute value
        print(f"Found Link: {link}")
        return link
    else:
        print("Link not found.")
        return None

    
    
# Example usage
#url = testing['URL'].iloc[0]
#college_stats_df = get_more_college_stats_link("https://www.basketball-reference.com/players/j/jamesle01.html")

In [14]:
def college_stats_website(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', id='players_per_game')
    
    
    table_html1 = str(table)
    df1 = pd.read_html(io.StringIO(table_html1))[0]
    
    print(df1)
    career_row = df1[df1['Season'] == 'Career']
    df1['G'] = pd.to_numeric(df1['G'], errors='coerce')
    yearsplayed = ((df1['Season'].str.match(r'^\d')) & (df1['G'] > 0)).sum()


    career_row = career_row.drop(columns=['Season', 'Team','Conf','Class','Pos','Awards'], errors='ignore')


    career_row['YearsPlayed'] = yearsplayed


    career_row = career_row[['G','GS','MP','FG','FGA','FG%','PTS','TRB','AST','STL','BLK','TOV','PF','YearsPlayed']]

    return career_row
    

    
college_stats_website(get_college_link('https://www.basketball-reference.com/players/r/reaveau01.html'))

Found Link: https://www.sports-reference.com/cbb/players/austin-reaves-1.html?utm_medium=sr_xsite&utm_source=bbr&utm_campaign=2023_02_tbl_player_college_stats&utm_content=lnk_mcbb&utm_id=reaveau01
                  Season                     Team                     Conf  \
0                2016-17            Wichita State                      MVC   
1                2017-18            Wichita State                      AAC   
2                2018-19  Did not play - transfer  Did not play - transfer   
3                2019-20                 Oklahoma                   Big 12   
4                2020-21                 Oklahoma                   Big 12   
5                 Career                   Career                   Career   
6                    NaN                      NaN                      NaN   
7       Oklahoma (2 Yrs)         Oklahoma (2 Yrs)         Oklahoma (2 Yrs)   
8  Wichita State (2 Yrs)    Wichita State (2 Yrs)    Wichita State (2 Yrs)   

                     C

Unnamed: 0,G,GS,MP,FG,FGA,FG%,PTS,TRB,AST,STL,BLK,TOV,PF,YearsPlayed
5,122,67,24.5,3.3,7.8,0.421,10.8,3.8,2.6,0.7,0.3,1.7,1.5,4


In [53]:
def match_starters_only(yeardata):
    starters = list(yeardata['Starters'].unique())
    clean_names = pd.read_csv("../data/clean_name.csv")['Player'].unique()

    matches = []
    scorers = {
        'ratio': fuzz.ratio,
        'partial_ratio': fuzz.partial_ratio,
        'token_sort_ratio': fuzz.token_sort_ratio,
        'token_set_ratio': fuzz.token_set_ratio
    }

    for name in starters:
        best_matches = {'Original': name}
        for scorer_name, scorer_func in scorers.items():
            match, score, _ = process.extractOne(name, clean_names, scorer=scorer_func)
            best_matches[f'Matched_{scorer_name}'] = match
            best_matches[f'Score_{scorer_name}'] = score
        matches.append(best_matches)

    matches_df = pd.DataFrame(matches)

    def resolve_best_match(row):
        matched_names = [
            row['Matched_ratio'],
            row['Matched_partial_ratio'],
            row['Matched_token_sort_ratio'],
            row['Matched_token_set_ratio']
        ]
        name_counts = Counter(matched_names)
        return name_counts.most_common(1)[0][0]

    matches_df['Best_Match'] = matches_df.apply(resolve_best_match, axis=1)

    return matches_df

In [54]:
def apply_clean_names(df_original, matches_df):
    return df_original.merge(
        matches_df[['Original', 'Best_Match']],
        left_on='Starters',
        right_on='Original',
        how='left'
    ).assign(Starters=lambda df: df['Best_Match']).drop(columns=['Original', 'Best_Match'])


In [55]:
def clean_names_df(yeardata):
    matches_df = match_starters_only(yeardata)
    problems_df = get_problematic_matches(matches_df)


    bad_names = set(problems_df['Original'])
    yeardata_filtered = yeardata[~yeardata['Starters'].isin(bad_names)].copy()

    yeardata_cleaned = apply_clean_names(yeardata_filtered, matches_df)

    return yeardata_cleaned

In [142]:
clean_data2024 = clean_names_df(data2024)
clean_data2024

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Date,Starters,TeamName,WonGame,Injured,DidNotPlay,FantasyPoints,MPTimeDelta,...,Start(ET),Overtime,Attend.,InSeasonTournament,GameID,OpponentTeam,OpponentTeamAbv,InjTeamateCount,Starting,Top7InTeam
0,0,0,2023-10-24,D'Angelo Russell,Los Angeles Lakers,False,False,False,16.5,0 days 00:36:11,...,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
1,1,1,2023-10-24,Anthony Davis,Los Angeles Lakers,False,False,False,23.0,0 days 00:34:09,...,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
2,2,2,2023-10-24,Austin Reaves,Los Angeles Lakers,False,False,False,21.5,0 days 00:31:20,...,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
3,3,3,2023-10-24,Taurean Prince,Los Angeles Lakers,False,False,False,16.0,0 days 00:29:53,...,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
4,4,4,2023-10-24,LeBron James,Los Angeles Lakers,False,False,False,26.0,0 days 00:29:00,...,7:30p,False,19842,False,20232410DEN,Denver Nuggets,DEN,5.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43529,43629,43629,2024-04-19,Matt Ryan,New Orleans Pelicans,True,False,True,0.0,0 days 00:00:00,...,9:30p,False,18656,False,2024194NOP,Sacramento Kings,SAC,1.0,False,False
43530,43630,43630,2024-04-19,Cody Zeller,New Orleans Pelicans,True,False,True,0.0,0 days 00:00:00,...,9:30p,False,18656,False,2024194NOP,Sacramento Kings,SAC,1.0,False,False
43531,43631,43631,2024-04-19,Kevin Huerter,Sacramento Kings,False,True,True,0.0,0 days 00:00:00,...,9:30p,False,18656,False,2024194NOP,New Orleans Pelicans,NOP,2.0,False,False
43532,43632,43632,2024-04-19,Malik Monk,Sacramento Kings,False,True,True,0.0,0 days 00:00:00,...,9:30p,False,18656,False,2024194NOP,New Orleans Pelicans,NOP,2.0,False,False


In [146]:

def year_base_player_data(year, type_data = 'regsea'):
    
    if type_data == 'regsea':
        year_data = pd.read_csv(f"../data/regularseason/regsea_nba_data{year}.csv")
    elif type_data == 'playoffs':
        year_data = pd.read_csv(f"../data/playoffs/playoff_nba_data{year}.csv")
    elif type_data == 'full':
        year_data = pd.read_csv(f"../data/full_nba_data{year}.csv")
        
        
    cleaned_names = clean_names_df(year_data)
    return player_year_base_data(cleaned_names, year)


year_base_player_data(2024)

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,URL,PlayerID
848,Jaylen Clark,2025,2025,G,6.416667,205.0,2001-10-13,UCLA,https://www.basketball-reference.com/players/c...,clarkja02
3542,Miles Norris,2025,2025,F,6.833333,220.0,2000-04-15,"Oregon, City College of San Francisco, UC Sant...",https://www.basketball-reference.com/players/n...,norrimi01
3042,Jaylen Martin,2025,2025,F,6.500000,216.0,2004-01-28,,https://www.basketball-reference.com/players/m...,martija02
3234,Nathan Mensah,2024,2024,F,6.666667,230.0,1998-04-09,San Diego State,https://www.basketball-reference.com/players/m...,mensana01
4399,Dmytro Skapintsev,2024,2024,C,7.083333,260.0,1998-05-12,Cal State Northridge,https://www.basketball-reference.com/players/s...,skapidm01
...,...,...,...,...,...,...,...,...,...,...
5388,Thaddeus Young,2008,2024,F,6.666667,235.0,1988-06-21,Georgia Tech,https://www.basketball-reference.com/players/y...,youngth01
4841,P.J. Tucker,2007,2025,F,6.416667,245.0,1985-05-05,Texas,https://www.basketball-reference.com/players/t...,tuckepj01
2923,Kyle Lowry,2007,2025,G,6.000000,196.0,1986-03-25,Villanova,https://www.basketball-reference.com/players/l...,lowryky01
3717,Chris Paul,2006,2025,G,6.000000,175.0,1985-05-06,Wake Forest,https://www.basketball-reference.com/players/p...,paulch01


In [159]:
test2024 = year_base_player_data(2024)
test2024

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,URL,PlayerID
848,Jaylen Clark,2025,2025,G,6.416667,205.0,2001-10-13,UCLA,https://www.basketball-reference.com/players/c...,clarkja02
3542,Miles Norris,2025,2025,F,6.833333,220.0,2000-04-15,"Oregon, City College of San Francisco, UC Sant...",https://www.basketball-reference.com/players/n...,norrimi01
3042,Jaylen Martin,2025,2025,F,6.500000,216.0,2004-01-28,,https://www.basketball-reference.com/players/m...,martija02
3234,Nathan Mensah,2024,2024,F,6.666667,230.0,1998-04-09,San Diego State,https://www.basketball-reference.com/players/m...,mensana01
4399,Dmytro Skapintsev,2024,2024,C,7.083333,260.0,1998-05-12,Cal State Northridge,https://www.basketball-reference.com/players/s...,skapidm01
...,...,...,...,...,...,...,...,...,...,...
5388,Thaddeus Young,2008,2024,F,6.666667,235.0,1988-06-21,Georgia Tech,https://www.basketball-reference.com/players/y...,youngth01
4841,P.J. Tucker,2007,2025,F,6.416667,245.0,1985-05-05,Texas,https://www.basketball-reference.com/players/t...,tuckepj01
2923,Kyle Lowry,2007,2025,G,6.000000,196.0,1986-03-25,Villanova,https://www.basketball-reference.com/players/l...,lowryky01
3717,Chris Paul,2006,2025,G,6.000000,175.0,1985-05-06,Wake Forest,https://www.basketball-reference.com/players/p...,paulch01


In [257]:
testing = test2024[test2024['Player'] == 'Kyrie Irving']
testing

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,URL,PlayerID
2248,Kyrie Irving,2012,2025,G,6.166667,195.0,1992-03-23,Duke,https://www.basketball-reference.com/players/i...,irvinky01


In [266]:
reaves

Unnamed: 0,Player,Ht,Wt,Birth Date,Colleges,URL,PlayerID,Pos1,Pos2,Pos3,...,FGA,FG%,PTS,TRB,AST,STL,BLK,TOV,PF,YearsPlayed
0,Kyrie Irving,6.166667,195.0,1992-03-23,Duke,https://www.basketball-reference.com/players/i...,irvinky01,SG,,,...,9.5,0.529,17.5,3.4,4.3,1.5,0.5,2.5,2.1,1


In [259]:
kryie

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,URL,PlayerID,...,FGA,FG%,PTS,TRB,AST,STL,BLK,TOV,PF,YearsPlayed
0,Kyrie Irving,2012,2025,G,6.166667,195.0,1992-03-23,Duke,https://www.basketball-reference.com/players/i...,irvinky01,...,9.5,0.529,17.5,3.4,4.3,1.5,0.5,2.5,2.1,1


In [128]:
def reg_playoff_split(year, save_to_file = False):
    base_df = pd.read_csv(f"../data/full_nba_data{year}.csv")
    base_df['Date'] = pd.to_datetime(base_df['Date'])
    playoffdf = base_df[base_df['Date'] >= nba_playoff_start_dates_dt[year]]
    regdf = base_df[base_df['Date'] < nba_playoff_start_dates_dt[year]]
    
    print(regdf.groupby('TeamAbv')['GameID'].nunique())
    
    if save_to_file:
        playoffdf.to_csv(f'../data/playoffs/playoff_nba_data{year}.csv')
        regdf.to_csv(f'../data/regularseason/regsea_nba_data{year}.csv')
        
    
    return playoffdf, regdf

reg_playoff_split(2018, True)

TeamAbv
ATL    82
BOS    82
BRK    82
CHI    82
CHO    82
CLE    82
DAL    82
DEN    82
DET    82
GSW    82
HOU    82
IND    82
LAC    82
LAL    82
MEM    82
MIA    82
MIL    82
MIN    82
NOP    82
NYK    82
OKC    82
ORL    82
PHI    82
PHO    82
POR    82
SAC    82
SAS    82
TOR    82
UTA    82
WAS    82
Name: GameID, dtype: int64


(       Unnamed: 0       Date           Starters               TeamName  \
 40680       40680 2018-04-14        Patty Mills      San Antonio Spurs   
 40681       40681 2018-04-14  LaMarcus Aldridge      San Antonio Spurs   
 40682       40682 2018-04-14    Dejounte Murray      San Antonio Spurs   
 40683       40683 2018-04-14        Danny Green      San Antonio Spurs   
 40684       40684 2018-04-14      Kyle Anderson      San Antonio Spurs   
 ...           ...        ...                ...                    ...   
 43134       43134 2018-06-08    Jordan Clarkson    Cleveland Cavaliers   
 43135       43135 2018-06-08         Quinn Cook  Golden State Warriors   
 43136       43136 2018-06-08       Damian Jones  Golden State Warriors   
 43137       43137 2018-06-08   Kendrick Perkins    Cleveland Cavaliers   
 43138       43138 2018-06-08        Okaro White    Cleveland Cavaliers   
 
        WonGame  Injured  DidNotPlay  FantasyPoints      MPTimeDelta     MP  \
 40680    False    

In [130]:
reg_playoff_split(2000, True)

TeamAbv
ATL    82
BOS    82
CHH    82
CHI    82
CLE    82
DAL    82
DEN    82
DET    82
GSW    82
HOU    82
IND    82
LAC    82
LAL    82
MIA    82
MIL    82
MIN    82
NJN    82
NYK    82
ORL    82
PHI    82
PHO    82
POR    82
SAC    82
SAS    82
SEA    82
TOR    82
UTA    82
VAN    82
WAS    82
Name: GameID, dtype: int64


(       Unnamed: 0       Date       Starters            TeamName  WonGame  \
 28397       28397 2000-04-22  Allen Iverson  Philadelphia 76ers     True   
 28398       28398 2000-04-22      Eric Snow  Philadelphia 76ers     True   
 28399       28399 2000-04-22   Theo Ratliff  Philadelphia 76ers     True   
 28400       28400 2000-04-22    Tyrone Hill  Philadelphia 76ers     True   
 28401       28401 2000-04-22   George Lynch  Philadelphia 76ers     True   
 ...           ...        ...            ...                 ...      ...   
 30192       30192 2000-06-19   Derek Fisher  Los Angeles Lakers     True   
 30193       30193 2000-06-19     Brian Shaw  Los Angeles Lakers     True   
 30194       30194 2000-06-19  Devean George  Los Angeles Lakers     True   
 30195       30195 2000-06-19  Travis Knight  Los Angeles Lakers     True   
 30196       30196 2000-06-19    John Salley  Los Angeles Lakers     True   
 
        Injured  DidNotPlay  FantasyPoints      MPTimeDelta     MP  ...  \