In [4]:
import time
from io import StringIO
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Big 12 Scrape

In [5]:
# --- Scrape configuration -------------------------------------------------

# URL template for a conference season page; "{}" is filled with the year.
base_url = "https://www.sports-reference.com/cbb/conferences/big-12/men/{}-stats.html"

# Seasons to fetch: 2003 through 2023 inclusive (range's stop is exclusive).
years = [*range(2003, 2024)]

# Columns kept from each season table, in output order.
columns = [
    "Player", "Team", "Pos",
    'PTS', 'AST', "TRB", 'ORB', 'DRB', 'STL', 'BLK', 'TOV',
    "G", "MP",
    "FG", "FGA", "FG%", "eFG%",
    'FT', 'FTA', 'FT%',
]

# Accumulator for the combined result; starts out empty.
all_data = pd.DataFrame()

In [6]:
# Fetch one season page per year and stack the results into `all_data`.

# Pause between requests, in seconds. Sports Reference's published bot policy
# blocks clients that send more than 20 requests per minute, so a 1-second
# pause is too aggressive for 21 pages; stay comfortably under the limit.
request_delay_seconds = 4

season_frames = []
for year in years:
    url = base_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx responses (e.g. when the site rate-limits us)
    # instead of trying to parse an error page as if it held the stats table.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect every <table> element on the page
    tables = soup.find_all('table')
    if not tables:
        raise ValueError(f"No <table> elements found at {url}")

    # Parse the markup and keep the first table pandas finds. The markup is
    # wrapped in StringIO because passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(tables)))[0]

    # Keep only the columns of interest and tag every row with its season.
    # .assign returns a new frame, so no write happens on a slice of `df`.
    season_frames.append(df[columns].assign(Year=year))

    # rate limiting so the server is not hammered
    time.sleep(request_delay_seconds)

# Concatenate once at the end (repeated concat inside the loop re-copies all
# earlier rows every iteration) and renumber the rows 0..n-1.
all_data = pd.concat(season_frames, ignore_index=True)

In [7]:
# Preview the combined scrape result (pandas elides middle rows and columns).
all_data

Unnamed: 0,Player,Team,Pos,PTS,AST,TRB,ORB,DRB,STL,BLK,...,G,MP,FG,FGA,FG%,eFG%,FT,FTA,FT%,Year
0,Andre Emmett,Texas Tech,G,21.8,1.9,6.6,2.3,4.4,1.9,0.5,...,34.0,37.0,8.7,17.5,0.499,0.508,4.0,5.6,0.712,2003
1,Nick Collison,Kansas,F,18.5,2.2,10.0,3.3,6.7,1.2,1.9,...,38.0,32.4,7.4,13.3,0.554,0.567,3.3,5.3,0.635,2003
2,Hollis Price,Oklahoma,G,18.0,2.8,2.7,0.5,2.3,1.6,0.2,...,34.0,33.6,5.7,12.6,0.453,0.563,3.8,4.1,0.929,2003
3,Rickey Paulding,Missouri,G,17.4,2.2,5.5,1.9,3.6,0.7,0.4,...,33.0,34.9,6.0,13.4,0.450,0.535,3.0,3.8,0.800,2003
4,Kirk Hinrich,Kansas,G,17.3,3.5,3.8,1.0,2.7,1.9,0.4,...,37.0,33.5,6.3,13.2,0.475,0.567,2.4,3.4,0.704,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3153,Brooks Manzer,Oklahoma State,G,0.0,0.3,0.3,0.0,0.3,0.0,0.0,...,3.0,1.3,0.0,0.0,,,0.0,0.0,,2023
3154,Carson Sager,Oklahoma State,G,0.0,0.0,0.1,0.0,0.1,0.0,0.0,...,7.0,1.1,0.0,0.0,,,0.0,0.0,,2023
3155,Trey Stuart,TCU,G,0.0,0.0,0.3,0.2,0.2,0.0,0.0,...,6.0,1.0,0.0,0.3,0.000,0.000,0.0,0.0,,2023
3156,Zach Gonsoulin,TCU,G,0.0,0.0,0.2,0.0,0.2,0.0,0.0,...,6.0,1.0,0.0,0.2,0.000,0.000,0.0,0.0,,2023


In [8]:
# Drop the per-season 'League Average' summary rows so only player rows remain.
season_avg_df = all_data.loc[all_data['Player'].ne('League Average')]

In [9]:
# Preview the frame after removing the 'League Average' rows.
season_avg_df

Unnamed: 0,Player,Team,Pos,PTS,AST,TRB,ORB,DRB,STL,BLK,...,G,MP,FG,FGA,FG%,eFG%,FT,FTA,FT%,Year
0,Andre Emmett,Texas Tech,G,21.8,1.9,6.6,2.3,4.4,1.9,0.5,...,34.0,37.0,8.7,17.5,0.499,0.508,4.0,5.6,0.712,2003
1,Nick Collison,Kansas,F,18.5,2.2,10.0,3.3,6.7,1.2,1.9,...,38.0,32.4,7.4,13.3,0.554,0.567,3.3,5.3,0.635,2003
2,Hollis Price,Oklahoma,G,18.0,2.8,2.7,0.5,2.3,1.6,0.2,...,34.0,33.6,5.7,12.6,0.453,0.563,3.8,4.1,0.929,2003
3,Rickey Paulding,Missouri,G,17.4,2.2,5.5,1.9,3.6,0.7,0.4,...,33.0,34.9,6.0,13.4,0.450,0.535,3.0,3.8,0.800,2003
4,Kirk Hinrich,Kansas,G,17.3,3.5,3.8,1.0,2.7,1.9,0.4,...,37.0,33.5,6.3,13.2,0.475,0.567,2.4,3.4,0.704,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3152,Yaya Keita,Oklahoma,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.5,0.0,0.0,,,0.0,0.0,,2023
3153,Brooks Manzer,Oklahoma State,G,0.0,0.3,0.3,0.0,0.3,0.0,0.0,...,3.0,1.3,0.0,0.0,,,0.0,0.0,,2023
3154,Carson Sager,Oklahoma State,G,0.0,0.0,0.1,0.0,0.1,0.0,0.0,...,7.0,1.1,0.0,0.0,,,0.0,0.0,,2023
3155,Trey Stuart,TCU,G,0.0,0.0,0.3,0.2,0.2,0.0,0.0,...,6.0,1.0,0.0,0.3,0.000,0.000,0.0,0.0,,2023


In [10]:
# Show the current column labels (mixed case, as selected during the scrape).
season_avg_df.columns

Index(['Player', 'Team', 'Pos', 'PTS', 'AST', 'TRB', 'ORB', 'DRB', 'STL',
       'BLK', 'TOV', 'G', 'MP', 'FG', 'FGA', 'FG%', 'eFG%', 'FT', 'FTA', 'FT%',
       'Year'],
      dtype='object')

In [11]:
# Column labels in the same order as season_avg_df.columns; lowercased in a
# later step before being assigned back to the frame.
new_column = [
    'Player', 'Team', 'Pos',
    'PTS', 'AST', 'TRB', 'ORB', 'DRB', 'STL', 'BLK', 'TOV',
    'G', 'MP',
    'FG', 'FGA', 'FG%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'Year',
]

In [12]:
# Normalise every label to lowercase.
new_column = list(map(str.lower, new_column))

In [13]:
# Replace the column labels in place with the lowercase versions. This relies
# on new_column listing the labels in the same order as the existing columns.
season_avg_df.columns = new_column

In [14]:
# Confirm the column labels are now lowercase.
season_avg_df

Unnamed: 0,player,team,pos,pts,ast,trb,orb,drb,stl,blk,...,g,mp,fg,fga,fg%,efg%,ft,fta,ft%,year
0,Andre Emmett,Texas Tech,G,21.8,1.9,6.6,2.3,4.4,1.9,0.5,...,34.0,37.0,8.7,17.5,0.499,0.508,4.0,5.6,0.712,2003
1,Nick Collison,Kansas,F,18.5,2.2,10.0,3.3,6.7,1.2,1.9,...,38.0,32.4,7.4,13.3,0.554,0.567,3.3,5.3,0.635,2003
2,Hollis Price,Oklahoma,G,18.0,2.8,2.7,0.5,2.3,1.6,0.2,...,34.0,33.6,5.7,12.6,0.453,0.563,3.8,4.1,0.929,2003
3,Rickey Paulding,Missouri,G,17.4,2.2,5.5,1.9,3.6,0.7,0.4,...,33.0,34.9,6.0,13.4,0.450,0.535,3.0,3.8,0.800,2003
4,Kirk Hinrich,Kansas,G,17.3,3.5,3.8,1.0,2.7,1.9,0.4,...,37.0,33.5,6.3,13.2,0.475,0.567,2.4,3.4,0.704,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3152,Yaya Keita,Oklahoma,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.5,0.0,0.0,,,0.0,0.0,,2023
3153,Brooks Manzer,Oklahoma State,G,0.0,0.3,0.3,0.0,0.3,0.0,0.0,...,3.0,1.3,0.0,0.0,,,0.0,0.0,,2023
3154,Carson Sager,Oklahoma State,G,0.0,0.0,0.1,0.0,0.1,0.0,0.0,...,7.0,1.1,0.0,0.0,,,0.0,0.0,,2023
3155,Trey Stuart,TCU,G,0.0,0.0,0.3,0.2,0.2,0.0,0.0,...,6.0,1.0,0.0,0.3,0.000,0.000,0.0,0.0,,2023


In [15]:
# Stat columns to aggregate per player (every column except player/team/pos/year).
numeric_columns = [
    'pts', 'ast', 'trb', 'orb', 'drb', 'stl', 'blk', 'tov',
    'g', 'mp',
    'fg', 'fga', 'fg%', 'efg%',
    'ft', 'fta', 'ft%',
]

In [16]:
# Collapse the season rows to one row per player name by taking the plain
# (unweighted) mean of each stat column across that player's seasons.
# NOTE(review): seasons are not weighted by games played, and rows are keyed
# on the name alone, so two players sharing a name would be merged -- confirm
# this is intended.
per_player = season_avg_df.groupby('player')[numeric_columns]
career_avg_df = per_player.mean().reset_index()

In [17]:
# Round every numeric column to two decimal places.
career_avg_df = career_avg_df.round(decimals=2)

In [18]:
# Rank players by points, highest first, and renumber the rows 0..n-1.
career_avg_df = career_avg_df.sort_values(
    'pts',
    ascending=False,
    ignore_index=True,
)

In [19]:
# Keep an independent snapshot of the full (unfiltered) career averages.
# .copy() makes this a separate object rather than a second name for the same
# frame, so in-place edits to either one cannot leak into the other.
# NOTE(review): the "acc" prefix looks inherited from another notebook; this
# data is Big 12 -- confirm before relying on the name.
acc_career_avg_df = career_avg_df.copy()

In [20]:
# Mean minutes per game and mean games played across all players.
# Presumably used to choose the filtering cutoffs applied later -- confirm.
career_avg_df['mp'].mean(), career_avg_df['g'].mean()

(15.290613915416099, 23.108712534059944)

In [21]:
# Inspect the career averages, ordered by points (highest first).
career_avg_df

Unnamed: 0,player,pts,ast,trb,orb,drb,stl,blk,tov,g,mp,fg,fga,fg%,efg%,ft,fta,ft%
0,Trae Young,27.4,8.70,3.9,0.40,3.5,1.7,0.3,5.2,32.0,35.4,8.20,19.3,0.42,0.52,7.40,8.6,0.86
1,Michael Beasley,26.2,1.20,12.4,4.00,8.4,1.3,1.6,2.9,33.0,31.5,9.30,17.5,0.53,0.56,6.50,8.5,0.77
2,Kevin Durant,25.8,1.30,11.1,3.00,8.1,1.9,1.9,2.8,35.0,35.9,8.70,18.5,0.47,0.54,6.00,7.3,0.82
3,Andre Emmett,21.2,1.85,6.6,2.15,4.5,1.6,0.4,2.1,34.0,37.0,8.15,16.0,0.51,0.53,4.45,6.1,0.72
4,Cade Cunningham,20.1,3.50,6.2,0.70,5.5,1.6,0.8,4.0,27.0,35.4,6.50,14.8,0.44,0.52,4.90,5.8,0.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463,Songo Adoki,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.00,0.0,,,0.00,0.0,
1464,Grayson George,0.0,0.00,0.3,0.20,0.2,0.0,0.0,0.0,6.0,0.5,0.00,0.0,,,0.00,0.0,
1465,Guy Ikpah,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,1.0,1.0,0.00,0.0,,,0.00,0.0,
1466,Patrick Muldoon,0.0,0.00,0.5,0.10,0.4,0.0,0.0,0.3,8.0,1.1,0.00,0.3,0.00,0.00,0.00,0.0,


In [22]:
# Keep only players averaging more than 20 games and more than 15 minutes;
# both conditions must hold, and the original row labels are preserved.
career_avg_df = career_avg_df.loc[
    (career_avg_df['g'] > 20) & (career_avg_df['mp'] > 15)
]

In [23]:
# Inspect the players that pass the games/minutes filter; the index keeps each
# player's position from the unfiltered ranking, hence the gaps.
career_avg_df

Unnamed: 0,player,pts,ast,trb,orb,drb,stl,blk,tov,g,mp,fg,fga,fg%,efg%,ft,fta,ft%
0,Trae Young,27.4,8.70,3.90,0.40,3.50,1.70,0.30,5.20,32.0,35.40,8.20,19.30,0.42,0.52,7.40,8.60,0.86
1,Michael Beasley,26.2,1.20,12.40,4.00,8.40,1.30,1.60,2.90,33.0,31.50,9.30,17.50,0.53,0.56,6.50,8.50,0.77
2,Kevin Durant,25.8,1.30,11.10,3.00,8.10,1.90,1.90,2.80,35.0,35.90,8.70,18.50,0.47,0.54,6.00,7.30,0.82
3,Andre Emmett,21.2,1.85,6.60,2.15,4.50,1.60,0.40,2.10,34.0,37.00,8.15,16.00,0.51,0.53,4.45,6.10,0.72
4,Cade Cunningham,20.1,3.50,6.20,0.70,5.50,1.60,0.80,4.00,27.0,35.40,6.50,14.80,0.44,0.52,4.90,5.80,0.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,Mylik Wilson,2.5,1.70,1.70,0.30,1.40,1.20,0.50,0.80,32.0,15.20,1.00,2.20,0.44,0.45,0.50,0.80,0.63
921,Kevin Noreen,2.5,0.75,3.55,1.65,1.90,0.45,0.40,0.70,32.0,16.85,0.75,1.55,0.51,0.55,0.80,1.20,0.67
934,Hans Brase,2.4,0.60,3.80,0.70,3.10,0.20,0.40,0.70,21.0,15.40,0.80,2.60,0.30,0.34,0.60,0.80,0.76
957,Philip Jurick,2.2,0.50,5.35,1.80,3.55,0.20,1.40,0.90,30.0,16.90,1.00,1.65,0.64,0.64,0.20,0.55,0.37


In [24]:
# Write the filtered career averages to disk. index=False is not passed, so
# the DataFrame index is written as the first (unnamed) CSV column.
career_avg_df.to_csv('big_12_career_avg_copy.csv')