# NBA All-Star rosters collection

## Data Scrape - All-Star Rosters

### Import libraries

In [1]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

### Testing scrape

In [2]:
# Testing the scrape on the 2021 NBA All-Star game page
url = 'https://www.basketball-reference.com/allstar/NBA_2021.html'
# requests has no default timeout; set one so the cell fails fast instead of
# hanging indefinitely if the server never responds
rec = requests.get(url, timeout=10)

In [3]:
# HTTP status code of the test request (the scrape below only parses pages that return 200)
rec.status_code

200

In [4]:
# Parse the raw response bytes into a navigable tree using the lxml parser
soup = BeautifulSoup(rec.content, features='lxml')

In [5]:
# The first team's roster is the second <table> on the page (index 1)
page_tables = soup.find_all('table')
table = page_tables[1]

In [6]:
# Sanity check: the <th> cell of the first body row holds the player's name
first_row = table.find('tbody').find_all('tr')[0]
first_row.find('th').text

'Luka Dončić'

In [7]:
# Read the <th> text of every body row in the first team's table, then leave out
# the 'Reserves' divider row. The blank spacer row still comes through as ''.
team_1_rows = table.find('tbody').find_all('tr')
team_1_cells = [row.find('th').text for row in team_1_rows]
[cell for cell in team_1_cells if cell != 'Reserves']

['Luka Dončić',
 'Stephen Curry',
 'Giannis Antetokounmpo',
 'Nikola Jokić',
 'LeBron James',
 '',
 'Chris Paul',
 'Jaylen Brown',
 'Paul George',
 'Damian Lillard',
 'Domantas Sabonis',
 'Rudy Gobert']

In [8]:
# The second team's roster is the third <table> on the page (index 2)
page_tables = soup.find_all('table')
table_2 = page_tables[2]

In [9]:
# Sanity check: the <th> cell of the first body row holds the player's name
first_row_2 = table_2.find('tbody').find_all('tr')[0]
first_row_2.find('th').text

'Kyrie Irving'

In [10]:
# Read the <th> text of every body row in the second team's table, then leave out
# the 'Reserves' divider row. The blank spacer row still comes through as ''.
team_2_rows = table_2.find('tbody').find_all('tr')
team_2_cells = [row.find('th').text for row in team_2_rows]
[cell for cell in team_2_cells if cell != 'Reserves']

['Kyrie Irving',
 'Bradley Beal',
 'Kawhi Leonard',
 'Jayson Tatum',
 'Zion Williamson',
 '',
 'James Harden',
 'Donovan Mitchell',
 'Zach LaVine',
 'Nikola Vučević',
 'Julius Randle',
 'Mike Conley']

### Complete scrape

In [12]:
# Getting All-Star rosters from 2006-2021.
# player_list ends up with one entry per roster row across all games, in page
# order. Blank spacer rows are still collected as '' here; the notebook removes
# that empty name after de-duplication.
player_list = []

for i in range(2006, 2022):
    print(f'Scraping the {i} All-Star game')
    url = f'https://www.basketball-reference.com/allstar/NBA_{i}.html'
    # requests has no default timeout; without one a single unresponsive
    # request would hang the whole scrape
    rec = requests.get(url, timeout=10)
    if rec.status_code == 200:
        soup = BeautifulSoup(rec.content, 'lxml')

        # Locate the tables once per page; indexes 1 and 2 are the two team rosters
        tables = soup.find_all('table')
        for team_table in (tables[1], tables[2]):
            for tr in team_table.find('tbody').find_all('tr'):
                name = tr.find('th').text
                # 'Reserves' is a divider row inside the roster table, not a player
                if name != 'Reserves':
                    player_list.append(name)

    else:
        # Include the status code so a skipped year can be diagnosed from the log
        print(f'website error (status {rec.status_code})')

    # Pause between requests to be polite to the server
    time.sleep(1)
        

Scraping the 2006 All-Star game
Scraping the 2007 All-Star game
Scraping the 2008 All-Star game
Scraping the 2009 All-Star game
Scraping the 2010 All-Star game
Scraping the 2011 All-Star game
Scraping the 2012 All-Star game
Scraping the 2013 All-Star game
Scraping the 2014 All-Star game
Scraping the 2015 All-Star game
Scraping the 2016 All-Star game
Scraping the 2017 All-Star game
Scraping the 2018 All-Star game
Scraping the 2019 All-Star game
Scraping the 2020 All-Star game
Scraping the 2021 All-Star game


In [13]:
# Wrap the scraped names in a single-column DataFrame
all_star_df = pd.DataFrame({'Name': player_list})

In [14]:
# Preview the first rows to confirm the names were collected as expected
all_star_df.head()

Unnamed: 0,Name
0,Dwyane Wade
1,LeBron James
2,Allen Iverson
3,Shaquille O'Neal
4,Vince Carter


In [17]:
# (rows, columns) before de-duplication: one row per scraped roster entry
all_star_df.shape

(415, 1)

In [18]:
# De-duplicate: a player selected for several All-Star games appears once per game.
# Only the first occurrence of each name is kept, and it keeps its original index label.
all_star_df = all_star_df.drop_duplicates()

In [19]:
# Sorting by name brings the blank entry to the top, showing it still has to be removed
names_sorted = all_star_df.sort_values(by='Name')
names_sorted.head()

Unnamed: 0,Name
5,
114,Al Horford
2,Allen Iverson
36,Amar'e Stoudemire
283,Andre Drummond


In [20]:
# Dropping the blank name by value rather than by a hard-coded index label.
# A fixed label is only right for one particular scrape order, and re-running
# the cell would either raise KeyError or remove a real player.
has_name = all_star_df['Name'].str.strip() != ''
# .copy() gives an independent frame so later in-place edits do not warn
all_star_df = all_star_df.loc[has_name].copy()

In [21]:
# Renumber the rows 0..n-1, discarding the gappy labels left by the removals above
all_star_df = all_star_df.reset_index(drop=True)

In [23]:
# Final look at the cleaned, re-indexed roster
all_star_df

Unnamed: 0,Name
0,Dwyane Wade
1,LeBron James
2,Allen Iverson
3,Shaquille O'Neal
4,Vince Carter
...,...
108,Jaylen Brown
109,Zion Williamson
110,Zach LaVine
111,Julius Randle


In [24]:
# Replace the accented spelling of this one name with its plain-ASCII form
is_vucevic = all_star_df['Name'] == 'Nikola Vučević'
all_star_df.loc[is_vucevic, 'Name'] = 'Nikola Vucevic'

No need to clean other players such as Luka Dončić, Nikola Jokić or Goran Dragić, as they will not appear in the college file.

In [25]:
# Persist the cleaned roster; index=True (the pandas default, made explicit here)
# also writes the row index as the first column of the file
all_star_df.to_csv('all_star_rosters.csv', index=True)