In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import plot
import unittest
from urllib.request import urlopen

### WNBA Salary Data Webscraping
**100 highest paid players for 2020** <br>
*(data from spotrac.com)*

In [4]:
# Scrape the spotrac WNBA average-salary rankings and build one row per player
# (name, league, team, position, age, average salary).
URL = 'https://www.spotrac.com/wnba/rankings/average/'
response = requests.get(URL, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
page = response.text
soup = BeautifulSoup(page, 'lxml')

# The raw cell text carries padding (the unit tests in this notebook expect
# '  WAS' and '$224,870  '), so trim it before it reaches the DataFrame.
names = [tag.get_text().strip() for tag in soup.find_all('a', class_='team-name')]
teams = [tag.get_text().strip() for tag in soup.find_all('div', class_='rank-position')]
avg_salaries = [tag.get_text().strip() for tag in soup.find_all('span', class_='info')]

# The 'center small' cells hold both the position and the age of every player.
pos = [cell.get_text() for cell in soup.find_all('td', class_='center small')]
pos2 = [text.replace(' ', '').replace('\n', '') for text in pos]

# Manual patch kept from the original analysis: cell 136 is forced to 'F'.
# NOTE(review): presumably that cell was blank on the scraped snapshot - confirm
# it still targets the intended player whenever the page changes.
pos2[136] = 'F'

# Ages are the all-digit cells; everything else is a position. (Splitting on
# the parity of the string length would file a blank or two-letter position
# under "age" and shift every following row.)
age = [value for value in pos2 if value.isdigit()]
positions = [value for value in pos2 if not value.isdigit()]

wnba = 'WNBA'

# Every column must describe the same players; report the lengths when the page
# layout no longer matches instead of relying on a generic pandas error.
column_lengths = {
    'Player': len(names),
    'Team': len(teams),
    'Position': len(positions),
    'Age': len(age),
    'Avg_Salary': len(avg_salaries),
}
if len(set(column_lengths.values())) > 1:
    raise ValueError('Scraped WNBA columns differ in length: %s' % column_lengths)

wnba_salaries = pd.DataFrame(
    {'Player': names,
     'League': wnba,
     'Team': teams,
     'Position': positions,
     'Age': age,
     'Avg_Salary': avg_salaries
    })
wnba_salaries

Unnamed: 0,Player,League,Team,Position,Age,Avg_Salary
0,Skylar Diggins-Smith,WNBA,PHX,G,30,"$224,870"
1,Elena Delle Donne,WNBA,WAS,F,31,"$224,870"
2,DeWanna Bonner,WNBA,CON,F,33,"$224,870"
3,Diana Taurasi,WNBA,PHX,G,38,"$224,772"
4,Natasha Howard,WNBA,NY,F,29,"$224,675"
...,...,...,...,...,...,...
95,Seimone Augustus,WNBA,LA,G,37,"$70,040"
96,Nia Coffey,WNBA,LA,F,25,"$70,040"
97,Bria Holmes,WNBA,LA,G,27,"$70,040"
98,Marine Johannes,WNBA,NY,G,26,"$69,020"


### WNBA Salary Unit Testing

In [4]:
class Test_WNBA_Salary(unittest.TestCase):
  bs = None
  def setUpClass():
    url3 = 'https://www.spotrac.com/wnba/rankings/average/'
    Test_WNBA_Salary.bs = BeautifulSoup(urlopen(url3), 'html.parser')

  #test page title
  def test_wtitle(self):
    wtitle = Test_WNBA_Salary.bs.find('h1').get_text()
    self.assertEqual(' WNBA Salary Rankings ', wtitle)

  #test player name
  def test_wname(self):
    wplayerName = Test_WNBA_Salary.bs.find('a',{'class':'team-name'}).get_text()
    self.assertEqual('Elena Delle Donne', wplayerName)
 
  #test team name
  def test_wteam(self):
    wteam = Test_WNBA_Salary.bs.find('div',{'class':'rank-position'}).get_text()
    self.assertEqual('  WAS', wteam)
 
  #test player salary
  def test_wsalary(self):
    wsalary = Test_WNBA_Salary.bs.find('span',{'class':'info'}).get_text()
    self.assertEqual('$224,870  ', wsalary)

  #test player position
  def test_wposition(self):
    wposition = Test_WNBA_Salary.bs.find('td',{'class':'center'}).get_text()
    self.assertEqual('1', wposition)

if __name__ == '__main__':
  unittest.main(argv=[''], verbosity=2)

test_wname (__main__.Test_WNBA_Salary) ... ok
test_wposition (__main__.Test_WNBA_Salary) ... ok
test_wsalary (__main__.Test_WNBA_Salary) ... ok
test_wteam (__main__.Test_WNBA_Salary) ... ok
test_wtitle (__main__.Test_WNBA_Salary) ... ok

----------------------------------------------------------------------
Ran 5 tests in 0.725s

OK


SystemExit: False

### NBA Salary Data Webscraping
**100 highest paid players for 2020** <br>
*(data from spotrac.com)*

In [6]:
# Scrape the spotrac NBA average-salary rankings and build one row per player
# (name, league, team, position, age, average salary).
URL = 'https://www.spotrac.com/nba/rankings/average/'
response = requests.get(URL, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
page = response.text
soup = BeautifulSoup(page, 'lxml')

# The raw cell text carries padding (the unit tests in this notebook expect
# '  HOU', '$42,782,880  ' and ' 30 '), so trim it before building the frame.
# This also keeps 'Age' in the same whitespace-free form as the WNBA frame.
men_names = [tag.get_text().strip() for tag in soup.find_all('a', class_='team-name')]
men_teams = [tag.get_text().strip() for tag in soup.find_all('div', class_='rank-position')]
men_avg_salaries = [tag.get_text().strip() for tag in soup.find_all('span', class_='info')]
men_age = [cell.get_text().strip() for cell in soup.find_all('td', class_='center xs-hide')]

# Every 'center' cell is collected; the all-digit and empty ones are set aside
# below so that only the position labels remain.
men_pos = [cell.get_text() for cell in soup.find_all('td', class_='center')]
men_pos2 = [text.replace(' ', '').replace('\n', '') for text in men_pos]

digits = []
men_positions = []
for value in men_pos2:
  if value.isdigit() or value == '':
    digits.append(value)
  else:
    men_positions.append(value)

nba = 'NBA'

# Every column must describe the same players; report the lengths when the page
# layout no longer matches instead of relying on a generic pandas error.
column_lengths = {
    'Player': len(men_names),
    'Team': len(men_teams),
    'Position': len(men_positions),
    'Age': len(men_age),
    'Avg_Salary': len(men_avg_salaries),
}
if len(set(column_lengths.values())) > 1:
    raise ValueError('Scraped NBA columns differ in length: %s' % column_lengths)

nba_salaries = pd.DataFrame(
    {'Player': men_names,
     'League' : nba,
     'Team': men_teams,
     'Position': men_positions,
     'Age': men_age,
     'Avg_Salary': men_avg_salaries
    })
nba_salaries

Unnamed: 0,Player,League,Team,Position,Age,Avg_Salary
0,John Wall,NBA,HOU,PG,30,"$42,782,880"
1,James Harden,NBA,BKN,SG,31,"$42,782,880"
2,Russell Westbrook,NBA,WAS,PG,31,"$41,358,814"
3,Kevin Durant,NBA,BKN,SF,32,"$41,063,925"
4,Stephen Curry,NBA,GSW,PG,32,"$40,231,758"
...,...,...,...,...,...,...
95,Trevor Ariza,NBA,MIA,SF,35,"$12,500,000"
96,Kelly Olynyk,NBA,HOU,PF,29,"$12,500,000"
97,Cory Joseph,NBA,DET,PG,29,"$12,400,000"
98,Jusuf Nurkic,NBA,POR,C,26,"$12,000,000"


### NBA Salary Unit Testing

In [5]:
class Test_NBA_Salary(unittest.TestCase):
  bs = None
  def setUpClass():
    url2 = 'https://www.spotrac.com/nba/rankings/average/'
    Test_NBA_Salary.bs = BeautifulSoup(urlopen(url2), 'html.parser')

  #test page title
  def test_title(self):
    title = Test_NBA_Salary.bs.find('h1').get_text()
    self.assertEqual('NBA Financial Rankings', title)

  #test player name
  def test_name(self):
    playerName = Test_NBA_Salary.bs.find('a',{'class':'team-name'}).get_text()
    self.assertEqual('John Wall', playerName)
  
  #test team name
  def test_team(self):
    team = Test_NBA_Salary.bs.find('div',{'class':'rank-position'}).get_text()
    self.assertEqual('  HOU', team)

  #test player salary
  def test_salary(self):
    salary = Test_NBA_Salary.bs.find('span',{'class':'info'}).get_text()
    self.assertEqual('$42,782,880  ', salary)

  #test player age
  def test_age(self):
    age = Test_NBA_Salary.bs.find('td',{'class':'center xs-hide'}).get_text()
    self.assertEqual(' 30 ', age)

  #test player position
  def test_position(self):
    position = Test_NBA_Salary.bs.find('td',{'class':'center'}).get_text()
    self.assertEqual('1', position)

if __name__ == '__main__':
  unittest.main(argv=[''], verbosity=2)


test_age (__main__.Test_NBA_Salary) ... ok
test_name (__main__.Test_NBA_Salary) ... ok
test_position (__main__.Test_NBA_Salary) ... ok
test_salary (__main__.Test_NBA_Salary) ... ok
test_team (__main__.Test_NBA_Salary) ... ok
test_title (__main__.Test_NBA_Salary) ... ok
test_wname (__main__.Test_WNBA_Salary) ... ok
test_wposition (__main__.Test_WNBA_Salary) ... ok
test_wsalary (__main__.Test_WNBA_Salary) ... ok
test_wteam (__main__.Test_WNBA_Salary) ... ok
test_wtitle (__main__.Test_WNBA_Salary) ... ok

----------------------------------------------------------------------
Ran 11 tests in 1.506s

OK


SystemExit: False

### NBA Stats Data Webscraping
**Per-game avgs for 2020-21 season** <br>
*(data from basketball-reference.com)*

In [7]:
# Scrape the 2020-21 per-game player table from basketball-reference.com and
# reduce it to one row per player, ready to merge with the salary data.
url = 'https://www.basketball-reference.com/leagues/NBA_2021_per_game.html'
html_doc = requests.get(url, timeout=30)
html_doc.raise_for_status()  # stop on an HTTP error instead of parsing an error page

#parse the html from site:
parsed_html = BeautifulSoup(html_doc.content, 'html.parser')

#extract specific table we are interested in (per-game stats for each player):
table = parsed_html.find(id='per_game_stats')
if table is None:
    raise ValueError("No element with id 'per_game_stats' found at " + url)

##Header:
#html 'thead' element contains all header-related data; each 'th' is one column name
table_header = table.find('thead')
header_elements = table_header.find_all('th')
headers = [header.get_text().strip() for header in header_elements]

##Body:
#html 'tbody' element contains all body-related data; each 'tr' is one table row
table_body = table.find('tbody')
body_rows = table_body.find_all('tr')

rows = []
for body_row in body_rows:
    cells = body_row.find_all('td')
    if not cells:
        # A row without any 'td' carries no stats (presumably a header row
        # repeated inside the body - confirm). Keeping it would add a ragged,
        # mostly-null record to the frame.
        continue
    row_header = body_row.find('th').get_text() #the row's leading 'th' cell (first column)
    rows.append([row_header] + [cell.get_text() for cell in cells])

nba_stats = pd.DataFrame(rows, columns = headers)

#Some players (who switched teams mid-year) are duplicated - have a total record and
#individual records for each team they played for that year.
#drop_duplicates keeps the FIRST row per player, so this relies on the total
#record being listed before the per-team records.
nba_stats = nba_stats.drop_duplicates(['Player'])
nba_stats = nba_stats.drop(columns= ['Rk', 'Pos', 'Age', 'Tm', 'eFG%'])

#Fold accented letters to their base letter for merging with salary data (where
#there are no accents): NFKD splits e.g. 'ć' into 'c' + a combining mark, and the
#combining marks (U+0300-U+036F) are then removed. This covers every accented
#letter that decomposes, not only a hand-picked list.
nba_stats['Player'] = (
    nba_stats['Player']
    .str.normalize('NFKD')
    .str.replace(r'[\u0300-\u036f]', '', regex=True)
)

nba_stats

Unnamed: 0,Player,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,58,3,11.9,1.9,3.6,.538,0.0,0.0,,...,.515,1.2,2.2,3.4,0.5,0.3,0.5,0.7,1.5,4.8
1,Jaylen Adams,7,0,2.6,0.1,1.1,.125,0.0,0.3,.000,...,,0.0,0.4,0.4,0.3,0.0,0.0,0.0,0.1,0.3
2,Steven Adams,58,58,27.7,3.3,5.3,.614,0.0,0.1,.000,...,.444,3.7,5.2,8.9,1.9,0.9,0.7,1.3,1.9,7.6
3,Bam Adebayo,59,59,33.5,7.2,12.7,.567,0.0,0.1,.250,...,.797,2.3,6.8,9.1,5.4,1.2,1.0,2.7,2.2,18.9
4,LaMarcus Aldridge,26,23,25.9,5.4,11.4,.473,1.2,3.1,.388,...,.872,0.7,3.8,4.5,1.9,0.4,1.1,1.0,1.8,13.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,Delon Wright,57,33,27.2,3.6,7.9,.460,0.9,2.5,.371,...,.802,1.0,3.2,4.2,4.3,1.6,0.5,1.2,1.2,10.0
714,Thaddeus Young,62,19,24.2,5.5,9.6,.571,0.2,0.6,.263,...,.619,2.4,3.7,6.1,4.4,1.1,0.5,2.0,2.3,12.1
715,Trae Young,59,59,34.1,7.7,17.7,.436,2.2,6.3,.353,...,.882,0.6,3.2,3.9,9.5,0.9,0.2,4.2,1.8,25.4
716,Cody Zeller,42,20,20.3,3.6,6.6,.550,0.1,0.7,.143,...,.700,2.5,4.3,6.7,1.8,0.6,0.4,1.0,2.4,9.0


### NBA Stats Unit Testing

In [None]:
class Test_NBA_Stats(unittest.TestCase):
    bs = None
    def setUpClass():
      url = 'https://www.basketball-reference.com/leagues/NBA_2021_per_game.html'
      Test_NBA_Stats.bs = BeautifulSoup(urlopen(url), 'html.parser')

    #test page title
    def test_page_title(self):
      pageTitle = Test_NBA_Stats.bs.find('h1').get_text()
      self.assertEqual('\n2020-21 NBA Player Stats: Per Game\n', pageTitle);

    #test player name from table
    def test_player_name(self):
      tableName = Test_NBA_Stats.bs.find('td',{'class':'left '}).get_text()
      self.assertEqual('Precious Achiuwa', tableName)

    #test player position from table
    def test_table_position(self):
      tablePosition = Test_NBA_Stats.bs.find('td',{'class':'center'}).get_text()
      self.assertEqual('PF', tablePosition)

if __name__ == '__main__':
  unittest.main(argv=[''], verbosity=2)

### WNBA Stats data upload (CSV)
**Per-game avgs for 2020 season** <br>
*(data from basketball-reference.com)*

In [9]:
# Load the WNBA per-game stats export. The file is looked up next to the
# notebook first; the original author's absolute location is kept as a fallback
# so existing setups keep working.
WNBA_STATS_CSV_CANDIDATES = (
    'wnba_stats.csv',
    '/Users/allie/Documents/cs5010/wnba_stats.csv',
)
for csv_path in WNBA_STATS_CSV_CANDIDATES:
    try:
        wnba_stats = pd.read_csv(csv_path) #Does not have DREB columns (must create)
        break
    except FileNotFoundError:
        continue
else:
    raise FileNotFoundError(
        'wnba_stats.csv not found; looked in: ' + ', '.join(WNBA_STATS_CSV_CANDIDATES))

#Keep one record per player. drop_duplicates keeps the FIRST row per player, so
#for players that were on multiple teams this relies on the "total" entry being
#listed before the per-team entries.
wnba_stats = wnba_stats.drop_duplicates(['Player'])
# NOTE(review): the merged table later in this notebook shows WNBA 'MP' values
# such as 90 and 374, which look like season totals rather than per-game minutes.
# Confirm whether 'MP.1' is the per-game column that should be kept instead.
wnba_stats = wnba_stats.drop(columns= ['Team', 'Pos', 'G.1', 'MP.1'])
wnba_stats['DRB'] = wnba_stats['TRB'] - wnba_stats['ORB']

#Fold accented letters to their base letter for merging with salary data (where
#there are no accents): NFKD splits e.g. 'ć' into 'c' + a combining mark, and the
#combining marks (U+0300-U+036F) are then removed. This covers every accented
#letter that decomposes, not only a hand-picked list.
wnba_stats['Player'] = (
    wnba_stats['Player']
    .str.normalize('NFKD')
    .str.replace(r'[\u0300-\u036f]', '', regex=True)
)

wnba_stats

FileNotFoundError: [Errno 2] No such file or directory: '/Users/allie/Documents/cs5010/wnba_stats.csv'

## Combine Stats and Salary Data for NBA 
**Key = "Player"**

In [10]:
# Join per-game stats with salaries. This is an inner join on the exact player
# name, so any player whose name is spelled differently in the two sources is
# silently left out of nba_df.
nba_df = pd.merge(nba_stats, nba_salaries, on= 'Player')

# Collapse the five NBA positions into the coarser G / F / C labels.
nba_df['Position'] = nba_df['Position'].replace('PF','F', regex=True).replace('SF','F', regex=True)
nba_df['Position'] = nba_df['Position'].replace('PG','G', regex=True).replace('SG','G', regex=True)

# Salary text -> float. Remove '$', thousands separators and any whitespace
# wherever they occur; str.strip('$') alone only works when '$' is the very
# first or last character, which padded scraped cells do not guarantee.
nba_df['salary_float'] = (
    nba_df['Avg_Salary'].str.replace(r'[$,\s]', '', regex=True).astype(float)
)

# Denominator for the salary ratio.
# NOTE(review): presumably total league revenue in USD - source and season are
# not recorded in this notebook; confirm.
nba_revenue = 7.4e9
nba_df['salary_ratio'] = (nba_df['salary_float'] / nba_revenue).round(5)
nba_df


Unnamed: 0,Player,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,TOV,PF,PTS,League,Team,Position,Age,Avg_Salary,salary_float,salary_ratio
0,Steven Adams,58,58,27.7,3.3,5.3,.614,0.0,0.1,.000,...,1.3,1.9,7.6,NBA,NOP,C,27,"$25,000,000",25000000.0,0.00338
1,Giannis Antetokounmpo,56,56,33.1,10.3,18.3,.562,1.2,3.7,.311,...,3.6,2.8,28.4,NBA,MIL,F,25,"$25,000,000",25000000.0,0.00338
2,Trevor Ariza,25,22,27.2,3.2,7.7,.420,1.7,4.7,.359,...,0.8,1.8,9.3,NBA,MIA,F,35,"$12,500,000",12500000.0,0.00169
3,Marvin Bagley III,41,40,26.1,5.9,11.7,.504,0.9,2.6,.349,...,1.4,2.4,14.4,NBA,SAC,F,21,"$12,724,946",12724946.0,0.00172
4,Harrison Barnes,58,58,36.2,5.5,11.1,.497,1.7,4.4,.391,...,1.6,1.3,16.1,NBA,SAC,F,28,"$21,250,000",21250000.0,0.00287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Andrew Wiggins,67,67,33.2,6.9,14.7,.472,2.0,5.2,.382,...,1.8,2.2,18.3,NBA,GSW,F,25,"$29,542,010",29542010.0,0.00399
91,Justise Winslow,24,0,19.3,2.7,8.0,.340,0.3,2.0,.125,...,1.4,1.8,6.4,NBA,MEM,F,24,"$13,000,000",13000000.0,0.00176
92,Christian Wood,41,41,32.3,8.0,15.6,.514,1.9,5.0,.374,...,2.0,2.1,21.0,NBA,HOU,F,25,"$13,666,667",13666667.0,0.00185
93,Thaddeus Young,62,19,24.2,5.5,9.6,.571,0.2,0.6,.263,...,2.0,2.3,12.1,NBA,CHI,F,32,"$14,545,000",14545000.0,0.00197


## Combine Stats and Salary Data for WNBA 
**Key = "Player"**

In [11]:
# Join salaries with per-game stats. This is an inner join on the exact player
# name, so any player whose name is spelled differently in the two sources is
# silently left out of wnba_df.
wnba_df = pd.merge(wnba_salaries, wnba_stats, on='Player')

# Salary text -> float. Remove '$', thousands separators and any whitespace
# wherever they occur; str.strip('$') alone only works when '$' is the very
# first or last character, which padded scraped cells do not guarantee.
wnba_df['salary_float'] = (
    wnba_df['Avg_Salary'].str.replace(r'[$,\s]', '', regex=True).astype(float)
)

# Denominator for the salary ratio.
# NOTE(review): presumably total league revenue in USD - source and season are
# not recorded in this notebook; confirm.
wnba_revenue = 6.0e7
wnba_df['salary_ratio'] = (wnba_df['salary_float'] / wnba_revenue).round(5)

wnba_df

NameError: name 'wnba_salaries' is not defined

## Merge NBA and WNBA Data
Once stats were combined with salaries, we were left with: <br>
70 WNBA Players <br>
95 NBA Players <br> 
**Total of 165 unique players (observations) for exploratory analysis**

In [31]:

# combine NBA & WNBA data (rows are stacked; columns are aligned by name)
all_df = pd.concat([nba_df, wnba_df])

# Order players from highest to lowest salary. A stable sort keeps players with
# identical salaries in their incoming order, so the ranking below is the same
# on every run.
all_df = all_df.sort_values('salary_float', ascending=False, kind='mergesort').reset_index(drop = True)

# Calculated column: share of games played in which the player started
all_df['GS%'] = all_df['GS'].astype(float) / all_df['G'].astype(float)

# Player Ranked in order from Highest to Lowest Salary
all_df['Salary_Rank'] = all_df.index + 1

# Drop any rows with null values. This happens after ranking, so the index and
# Salary_Rank keep gaps where rows were removed.
all_df = all_df.dropna()
all_df

Unnamed: 0,Player,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,PTS,League,Team,Position,Age,Avg_Salary,salary_float,salary_ratio,GS%,Salary_Rank
0,James Harden,42,42,37.1,8.0,17.2,.463,2.8,7.8,.358,...,25.2,NBA,BKN,G,31,"$42,782,880",42782880.0,0.00578,1.000000,1
1,John Wall,40,40,32.2,7.3,18.2,.404,2.0,6.2,.317,...,20.6,NBA,HOU,G,30,"$42,782,880",42782880.0,0.00578,1.000000,2
2,Russell Westbrook,53,53,35.6,8.3,18.8,.439,1.3,4.1,.315,...,21.8,NBA,WAS,G,31,"$41,358,814",41358814.0,0.00559,1.000000,3
3,Kevin Durant,25,22,32.5,9.4,17.3,.544,2.5,5.4,.467,...,27.5,NBA,BKN,F,32,"$41,063,925",41063925.0,0.00555,0.880000,4
4,Stephen Curry,53,53,34.1,10.2,20.9,.489,5.2,12.1,.427,...,31.3,NBA,GSW,G,32,"$40,231,758",40231758.0,0.00544,1.000000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,Theresa Plaisance,13,0,90,0.8,2.2,0.379,0.4,1.3,0.294,...,2.5,WNBA,WAS,F,28,"$70,040",70040.0,0.00117,0.000000,159
159,Lexie Brown,17,13,374,2.2,6.5,0.342,0.8,3.1,0.269,...,6.4,WNBA,CHI,G,26,"$70,040",70040.0,0.00117,0.764706,160
162,Seimone Augustus,21,0,332,2.6,5.2,0.491,0.6,1,0.545,...,5.9,WNBA,LA,G,36,"$70,040",70040.0,0.00117,0.000000,163
163,Nia Coffey,15,1,230,1.1,2.5,0.421,0.5,1.4,0.333,...,2.7,WNBA,LA,F,25,"$70,040",70040.0,0.00117,0.066667,164


In [32]:
# Write the combined table next to the notebook. A relative path keeps the
# notebook runnable on any machine and operating system, unlike a
# user-specific absolute Windows path.
OUTPUT_CSV = 'NBAvsWNBA.csv'
all_df.to_csv(OUTPUT_CSV, index = False)
