In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

In [2]:
def url_to_soup(url):
    '''
    Downloads a page and parses it into a bs4 object

    Args:
        url (str): address of the page to fetch

    Returns:
        bs4 object : the response body parsed with the 'lxml' parser
    '''
    # The context manager closes the HTTP response as soon as parsing is done,
    # even if parsing raises. The soup is built inside the `with` body, while
    # the response is still open for reading.
    with urlopen(url) as response:
        return BeautifulSoup(response, 'lxml')
    
def get_row_data(soup, start):
    '''
    Collects the text of every <td> cell, one list per table row

    Args:
        soup (bs4 object): parsed document that is searched for <tr> rows
        start (int): index of the first <tr> to keep; earlier rows are skipped

    Returns:
        rows_data : list of lists of strings, one inner list per kept row in
                    document order (a row without <td> cells gives an empty list)
    '''
    rows_data = []
    for row in soup.findAll('tr')[start:]:
        cells = row.findAll('td')
        rows_data.append([cell.getText() for cell in cells])
    return rows_data
    
def get_seasons(corpus):
    '''
    Takes a text corpus, returns a list of seasons found in the form yyyy-yy

    Args:
        corpus (str): text to scan

    Returns:
        list of strings : every non-overlapping yyyy-yy match, in order of
                          appearance; empty when nothing matches
    '''
    # Exactly four digits, a hyphen, then exactly two digits. The lookarounds
    # reject matches embedded in longer digit runs, so win-loss records
    # ("11-1") and full year ranges ("2020-2021") are not reported as seasons.
    return re.findall(r'(?<!\d)\d{4}-\d{2}(?!\d)', corpus)

def get_table_headers(soup, l, index):
    '''
    Reads the column labels from one of the first <tr> rows of a document

    Args:
        soup (bs4 object): parsed document that is searched for <tr> rows
        l (int): maximum number of <tr> rows to fetch (passed on as `limit`)
        index (int): position of the header row within the fetched rows

    Returns:
        headers : list of strings which represent column labels

    Raises:
        IndexError: if `index` is outside the range of the fetched rows
    '''
    candidate_rows = soup.findAll('tr', limit=l)
    header_row = candidate_rows[index]

    headers = []
    for th in header_row.findAll('th'):
        headers.append(th.getText())
    return headers

In [3]:
# Fetch the page once and reuse the parsed tree for both labels and values.
url = 'https://www.basketball-reference.com/friv/mvp.html'
soup = url_to_soup(url)

# Labels are read from the first <tr>; values from every <tr> after it.
headers = get_table_headers(soup, l=2, index=0)
rows_data = get_row_data(soup, start=1)

# The first label ('Rk') is skipped: get_row_data collects <td> cells only,
# and the rows apparently carry no <td> value for that column.
df_shortlist = pd.DataFrame(data=rows_data, columns=headers[1:])
df_shortlist

Unnamed: 0,Player,Team,W,L,W/L%,G,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Unnamed: 20,Prob%
0,Stephen Curry,GSW,11,1,0.917,12,12,33.8,9.2,20.3,...,5.5,6.3,6.4,1.6,0.7,3.3,1.5,28.4,,41.4%
1,Nikola Jokić,DEN,8,4,0.667,11,11,32.2,9.8,16.6,...,11.2,14.1,6.1,1.5,0.9,3.5,3.2,25.1,,38.6%
2,Kevin Durant,BRK,9,4,0.692,13,13,34.7,11.2,18.9,...,7.9,8.4,5.2,0.5,0.6,3.4,1.2,29.4,,7.9%
3,Jimmy Butler,MIA,7,5,0.583,11,11,33.0,8.0,15.2,...,3.7,5.5,5.1,2.1,0.4,1.9,2.0,23.6,,2.5%
4,Montrezl Harrell,WAS,8,3,0.727,11,2,29.6,6.5,10.1,...,6.8,9.3,1.9,0.6,1.2,1.4,2.4,18.1,,2.4%
5,Chris Paul,PHO,8,3,0.727,11,11,31.9,5.1,10.4,...,3.8,4.4,11.0,2.4,0.5,2.4,2.0,14.1,,2.2%
6,Giannis Antetokounmpo,MIL,6,7,0.462,12,12,32.9,9.5,19.2,...,9.9,11.8,6.0,1.1,1.8,3.0,3.0,26.6,,1.5%
7,Rudy Gobert,UTA,8,4,0.667,12,12,30.8,5.1,7.1,...,12.3,15.5,0.9,1.0,1.8,2.3,2.8,15.0,,1.3%
8,James Harden,BRK,9,4,0.692,13,13,34.2,6.0,14.0,...,6.7,7.8,9.0,1.1,0.8,4.8,2.4,19.8,,1.2%
9,Draymond Green,GSW,11,1,0.917,12,12,29.5,3.1,5.9,...,6.6,8.3,7.2,1.3,0.8,2.9,2.7,7.9,,1.0%


In [4]:
# Column names wanted as model inputs, grouped for readability.
features = [
    'Age', 'G', 'MP',
    'PTS', 'TRB', 'AST', 'STL', 'BLK',
    'FG%', '3P%', 'FT%',
    'WS', 'WS/48',
    'W/L%', 'SRS',
]

In [5]:
# Display the full list of column labels scraped from the page.
headers

['Rk',
 'Player',
 'Team',
 'W',
 'L',
 'W/L%',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 '\xa0',
 'Prob%']

In [6]:
# Wanted features that have no same-named column among the scraped labels.
set(features).difference(headers)

{'Age', 'SRS', 'WS', 'WS/48'}