In [1]:
import datetime as dt
import os
from pathlib import Path
from random import randint
import re
import requests
import sys
import time

import basketball_reference_web_scraper
from basketball_reference_web_scraper import client
from bs4 import BeautifulSoup
import git
import pandas as pd
from urllib.request import urlopen

# Add the parent directory to the import path so that `src` can be imported below
# (assumes the kernel's working directory is one level below the repository root — TODO confirm)
sys.path.append('../')

import src.util as ut

# Let pandas display up to 500 rows / 500 columns before truncating output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Data directory is resolved relative to the value returned by ut.get_git_root
# (presumably the git repository root — verify against src.util)
GIT_ROOT_DIR = ut.get_git_root(os.getcwd())
DATA_DIR = os.path.join(GIT_ROOT_DIR, 'data')

# Number of seconds get_fantasy_salary sleeps after each scrape
SECONDS_SLEEP = 2

# THIS IS WHAT YOU NEED TO CHANGE
# Date of the games whose salaries are scraped (passed to get_fantasy_salary)
GAME_YEAR = 2020
GAME_MONTH = 8
GAME_DAY = 7

In [2]:
def normalize_position(position):
    """
    Expand a '/'-separated position string into every roster slot it can fill

    Any guard position (PG or SG) also qualifies for the generic G slot, any
    forward position (SF or PF) also qualifies for the generic F slot, and every
    player qualifies for UTIL. Empty tokens are dropped, duplicates are removed
    and the result is returned alphabetically sorted and '/'-joined,
    e.g. 'PG/SG' -> 'G/PG/SG/UTIL'.
    """
    slots = {token for token in position.split('/') if token}  # Skip empty strings

    if slots & {'PG', 'SG'}:
        slots.add('G')
    if slots & {'SF', 'PF'}:
        slots.add('F')
    slots.add('UTIL')

    return '/'.join(sorted(slots))

In [3]:
def get_fantasy_salary(game_year, game_month, game_day):
    """
    Scraping DraftKings salary data from RotoGuru.com

    Downloads the RotoGuru daily page for the requested date, reads the player
    rows of the salary table and returns them as a DataFrame. After the request
    the function sleeps for SECONDS_SLEEP seconds before returning.

    Parameters
    ----------
    game_year, game_month, game_day : int
        Calendar date of the games to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per player with the columns 'date' (datetime.date), 'team'
        (upper-cased), 'position' (output of normalize_position), 'name'
        ('First Last'), 'starter' (True when the second cell contains '^') and
        'salary' (string with '$' and ',' stripped). The frame is empty when the
        page has no salary table or no usable player rows.

    Raises
    ------
    ValueError
        If year/month/day do not form a valid date (raised before any request).
    urllib.error.URLError
        If the page cannot be fetched.
    """
    columns = ['date', 'team', 'position', 'name', 'starter', 'salary']

    # Build the date once, up front, so an invalid date fails before hitting the network
    game_date = dt.date(game_year, game_month, game_day)

    url_roto = "http://rotoguru1.com/cgi-bin/hyday.pl?mon={}&day={}&year={}&game=dk".format(game_month, game_day, game_year)
    print('Scraping salary information for date {}-{}-{}'.format(game_year, str(game_month).rjust(2, '0'), str(game_day).rjust(2, '0')))

    # Close the HTTP response as soon as the page has been parsed
    with urlopen(url_roto) as response:
        soup = BeautifulSoup(response, 'lxml')

    # Check if there were any games on a given date
    # (presumably the salary table is missing on days without games — confirm against the site);
    # a missing body/table yields no rows instead of an AttributeError on None
    soup_body = soup.find('body')
    soup_table = soup_body.find('table', border="0", cellspacing="5") if soup_body is not None else None
    soup_rows = soup_table.find_all('tr') if soup_table is not None else []

    records = []
    for row in soup_rows:
        cells = row.find_all('td')
        link = row.find('a')

        # Player rows need at least five cells (position, starter flag, ..., salary, team);
        # rows whose first cell spans columns are not player rows
        if len(cells) < 5 or cells[0].has_attr('colspan'):
            continue

        # The player name is the text of the row's first link; skip rows without one
        if link is None:
            continue
        link_text = link.get_text()
        if link_text == '':
            continue

        # Names are listed as 'Last, First'; fall back to the raw text when there is no comma
        name_parts = link_text.split(", ")
        if len(name_parts) > 1:
            player = name_parts[1] + ' ' + name_parts[0]
        else:
            player = link_text

        records.append({
            'date': game_date,
            'team': cells[4].get_text().upper(),
            'position': normalize_position(cells[0].get_text()),
            'name': player,
            'starter': '^' in cells[1].get_text(),  # '^' marks a starter
            'salary': re.sub('[$,]', '', cells[3].get_text()),
        })

    # Passing `columns` keeps the schema stable even when no rows were scraped
    df = pd.DataFrame(records, columns=columns)

    # Pause after the request, as the original did, before the caller can issue another one
    time.sleep(SECONDS_SLEEP)
    return df

In [4]:
# Scrape the salary table for the date configured via GAME_YEAR / GAME_MONTH / GAME_DAY
df = get_fantasy_salary(game_year=GAME_YEAR, game_month=GAME_MONTH, game_day=GAME_DAY)

Scraping salary information for date 2020-08-07


In [8]:
# Preview the first five scraped rows
df.head(5)

Unnamed: 0,date,team,position,name,starter,salary
0,2020-08-07,WAS,G/PG/UTIL,Ishmael Smith,True,5000
1,2020-08-07,BKN,G/SG/UTIL,Caris LeVert,True,7100
2,2020-08-07,NOR,G/PG/SG/UTIL,Jrue Holiday,True,9300
3,2020-08-07,OKC,G/PG/UTIL,Chris Paul,True,8100
4,2020-08-07,MEM,G/SG/UTIL,Dillon Brooks,True,5400


In [6]:
# (number of rows, number of columns) of the scraped salary table
df.shape

(181, 6)