In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
import os


def get_players_and_links(search_link, pages, timeout=30):
    """Scrape player rows and price-page links from a paginated listing.

    Parameters
    ----------
    search_link : str
        Listing URL ending in ``page=``; the page number is appended to it.
    pages : int
        Exclusive upper bound on the page number: pages ``1 .. pages - 1``
        are requested, so ``pages=1`` requests nothing and returns two
        empty lists.
        NOTE(review): this may be an off-by-one if ``pages`` is meant to be
        the number of pages -- confirm against the site before changing it.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    tuple of (list, list)
        ``players`` holds one list of cell texts per table row (the first
        row of every table is skipped as the header); ``links`` holds the
        ``href`` of every anchor whose href matches ``19/players/prices``.
        The two lists are checked to have equal length after each page.

    Raises
    ------
    requests.HTTPError
        If a listing page answers with an error status.
    ValueError
        If a page contains no ``<table class="listing">``.
    AssertionError
        If the number of collected links and rows diverge.
    """
    players = []
    links = []
    # Compile once; the same pattern is applied to every page.
    price_href = re.compile("19/players/prices")

    for page in range(1, pages):
        url = ''.join([search_link, str(page)])
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        table = soup.find('table', attrs={'class': 'listing'})
        if table is None:
            # Fail with the offending URL instead of an AttributeError on None.
            raise ValueError(f"no 'listing' table found at {url}")

        # Skip the first <tr>, which is the header row.
        for table_row in table.find_all('tr')[1:]:
            players.append([cell.text for cell in table_row.find_all('td')])

        links.extend(anchor.get('href')
                     for anchor in soup.find_all('a', attrs={'href': price_href}))

        # Rows and links are paired positionally by the caller.
        assert len(links) == len(players), (
            f"{len(links)} links vs {len(players)} player rows after {url}")

    return players, links


class PlayerItem:
    """One scraped listing row together with the URL of its price page.

    ``item`` is the list of cell texts for a table row and ``link`` is the
    site-relative href of the player's price page. ``date`` starts as
    ``None`` and is meant to be filled in by the caller.
    """

    def __init__(self, item, link):
        # item[1] carries several values separated by '\n '; the second
        # piece is used as the name and the last piece as the program.
        name_cell = item[1].split('\n ')
        self.name = name_cell[1].strip()
        self.program = name_cell[-1].strip()
        self.ovr = int(item[2])
        self.pos = item[3]
        self.date = None
        self.link = ''.join(('https://www.muthead.com', link))

    def string(self):
        """Return a four-line, human-readable summary of the player."""
        template = "Player: {}\nProgram: {}\nOVR: {}\nDate: {}"
        return template.format(self.name, self.program, self.ovr, self.date)

    def as_dataframe(self):
        """Return the player as a single-row DataFrame."""
        record = {
            'name': self.name,
            'program': self.program,
            'date': self.date,
            'link': self.link,
            'ovr': self.ovr,
            'pos': self.pos,
        }
        return pd.DataFrame([record])
    
def get_release_date(link, timeout=30):
    """Fetch a player's price page and parse the first chart date from it.

    The date is taken from the arguments of the first ``Date.UTC(...)`` call
    that follows ``series:`` in the page's second ``text/javascript`` script.

    Parameters
    ----------
    link : str
        Absolute URL of the player's price page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list
        ``[year, month, day]`` as ints on success. The values are the raw
        ``Date.UTC`` arguments; JavaScript's ``Date.UTC`` counts months from
        zero, so a month of 0 means January.
        If parsing fails, ``(exception)`` is printed and the list of
        ``<script>`` tags found on the page is returned instead, so callers
        can tell the cases apart by the type of the first element.
    """
    # Without a timeout a stalled connection would hang the scraping loop.
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')
    scripts = soup.find_all('script', type="text/javascript")
    try:
        chart_js = scripts[1].string.split('series:')[1]
        utc_args = chart_js.split('Date.UTC')[1].split(')')[0].lstrip('()')
        date = [int(part) for part in utc_args.split(', ')]
    except Exception:
        # Best effort: keep going on unparseable pages, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        date = scripts
        print('(exception)')
    # year, month, day = date
    return date


def get_and_save_player_dates(goal_program, date):
    """Scrape every player of a program, fetch release dates, save a CSV.

    Reads the module-level ``pages_map`` and ``link_map`` dictionaries, which
    must contain ``goal_program`` as a key.

    Parameters
    ----------
    goal_program : str
        Key into ``pages_map`` / ``link_map`` selecting the program to scrape.
    date : str
        Label used as the prefix of the output file ``{date}_{goal_program}.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped listing entry with the ``PlayerItem`` fields plus:
        ``date_dtype`` (string form of the type of the first element of
        ``date``), ``dated`` (1 when that type is ``int``, else 0), ``full``
        (1 on the first row of each distinct ``name``, else 0) and ``index``
        (copy of the row index). The same frame is written to the CSV.

    Raises
    ------
    KeyError
        If ``goal_program`` is missing from ``pages_map`` or ``link_map``.
    ValueError
        If the listing yields no players at all.
    """

    def _first_element_type(value):
        # The fallback from get_release_date may be an empty collection;
        # indexing it blindly would abort after all the scraping is done.
        try:
            return type(value[0])
        except (IndexError, KeyError, TypeError):
            return type(None)

    n_pages = pages_map[goal_program]
    link = f'https://www.muthead.com/19/players?filter-market=4&filter-program-19={link_map[goal_program]}&page='
    players, links = get_players_and_links(link, n_pages)

    # create list of PlayerItem objects to store data including dates
    player_items = [PlayerItem(item=player, link=link_) for player, link_ in zip(players, links)]
    if not player_items:
        # pd.concat([]) would raise an unhelpful "No objects to concatenate".
        raise ValueError(
            f"no players scraped for program {goal_program!r} "
            f"(pages_map value {n_pages!r}); nothing to save")

    csv_path = f'{date}_{goal_program}.csv'
    if os.path.isfile(csv_path):
        # Placeholder: the previous result is loaded but not used yet.
        previous_df = pd.read_csv(csv_path, index_col=0)
        print('reading previous file... (not doing anything yet)')

    # TODO: put full filtering here to limit unnecessary scraping

    # Fetch release dates of players, pausing 5 seconds before each request.
    for p in player_items:
        time.sleep(5)
        p.date = get_release_date(p.link)

    # Convert to dataframe and save as csv
    df = pd.concat([p.as_dataframe() for p in player_items], ignore_index=True)

    # A parsed date is a list of ints; anything else marks a failed parse.
    df['date_dtype'] = df['date'].apply(_first_element_type).astype(str)
    df['dated'] = df['date_dtype'].eq("<class 'int'>").astype(int)

    # Flag the first row of every distinct name (linear time, no list lookups).
    df['full'] = (~df['name'].duplicated()).astype(int)
    df['index'] = df.index

    df.to_csv(csv_path)

    return df

In [7]:
# Value passed as `pages` for each program.
# NOTE: get_players_and_links iterates range(1, pages), so a value of 1
# fetches no page at all -- confirm these numbers before relying on them.
# TODO: derive the page count from the site instead of hard-coding it.
pages_map = {'L': 16, 'UL': 9, 'Ghosts': 1, 'SBP': 1, 'Gauntlet': 1}
# Value of the `filter-program-19` URL parameter for each program
# (e.g. Legends: 227, UL: 265).
link_map = {'L': 227, 'UL': 265, 'Ghosts': 256, 'SBP': 262, 'Gauntlet': 224}

# INPUTS
goal_program = 'L'
date = 'apr27'

# Pass the `date` input itself so that editing it above actually takes effect.
df = get_and_save_player_dates(goal_program=goal_program, date=date)

In [8]:
# Preview the first rows of the frame returned by get_and_save_player_dates.
df.head()

Unnamed: 0,date,link,name,ovr,pos,program,date_dtype,dated,full,index
0,"[2019, 0, 26]",https://www.muthead.com/19/players/prices/5238...,Tedy Bruschi,96,MLB,Legends,<class 'int'>,1,1,0
1,"[2019, 0, 26]",https://www.muthead.com/19/players/prices/5237...,Kam Chancellor,96,SS,Legends,<class 'int'>,1,1,1
2,"[2019, 0, 26]",https://www.muthead.com/19/players/prices/5237...,Larry Fitzgerald,96,WR,Legends,<class 'int'>,1,1,2
3,"[2019, 0, 12]",https://www.muthead.com/19/players/prices/5235...,Chris Johnson,96,HB,Legends,<class 'int'>,1,1,3
4,"[2019, 0, 19]",https://www.muthead.com/19/players/prices/5237...,Ed Too Tall Jones,96,LE,Legends,<class 'int'>,1,1,4


In [9]:
# Keep only the rows flagged full == 1 (the first row of each player name).
full = df.loc[df['full'].eq(1)]
# Count how many of those rows are dated (1) versus not dated (0).
full['dated'].value_counts()

1    84
Name: dated, dtype: int64

In [11]:
# Write the full == 1 subset to its own CSV in the working directory.
full.to_csv('april27_full_legend.csv')