In [1]:
import requests
import pandas as pd
import bs4
import re
import json
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Team name normalization dictionary
TEAM_NAME_MAPPING = {
    'sanfrancisco49ers': '49ers',
    'dallascowboys': 'cowboys',
    'philadelphiaeagles': 'eagles',
    'buffalobills': 'bills',
    'newyorkjets': 'jets',
    'newenglandpatriots': 'patriots',
    'baltimoreravens': 'ravens',
    'denverbroncos': 'broncos',
    'pittsburghsteelers': 'steelers',
    'neworleanssaints': 'saints',
    'kansascitychiefs': 'chiefs',
    'miamidolphins': 'dolphins',
    'washingtoncommanders': 'commanders',
    'cincinnatibengals': 'bengals',
    'clevelandbrowns': 'browns',
    'greenbaypackers': 'packers',
    'losangeleschargers': 'chargers',
    'jacksonvillejaguars': 'jaguars',
    'tampabaybuccaneers': 'buccaneers',
    'seattleseahawks': 'seahawks',
    'indianapoliscolts': 'colts',
    'carolinapanthers': 'panthers',
    'tennesseetitans': 'titans',
    'newyorkgiants': 'giants',
    'detroitlions': 'lions',
    'losangelesrams': 'rams',
    'minnesotavikings': 'vikings',
    'atlantafalcons': 'falcons',
    'arizonacardinals': 'cardinals',
    'houstontexans': 'texans',
    'chicagobears': 'bears',
    'lasvegasraiders': 'raiders'
}

In [2]:
def fetch_fantasy_pros_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logging.error(f"Error fetching data from {url}: {e}")
        return None

def parse_fantasy_pros_data(html_content):
    soup = bs4.BeautifulSoup(html_content, "html.parser")
    scripts = soup.find_all("script")
    for script in scripts:
        if script.string:
            match = re.search("var ecrData = {.*};", script.string)
            if match:
                json_data = match.group(0).replace("var ecrData = ", "").replace(";", "")
                return json.loads(json_data)
    logging.warning("No ecrData found in the HTML content")
    return None

def clean_player_data(df):
    df['player_name'] = df['player_name'].str.replace(r"(?:I{1,3}|IV|V?I{0,3})\s*$", " ", regex=True)
    df['player_name'] = df['player_name'].str.replace(r"(Jr|Sr)\s*$", " ", regex=True)
    df['player_name'] = df.player_name.str.lower().replace('\s+', '', regex=True)
    df = df.replace(TEAM_NAME_MAPPING)
    return df

def process_fantasy_pros_data(base_url, rankings_list):
    frames = []
    for page in rankings_list:
        url = f'{base_url}/{page}.php'
        logging.info(f"Fetching data from {url}")
        html_content = fetch_fantasy_pros_data(url)
        if html_content:
            data = parse_fantasy_pros_data(html_content)
            if data:
                df = pd.json_normalize(data["players"])
                frames.append(df)
            else:
                logging.warning(f"No data found for {url}")
        else:
            logging.warning(f"Failed to fetch data from {url}")
    
    if not frames:
        logging.error("No data frames to process")
        return None
    
    final_df = pd.concat(frames)
    final_df = clean_player_data(final_df)
    
    columns_to_drop = ['player_id', 'sportsdata_id', 'player_opponent', 'player_opponent_id', 
                       'player_ecr_delta', 'start_sit_grade', 'player_positions', 'player_eligibility',
                       'rank_min', 'rank_max', 'rank_ave', 'rank_std', 'player_yahoo_positions', 
                       'player_short_name', 'player_page_url', 'player_filename', 'player_square_image_url', 
                       'player_image_url', 'player_yahoo_id', 'cbs_player_id', 'player_bye_week', 
                       'player_owned_avg', 'player_owned_espn', 'player_owned_yahoo', "note", "tag", "recommendation"]
    
    final_df = final_df.drop(columns=[col for col in columns_to_drop if col in final_df.columns])
    return final_df

In [3]:
def main():
    base_url = 'http://www.fantasypros.com/nfl/rankings'
    rankings_list = ['qb', 'ppr-rb', 'ppr-wr', 'ppr-te', 'dst']
    
    final_df = process_fantasy_pros_data(base_url, rankings_list)
    
    if final_df is not None:
        final_df.to_csv("../data/fantasy_pros.csv", index=False)
        logging.info("Data successfully saved to fantasy_pros.csv")
    else:
        logging.error("Failed to process FantasyPros data")
    
    return final_df

# Run the main function
final_df = main()

# Display the first few rows of the dataframe
if final_df is not None:
    display(final_df.head())
else:
    print("No data to display")

2024-07-08 02:13:26,931 - INFO - Fetching data from http://www.fantasypros.com/nfl/rankings/qb.php
2024-07-08 02:13:27,098 - INFO - Fetching data from http://www.fantasypros.com/nfl/rankings/ppr-rb.php
2024-07-08 02:13:27,389 - INFO - Fetching data from http://www.fantasypros.com/nfl/rankings/ppr-wr.php
2024-07-08 02:13:27,538 - INFO - Fetching data from http://www.fantasypros.com/nfl/rankings/ppr-te.php
2024-07-08 02:13:27,671 - INFO - Fetching data from http://www.fantasypros.com/nfl/rankings/dst.php
  df = df.replace(TEAM_NAME_MAPPING)
2024-07-08 02:13:27,870 - INFO - Data successfully saved to fantasy_pros.csv


Unnamed: 0,player_name,player_team_id,player_position_id,rank_ecr,pos_rank
0,joshallen,BUF,QB,1,QB1
1,patrickmahomes,KC,QB,2,QB2
2,lamarjackson,BAL,QB,3,QB3
3,jalenhurts,PHI,QB,4,QB4
4,anthonyrichardson,IND,QB,5,QB5
