# Data Collection

## College Football Database

In [None]:
# Import college football api
!pip install --q cfbd python-dotenv

In [1]:
# Get general dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
# Install API from college football data
import cfbd
from dotenv import dotenv_values

# Load stuff from .env file
env_vars = dotenv_values('.env')

configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = env_vars.get('CFBD_API_KEY')
configuration.api_key_prefix['Authorization'] = env_vars.get('CFBD_API_KEY_PREFIX')

api_config = cfbd.ApiClient(configuration)

In [8]:
# Create a teams api instance and a games api instance and explore
games_api = cfbd.GamesApi(api_config)
#players_api = cfbd.PlayersApi(api_config)

### Team Information

In [78]:
# Get team info
def get_team_info(**kwargs):
  team_info = kwargs['api'].get_fbs_teams()
  return team_info

# Parse the plays into a dataframe
def team_info_to_df(teams): 
  teams_dict = [dict(
    team = t.school,
    abbreviation = t.abbreviation,
    team_id = t.id,
    conference = t.conference,
    stadium_capacity = t.location.capacity,
    logo = t.logos[0]
  ) for t in teams if t.id is not None]
  teams_info_df = pd.DataFrame(teams_dict)
  return teams_info_df

In [80]:
# Grab Team API object
teams_api = cfbd.TeamsApi(api_config)

# Run our functions to get all info we need from that API
team_info_df = team_info_to_df(
    get_team_info(api=teams_api)
)
team_info_df.head()

Unnamed: 0,team,abbreviation,team_id,conference,stadium_capacity,logo
0,Air Force,AFA,2005,Mountain West,46692.0,http://a.espncdn.com/i/teamlogos/ncaa/500/2005...
1,Akron,AKR,2006,Mid-American,30000.0,http://a.espncdn.com/i/teamlogos/ncaa/500/2006...
2,Alabama,ALA,333,SEC,101821.0,http://a.espncdn.com/i/teamlogos/ncaa/500/333.png
3,Appalachian State,APP,2026,Sun Belt,30000.0,http://a.espncdn.com/i/teamlogos/ncaa/500/2026...
4,Arizona,ARIZ,12,Pac-12,50782.0,http://a.espncdn.com/i/teamlogos/ncaa/500/12.png


In [42]:
team_info_df.to_csv('data/team_info.csv', index=False)

### Team Records Info (2000-Present)

In [6]:
# Get team records
def get_records(**kwargs):
  records = []
  for year in kwargs['years']:
    for team in kwargs['teams']:    
      records += kwargs['api'].get_team_records(year=year, team=team)
    time.sleep(1)  
  return records

# Parse the plays into a dataframe
def records_to_df(records): 
  records_dict = [dict(
    team = r.team,
    team_id = r.team_id,
    year = r.year,
    conference = r.conference,
    games_played = r.total.games,
    expected_wins = r.expected_wins,
    wins = r.total.wins,
    losses = r.total.losses,
    home_wins = r.home_games.wins,
    home_losses = r.home_games.losses,
    away_wins = r.away_games.wins,
    away_losses = r.away_games.losses,
  ) for r in records if r.team is not None]
  records_df = pd.DataFrame(records_dict)
  return records_df 

Note: The CFBD doesn't handle numpy.int64 types. You need to use range() rather than list(np.arange()) because the elements of a list from range will be regular python ints.

In [38]:
type(list(np.arange(2000,2002))[0])

numpy.int64

In [39]:
#games_api.get_team_records(year=2023, team='South Carolina')
years = range(2000,2023+1)
teams = [team for team in team_info_df.team]

records_df = records_to_df(
    get_records(api=games_api, years=years, teams=teams)
)
records_df.head()

Unnamed: 0,team,team_id,year,conference,games_played,expected_wins,wins,losses,home_wins,home_losses,away_wins,away_losses
0,Air Force,2005,2000,Mountain West,12,0.0,9,3,5,1,3,2
1,Akron,2006,2000,Mid-American,11,0.0,6,5,3,3,3,2
2,Alabama,333,2000,SEC,11,0.0,3,8,3,3,0,5
3,Arizona,12,2000,Pac-10,11,0.0,5,6,2,4,3,2
4,Arizona State,9,2000,Pac-10,12,0.0,6,6,3,3,3,2


In [41]:
records_df.to_csv('data/team_records_by_year.csv', index=False)

### Team Performance Ratings

In [18]:
ratings_api = cfbd.RatingsApi(api_config)

In [15]:
# Get team records
def get_ratings(**kwargs):
  elos = []
  fpis = []
  #sps = []
  for year in kwargs['years']:
    for team in kwargs['teams']:    
        elos += kwargs['api'].get_elo_ratings(year=year, team=team)
        fpis += kwargs['api'].get_fpi_ratings(year=year, team=team)
        conf_rating += kwargs['api'].get_conference_sp_ratings(year=year, conferenece=team)
  #time.sleep(1)  
  return {'elos': elos, 'fpis': fpis}

# Parse the plays into a dataframe
def ratings_to_df(**kwargs): 
  elos_df = pd.DataFrame([dict(team=r.team, year=r.year, conference=r.conference, elo=r.elo) for r in kwargs['elo'] if r.team is not None])
  fpis_df = pd.DataFrame([dict(team=r.team, year=r.year, conference=r.conference, fpi=r.fpi) for r in kwargs['fpi'] if r.team is not None])
  #sps_df = pd.DataFrame([dict(team=r.team, year=r.year, conference=r.conference, sp=r.sp) for r in kwargs['sp'] if r.team is not None])
  ratings_df = pd.merge(elos_df, fpis_df, on=['team', 'year', 'conference'])
  return ratings_df 

In [81]:
years = range(2000,2023+1)
teams = [team for team in team_info_df.team]
team_ratings_df = ratings_to_df(elo = get_ratings(api=ratings_api, years = years, teams = teams)['elos'],
                           fpi = get_ratings(api=ratings_api, years = years, teams = teams)['fpis'])
team_ratings_df.head()

Unnamed: 0,team,year,conference,elo,fpi
0,Air Force,2005,Mountain West,1419.0,-2.383
1,Akron,2005,Mid-American,1394.0,-8.552
2,Alabama,2005,SEC,1798.0,14.764
3,Arizona,2005,Pac-10,1480.0,4.534
4,Arizona State,2005,Pac-10,1692.0,14.189


In [82]:
# Combine with conference ratings
def get_conference_ratings(years):
    conferences, ratings, years_list = [], [], []
    for year in years:
        conference_ratings = ratings_api.get_conference_sp_ratings(year=year)
        conferences += [element.conference for element in conference_ratings]
        ratings += [element.rating for element in conference_ratings]
        years_list += [element.year for element in conference_ratings]
    return pd.DataFrame({'conference': conferences, 'conference_rating': ratings, 'year': years_list})

In [83]:
years = range(2000,2023+1)
conference_ratings_df = get_conference_ratings(years)
conference_ratings_df.head()

Unnamed: 0,conference,conference_rating,year
0,ACC,3.477778,2000
1,Big 12,9.45,2000
2,Big East,12.175,2000
3,Big Ten,8.190909,2000
4,Big West,-10.683333,2000


In [84]:
team_conf_ratings_df = pd.merge(team_ratings_df, conference_ratings_df, on=['conference','year'])
team_conf_ratings_df

Unnamed: 0,team,year,conference,elo,fpi,conference_rating
0,Air Force,2005,Mountain West,1419.0,-2.383,-0.811111
1,Akron,2005,Mid-American,1394.0,-8.552,-10.025000
2,Alabama,2005,SEC,1798.0,14.764,7.625000
3,Arizona,2005,Pac-10,1480.0,4.534,11.090000
4,Arizona State,2005,Pac-10,1692.0,14.189,11.090000
...,...,...,...,...,...,...
2190,Western Kentucky,2023,Conference USA,1449.0,-5.151,-7.888889
2191,Western Michigan,2023,Mid-American,1223.0,-13.543,-11.308333
2192,West Virginia,2023,Big 12,1611.0,5.931,5.514286
2193,Wisconsin,2023,Big Ten,1631.0,6.043,6.078571


In [85]:
team_conf_ratings_df.to_csv('data/team_conference_ratings.csv', index=False)

### Recruiting Data
- Team recruiting rankings and ratings for all teams 2000-2023
- Player rankings for each team all years
    - Get blue chip ratio
    - Usable for recruiting visualizer

In [86]:
# Get recruiting api
recruiting_api = cfbd.RecruitingApi(api_config)

In [91]:
# Get team recruiting object and just return everything
team_recruiting = recruiting_api.get_recruiting_teams()

In [98]:
# Traverse the object and turn into DF
points = [entry.points for entry in team_recruiting]
ranks = [entry.rank for entry in team_recruiting]
teams = [entry.team for entry in team_recruiting]
years = [entry.year for entry in team_recruiting]

team_recruiting_df = pd.DataFrame({'team': teams, 'year': years, 'rank': ranks, 'points': points})
team_recruiting_df.head()

Unnamed: 0,team,year,rank,points
0,Tennessee,2000,1,252.66
1,Florida,2000,2,239.17
2,Florida State,2000,3,225.16
3,Alabama,2000,4,199.31
4,Penn State,2000,5,196.61


In [124]:
team_recruiting_df.to_csv('data/team_recruiting.csv', index=False)

In [102]:
# Get all recruits for each year for Streamlit visualizer and blue chip ratio
player_recruiting = recruiting_api.get_recruiting_players(year=2023)

In [113]:
[row.name for row in player_recruiting[:5]]

['Arch Manning',
 'Keon Keeley',
 'Nico Iamaleava',
 'Zachariah Branch',
 'Nicholaus Iamaleava']

In [121]:
year_list = range(2000,2023 + 1)

names, years, stars, schools, states, rankings, ratings = [], [], [], [], [], [], []

for year in year_list:
    player_recruiting = recruiting_api.get_recruiting_players(year=year)
    for entry in player_recruiting:
        names.append(entry.name)
        years.append(entry.year)
        stars.append(entry.stars)
        schools.append(entry.committed_to)
        states.append(entry.state_province)
        rankings.append(entry.ranking)
        ratings.append(entry.rating)

player_recruiting_df = pd.DataFrame({'name': names, 'year': years, 'star': stars,
                                     'school': schools, 'state': states, 'ranking': rankings,
                                     'rating': ratings})
player_recruiting_df.head()

Unnamed: 0,name,year,star,school,state,ranking,rating
0,D.J. Williams,2000,5,Miami,CA,1.0,0.9998
1,Brock Berlin,2000,5,Florida,LA,2.0,0.9998
2,Charles Rogers,2000,5,Michigan State,MI,3.0,0.9988
3,Travis Johnson,2000,5,Florida State,CA,4.0,0.9982
4,Marcus Houston,2000,5,Colorado,CO,5.0,0.998


In [122]:
player_recruiting_df.to_csv('data/player_recruiting.csv', index=False)

### Player Data (only from 2014 - Present)
- Returning players
- Returning production

In [125]:
player_api = cfbd.PlayersApi(api_config)

In [136]:
returning = player_api.get_returning_production(year=2014)

In [137]:
years_list = range(2014, 2023+1)
years, teams, conferences, passing_usages, rushing_usages, usages = [], [], [], [] ,[] ,[] 
for year in years_list:
    returning = player_api.get_returning_production(year=year)
    for row in returning:
        years.append(row.season)
        teams.append(row.team)
        conferences.append(row.conference)
        passing_usages.append(row.passing_usage)
        rushing_usages.append(row.rushing_usage)
        usages.append(row.usage)

returning_df = pd.DataFrame({'year': years, 'team': teams, 'conference': conferences,
                             'passing_usage': passing_usages, 'rushing_usage': rushing_usages, 'usages': usages})
returning_df.head()    

Unnamed: 0,year,team,conference,passing_usage,rushing_usage,usages
0,2014,Air Force,Mountain West,0.751,0.652,0.684
1,2014,Akron,Mid-American,1.0,0.989,0.952
2,2014,Alabama,SEC,0.078,0.911,0.589
3,2014,Arizona,Pac-12,0.0,0.048,0.142
4,2014,Arizona State,Pac-12,0.99,0.562,0.713


In [138]:
returning_df.to_csv('data/returning_players_2014.csv', index=False)

### Team Stats
- Get season by season stats for teams
- Predicted Points Added

In [139]:
stats_api = cfbd.StatsApi(api_config)

### Draft Stats
- Get draft picks by team to potentially investigate recruiting vs. draft picks

In [140]:
draft_api = cfbd.DraftApi(api_config)

In [145]:
draft = draft_api.get_draft_picks(year=2023)

In [147]:
years, names, teams, conferences, picks, rounds, positions = [], [], [], [] ,[] ,[], []
year_list = range(2000,2023+1)
for year in year_list:
    draft = draft_api.get_draft_picks(year=year)
    for row in draft:
        names.append(row.name)
        years.append(row.year)
        teams.append(row.college_team)
        conferences.append(row.college_conference)
        picks.append(row.pick)
        rounds.append(row.round)
        positions.append(row.position)

draft_df = pd.DataFrame({'name': names, 'team': teams, 'conference': conferences,
                             'pick': picks, 'round': rounds, 'position': positions})
draft_df.head()

Unnamed: 0,name,team,conference,pick,round,position
0,Bryce Young,Alabama,SEC,1,1,Quarterback
1,C.J. Stroud,Ohio State,Big Ten,2,1,Quarterback
2,Will Anderson Jr.,Alabama,SEC,3,1,Outside Linebacker
3,Anthony Richardson,Florida,SEC,4,1,Quarterback
4,Devon Witherspoon,Illinois,Big Ten,5,1,Cornerback


In [None]:
draft_df.to_csv('data/draft_info.csv', index=False)

## On3 NIL Valuations
https://www.on3.com/nil/news/about-on3-nil-valuation-per-post-value/

In [95]:
# Function to format followers into integers
def values_to_int(string):
    if string[0] == '$':
        string = string[1::]
    if 'K' in string:
        number = float(string[:-1])  # Convert the string to float, excluding the 'K' suffix
        return int(number * 1000)    # Multiply the number by 1000 and convert it to an integer
    elif 'M' in string:
        number = float(string[:-1])
        return int(number * 1000000)
    else:
        return int(float(string))     # Convert the string to float and then to an integer

In [128]:
#scrape_NIL_100('https://www.on3.com/nil/rankings/player/nil-100/')
url = 'https://www.on3.com/nil/rankings/player/college/football/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('div', {'class': 'NilPlayerRankingItem_itemContainer___Uo0_'})

In [164]:
names, ranks, schools, positions, followers, valuations = [], [], [], [], [], []
for row in table:
    
    names.append(row.find('div', {'class': 'NilPlayerRankingItem_nameYearContainer__kVMqH'}).text)
    
    ranks.append(row.find('span', {'class': "MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_playerRank__NQmBq css-z52hnt"}).text)
    
    school = row.find('div', {'class': 'NilPlayerRankingItem_statusItem__gikz_'})
    if school:
        schools.append(row.find('div', {'class': 'NilPlayerRankingItem_statusItem__gikz_'}).find('img')['title'])
    else:
        schools.append(None)    
    
    positions.append(row.find('span', {'class': "MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_position__WIvtI css-z52hnt"}).text)
    
    followers.append(values_to_int(row.find('p', {'class': "MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_followersNumber__xG05J css-z52hnt"}).text))
    
    valuation_container = row.find('div', {'class': 'NilPlayerRankingItem_valuationContainer__nV9Sj'})
    valuation = valuation_container.find('p',{'class': 'MuiTypography-root MuiTypography-body1 NilPlayerRankingItem_valuationCurrency___Pa_U css-z52hnt'})
    if valuation:
        valuations.append(values_to_int(valuation.text))
    else:
        valuations.append(None)    

nil_df = pd.DataFrame({'name': names, 'rank': ranks, 'school': schools,
                       'position': positions, 'follwers': followers, 'valuation': valuations})
nil_df.head() 


Unnamed: 0,name,rank,school,position,follwers,valuation
0,Shedeur Sanders,1,colorado buffaloes,QB,2500000,4600000.0
1,Travis Hunter,2,colorado buffaloes,CB,2300000,2700000.0
2,Arch Manning,3,texas longhorns,QB,301000,2400000.0
3,Quinn Ewers,4,texas longhorns,QB,265000,1900000.0
4,Jalen Milroe,5,alabama crimson tide,QB,223000,1600000.0


In [167]:
nil_df.to_csv('data/nil_data.csv', index=False)