In [None]:
# Scope note: "2020-2024" is taken to mean the five seasons 2020-2021 through 2024-2025

In [1]:
# import libraries
# --- standard library ---
# for scraping intervals
import datetime as dt
## import logging
import logging
# for sleep function
import time
# for requesting pages with custom headers
import urllib.request
# for handing downloaded HTML text to pandas
from io import StringIO
# import randint necessary library
from random import randint, uniform

# --- third-party ---
import pandas as pd


In [3]:
## HTTP request headers: a desktop Chrome-on-Windows User-Agent string,
## assembled from its three space-separated parts
headers = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/124.0.0.0 Safari/537.36",
        )
    )
}

In [5]:
# Scrape one free-agents page per year 2020-2024 and collect the first table
# of each page into df_list.
# base_url ends in "202" on purpose: the loop appends the last digit of the year.
base_url = "https://www.baseball-reference.com/leagues/majors/202"
df_list = []       # one DataFrame per page that scraped successfully
broken_links = []  # urls that raised, kept so they can be inspected or retried

year_digits = range(0, 5)  # 0..4 -> 2020..2024

for number in year_digits:
    url = f"{base_url}{number}-free-agents.shtml"
    print(f"Scraping page {number}, url: {url}")
    try:
        # Fetch the page ourselves so the browser-like `headers` are actually
        # sent; pd.read_html(url) on its own never used them.
        request = urllib.request.Request(url, headers=headers)
        # timeout so one stalled connection cannot hang the whole run
        with urllib.request.urlopen(request, timeout=30) as response:
            charset = response.headers.get_content_charset() or "utf-8"
            html = response.read().decode(charset)
        # [0] keeps the first table pandas finds on the page
        df = pd.read_html(StringIO(html))[0]
        # remember which page each row came from
        df["source_url"] = url
        df_list.append(df)
    except Exception as e:
        # best-effort scrape: report the failure, record the url, keep going
        print(f"Encountered an issue: {e} at {url}")
        broken_links.append(url)

    # Pause a random 5-20 seconds between requests; nothing follows the last
    # page, so there is no reason to wait after it.
    if number != year_digits[-1]:
        snoozer = uniform(5,20)
        print(f"Snoozing for {snoozer} seconds before next scrape")
        time.sleep(snoozer)

print("done scraping all urls")
# surface failures here instead of leaving them buried in the log above
if broken_links:
    print(f"{len(broken_links)} url(s) could not be scraped: {broken_links}")

Scraping page 0, url: https://www.baseball-reference.com/leagues/majors/2020-free-agents.shtml
Snoozing for 19.189627670427548 seconds before next scrape
Scraping page 1, url: https://www.baseball-reference.com/leagues/majors/2021-free-agents.shtml
Snoozing for 16.491097135815657 seconds before next scrape
Scraping page 2, url: https://www.baseball-reference.com/leagues/majors/2022-free-agents.shtml
Snoozing for 13.597517860535227 seconds before next scrape
Scraping page 3, url: https://www.baseball-reference.com/leagues/majors/2023-free-agents.shtml
Snoozing for 11.08569706441123 seconds before next scrape
Scraping page 4, url: https://www.baseball-reference.com/leagues/majors/2024-free-agents.shtml
Snoozing for 6.752626203435501 seconds before next scrape
done scraping all urls


In [7]:
# concat the per-page frames into a single dataframe with a fresh 0..n-1 index
# Guard first: if every page failed to scrape, df_list is empty and pd.concat
# would raise a generic ValueError; raise the same type with an actionable message.
if not df_list:
    raise ValueError("df_list is empty: no pages were scraped, so there is nothing to concatenate")
df = pd.concat(df_list, ignore_index=True)
df

Unnamed: 0,Rk,Name,Date,To Team,From Team,Age,WAR3,Yrs,G,AB,...,WHIP,G.1,GS,SV,IP,H.1,HR.1,BB.1,SO,source_url
0,1,Ben Gamel,2021-05-09,Pittsburgh Pirates,MIL,29,0.8,5,442.0,1239.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
1,2,Brian Goodwin,2021-05-05,Chicago White Sox,CIN,30,1.9,5,357.0,1009.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
2,3,Christian Bethancourt,2021-05-01,Pittsburgh Pirates,PHI,29,,5,161.0,469.0,...,3.375,6.0,0.0,0.0,5.1,7.0,1.0,11.0,3.0,https://www.baseball-reference.com/leagues/maj...
3,4,Aaron Wilkerson,2021-05-01,Los Angeles Dodgers,MIL,32,-0.5,3,,,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
4,5,Tim Adleman,2021-04-30,Cincinnati Reds,DET,33,,4,,,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,396,Chris Devenski,2024-10-31,New York Mets,SEA,34,-1.0,9,25.0,1.0,...,1.135,303.0,7.0,8.0,400.0,343.0,58.0,111.0,415.0,https://www.baseball-reference.com/leagues/maj...
2023,397,Geoff Hartlieb,2024-10-18,New York Yankees,COL,31,-0.3,5,34.0,0.0,...,1.866,64.0,0.0,0.0,79.1,93.0,11.0,55.0,76.0,https://www.baseball-reference.com/leagues/maj...
2024,398,Yohan Ramírez,2024-10-13,Pittsburgh Pirates,BOS,30,-0.9,5,4.0,0.0,...,1.379,140.0,0.0,7.0,169.0,147.0,20.0,86.0,173.0,https://www.baseball-reference.com/leagues/maj...
2025,399,Zach Logue,2024-09-12,Los Angeles Dodgers,ATL,29,-1.5,3,0.0,0.0,...,1.557,19.0,10.0,0.0,70.0,87.0,17.0,22.0,56.0,https://www.baseball-reference.com/leagues/maj...


In [9]:
# checking column headers are all there:
# info() prints every column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2027 entries, 0 to 2026
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rk          2027 non-null   int64  
 1   Name        2027 non-null   object 
 2   Date        2027 non-null   object 
 3   To Team     2025 non-null   object 
 4   From Team   2027 non-null   object 
 5   Age         2027 non-null   int64  
 6   WAR3        1959 non-null   float64
 7   Yrs         2027 non-null   int64  
 8   G           1790 non-null   float64
 9   AB          1790 non-null   float64
 10  R           1790 non-null   float64
 11  H           1790 non-null   float64
 12  HR          1790 non-null   float64
 13  RBI         1790 non-null   float64
 14  SB          1790 non-null   float64
 15  BB          1790 non-null   float64
 16  BA          1447 non-null   float64
 17  OBP         1455 non-null   float64
 18  SLG         1447 non-null   float64
 19  OPS         1447 non-null  

In [11]:
# checking that repeated header rows are not being included as data rows
# (if they were, the first one would be expected to show up around row 26)
# Check the whole concatenated frame rather than only one slice of the first
# page: a header row scraped as data would carry the column name as its value.
assert not df["Name"].astype(str).eq("Name").any(), "repeated header row(s) found in df"
# visual spot check of the rows around position 26
df.iloc[20:31]

Unnamed: 0,Rk,Name,Date,To Team,From Team,Age,WAR3,Yrs,G,AB,...,WHIP,G.1,GS,SV,IP,H.1,HR.1,BB.1,SO,source_url
20,21,Jesse Biddle,2021-04-02,Atlanta Braves,CIN,29,-0.4,3,69.0,2.0,...,1.592,91.0,0.0,1.0,92.1,93.0,11.0,54.0,94.0,https://www.baseball-reference.com/leagues/maj...
21,22,Zack Granite,2021-04-02,Chicago White Sox,NYY,28,,3,,,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
22,23,Roenis Elías,2021-04-01,Seattle Mariners,WSN,32,0.6,6,11.0,7.0,...,1.32,126.0,54.0,14.0,388.0,364.0,44.0,148.0,325.0,https://www.baseball-reference.com/leagues/maj...
23,24,Matt Magill,2021-04-01,Seattle Mariners,SEA,31,0.5,5,15.0,8.0,...,1.55,112.0,6.0,5.0,149.2,150.0,28.0,82.0,158.0,https://www.baseball-reference.com/leagues/maj...
24,25,Carl Edwards Jr.,2021-03-31,Atlanta Braves,SEA,29,0.8,6,182.0,3.0,...,1.096,199.0,0.0,3.0,180.2,97.0,15.0,101.0,242.0,https://www.baseball-reference.com/leagues/maj...
25,26,Juan Graterol,2021-03-31,Toronto Blue Jays,MIN,32,-0.1,4,,,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
26,27,Tony Wolters,2021-03-31,Chicago Cubs,COL,29,1.6,5,391.0,1075.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
27,28,Todd Frazier,2021-03-30,Pittsburgh Pirates,NYM,35,5.2,10,1231.0,4357.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,https://www.baseball-reference.com/leagues/maj...
28,29,Tommy Hunter,2021-03-30,New York Mets,PHI,34,2.1,13,102.0,4.0,...,1.241,472.0,75.0,22.0,863.1,877.0,119.0,194.0,591.0,https://www.baseball-reference.com/leagues/maj...
29,30,Derek Dietrich,2021-03-29,New York Yankees,TEX,31,0.9,8,746.0,2191.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...


In [13]:
# parse the Date column (read in as object/strings) into datetime64 values
df["Date"] = df["Date"].pipe(pd.to_datetime)

In [15]:
# each page lists its rows newest-first, so order the combined frame
# chronologically (oldest first); the original index labels are kept
df = df.sort_values("Date", ascending=True)

In [17]:
# display the frame to eyeball the result of the date sort
df

Unnamed: 0,Rk,Name,Date,To Team,From Team,Age,WAR3,Yrs,G,AB,...,WHIP,G.1,GS,SV,IP,H.1,HR.1,BB.1,SO,source_url
379,380,AJ Ramos,2020-09-05,Colorado Rockies,CHC,34,-0.4,8,356.0,0.0,...,1.280,377.0,0.0,99.0,368.2,269.0,25.0,203.0,427.0,https://www.baseball-reference.com/leagues/maj...
378,379,Justin Smoak,2020-09-09,San Francisco Giants,MIL,34,2.3,11,1286.0,4153.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
377,378,Stevie Wilkerson,2020-10-29,Baltimore Orioles,BAL,29,-0.8,2,133.0,375.0,...,1.125,4.0,0.0,1.0,5.1,6.0,2.0,0.0,1.0,https://www.baseball-reference.com/leagues/maj...
376,377,Kendall Graveman,2020-10-29,Seattle Mariners,SEA,30,-1.2,6,6.0,5.0,...,1.371,94.0,80.0,0.0,464.2,499.0,60.0,138.0,301.0,https://www.baseball-reference.com/leagues/maj...
375,376,Abraham Almonte,2020-10-30,Atlanta Braves,SDP,32,-0.4,8,376.0,1038.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,5,Jose Quintana,2025-03-05,Milwaukee Brewers,NYM,36,7.6,13,95.0,156.0,...,1.276,359.0,333.0,0.0,1969.2,1923.0,201.0,591.0,1727.0,https://www.baseball-reference.com/leagues/maj...
1630,4,Jalen Beeks,2025-03-10,Houston Astros,PIT,31,0.6,6,7.0,5.0,...,1.421,202.0,19.0,15.0,347.2,356.0,36.0,138.0,329.0,https://www.baseball-reference.com/leagues/maj...
1629,3,Dillon Tate,2025-03-12,Toronto Blue Jays,TOR,31,1.1,5,7.0,0.0,...,1.168,190.0,0.0,9.0,215.2,187.0,19.0,65.0,170.0,https://www.baseball-reference.com/leagues/maj...
1627,1,Travis Jankowski,2025-03-13,Chicago White Sox,TEX,34,1.7,10,681.0,1507.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
