In [37]:
import pandas as pd
import numpy as py

In [39]:
pip install requests




In [40]:
import requests

In [43]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [47]:
# Page to scrape; the later cells read its HTML tables with pandas.
url = 'https://www.pro-football-reference.com/years/#years'

# A timeout bounds the wait so a stalled connection cannot hang the kernel
# (requests waits indefinitely when no timeout is given).
response = requests.get(url, timeout=30)
if response.status_code == 200:
    htmlcontent = response.text
    print('Successfully extracted data')
else:
    # Leave an explicit "no content" marker instead of an undefined or stale
    # variable, and report the status code so the failure can be diagnosed.
    htmlcontent = None
    print(f'There was a problem (HTTP {response.status_code})')

Successfully extracted data


In [None]:
# Fetch the page again; the timeout bounds the wait so a stalled connection
# cannot hang the kernel.
response = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx reply instead of continuing silently.
response.raise_for_status()

In [56]:
import requests
from io import StringIO

# BeautifulSoup is used below, so it must be imported in this cell for the
# notebook to run on a fresh kernel.
from bs4 import BeautifulSoup

# Make a GET request to fetch the page content. The timeout bounds the wait so
# a stalled connection cannot hang the kernel.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise on a 4xx/5xx reply instead of parsing an error page

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract every HTML <table> on the page into a list of pandas DataFrames.
# The markup is wrapped in StringIO because read_html expects a path, URL or
# file-like object; passing literal HTML text is deprecated in recent pandas.
tables = pd.read_html(StringIO(str(soup)))

# Preview the first rows of each table, numbered from 1, for inspection
for i, table in enumerate(tables, start=1):
    print(f"Table {i}")
    print(table.head())
    print("\n")


Table 1
   Year   Lg                                         Unnamed: 2
0  2024  NFL                                                NaN
1  2023  NFL  Super Bowl LVIII: Kansas City Chiefs (AFC,11-6...
2  2022  NFL  Super Bowl LVII: Kansas City Chiefs (AFC,14-3)...
3  2021  NFL  Super Bowl LVI: Los Angeles Rams (NFC,12-5) de...
4  2020  NFL  Super Bowl LV: Tampa Bay Buccaneers (NFC,11-5)...




### Part 1 Replace Headers

In [59]:
# Old header -> new header for the columns being relabelled.
header_renames = {"Year": "Yr", "Lg": "league"}

# Work on the first parsed table (change the index to pick a different one).
# rename() returns a new DataFrame, so the entry in `tables` is left untouched.
table = tables[0].rename(columns=header_renames)

# Show the first rows so the new header names can be checked.
print("Modified Table with Updated Headers:")
print(table.head())

Modified Table with Updated Headers:
     Yr league                                         Unnamed: 2
0  2024    NFL                                                NaN
1  2023    NFL  Super Bowl LVIII: Kansas City Chiefs (AFC,11-6...
2  2022    NFL  Super Bowl LVII: Kansas City Chiefs (AFC,14-3)...
3  2021    NFL  Super Bowl LVI: Los Angeles Rams (NFC,12-5) de...
4  2020    NFL  Super Bowl LV: Tampa Bay Buccaneers (NFC,11-5)...


### Part 2 Format data into a more readable format

In [74]:
# Trim leading/trailing whitespace from every column label.
trimmed_labels = table.columns.str.strip()
table.columns = trimmed_labels

# Session-wide pandas display options that make wide tables easier to read.
display_options = {
    "display.max_columns": None,               # show all columns
    "display.width": 1000,                     # wide lines before wrapping
    "display.float_format": '{:,.0f}'.format,  # floats without decimals
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Preview the table under the new display settings.
print("Cleaned and Formatted Table:")
print(table.head())

Cleaned and Formatted Table:
     Yr league                                                                                             Unnamed: 2
0  2024    NFL                                                                                                    NaN
1  2023    NFL  Super Bowl LVIII: Kansas City Chiefs (AFC,11-6) defeated San Francisco 49ers (NFC,12-5), Score: 25-22
2  2022    NFL   Super Bowl LVII: Kansas City Chiefs (AFC,14-3) defeated Philadelphia Eagles (NFC,14-3), Score: 38-35
3  2021    NFL       Super Bowl LVI: Los Angeles Rams (NFC,12-5) defeated Cincinnati Bengals (AFC,10-7), Score: 23-20
4  2020    NFL     Super Bowl LV: Tampa Bay Buccaneers (NFC,11-5) defeated Kansas City Chiefs (AFC,14-2), Score: 31-9


### Part 3 Find Duplicates

In [78]:
# Boolean mask: True for every row that repeats an earlier row in all columns.
is_repeat = table.duplicated()
duplicates = table.loc[is_repeat]

# Report the repeated rows, or state that there are none.
if duplicates.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate Rows Found:")
    print(duplicates)

No duplicate rows found.


### Part 4 Identify outliers and bad data

In [86]:
# Detect outliers in the numeric columns with the interquartile-range (IQR) rule:
# a value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3.
#
# The flags are accumulated in one boolean mask over the table's rows rather
# than concatenating per-column slices. Each flagged row is therefore reported
# once, however many columns flag it, and rows are identified by position, not
# by content, so distinct rows with identical values are not merged.
outlier_mask = pd.Series(False, index=table.index)
for column in table.select_dtypes(include=['number']).columns:
    Q1 = table[column].quantile(0.25)
    Q3 = table[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_mask |= (table[column] < lower_bound) | (table[column] > upper_bound)

outliers = table[outlier_mask]

# Identify "bad data": rows with a missing value in at least one column
bad_data = table[table.isnull().any(axis=1)]

# Print outliers and bad data
if not outliers.empty:
    print("Outliers Found:")
    print(outliers)
else:
    print("No outliers found.")

if not bad_data.empty:
    print("\nRows with Bad Data (e.g., missing values):")
    print(bad_data)
else:
    print("\nNo bad data found.")


No outliers found.

Rows with Bad Data (e.g., missing values):
     Yr league Unnamed: 2
0  2024    NFL        NaN


### Part 5 Conduct Fuzzy Matching

In [91]:
pip install fuzzywuzzy[speedup]

Collecting fuzzywuzzy[speedup]Note: you may need to restart the kernel to use updated packages.

  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.1-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp311-cp311-win_amd64.whl (98 kB)
   ---------------------------------------- 0.0/98.5 kB ? eta -:--:--
   ------------------------------------- -- 92.2/98.5 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 98.5/98.5 kB 802.5 kB/s eta 0:00:00
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy, Levenshtein, python-levenshtein
Successfully installed Levenshtein-0.26.1 

In [93]:
pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.


In [99]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from rapidfuzz import fuzz, process

In [101]:
# Fuzzy-match the values of the "league" column against each other to surface
# near-duplicate spellings.
# NOTE: `process` and `fuzz` must be the rapidfuzz versions (the import cell
# imports rapidfuzz last, so those names win); `score_cutoff` and `limit=None`
# below are rapidfuzz parameters.

# Minimum fuzz.ratio score (0-100) for two values to count as similar
similarity_threshold = 90
fuzzy_matches = []

# Compare each distinct value once instead of once per row: repeated rows would
# only repeat the same query. Missing values are dropped because they are not
# text that can be compared.
league_values = table['league'].dropna().astype(str).unique().tolist()

for entry in league_values:
    # Only differently-spelled values are candidates; an exact copy of the
    # entry is not a fuzzy match.
    candidates = [value for value in league_values if value != entry]

    # limit=None returns every candidate that reaches the cutoff. The default
    # limit of 5 can hide real matches behind higher-ranked results.
    similar_entries = process.extract(
        entry,
        candidates,
        scorer=fuzz.ratio,
        score_cutoff=similarity_threshold,
        limit=None,
    )

    # Collect results for each entry with its similar matches
    if similar_entries:
        fuzzy_matches.append((entry, similar_entries))

# Print fuzzy matches if any; each match tuple starts with (value, score)
if fuzzy_matches:
    print("Fuzzy Matches Found:")
    for entry, matches in fuzzy_matches:
        print(f"\nOriginal Entry: {entry}")
        print("Similar Entries:")
        for match in matches:
            print(f" - {match[0]} (Similarity: {match[1]}%)")
else:
    print("No fuzzy matches found.")

No fuzzy matches found.
