## Final Project: Fantasy Sports Assistant 

#### 1.1 Handle imports up-front 


In [3]:
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup 

#### 2.0 Load the Dataset


In [4]:

# Base URL for scraping, leaving the year and week range variables
base_url = "https://www.footballguys.com/playerhistoricalstats?pos=flex&yr={}&startwk={}&stopwk={}&profile=pi"

# Initialize a list to hold all the player data across years and weeks
all_player_data = []

# Loop over years from 1996 to 2024
for year in range(1996, 2025):  # From 1996 to 2024 (inclusive)
    # Loop over weeks 1 to 18
    for week in range(1, 19):  # Week 1 to Week 18
        # Format the URL with the current year, startwk, and stopwk
        url = base_url.format(year, week, week)
        
        # Send a GET request to the website
        response = requests.get(url)
        
        # Check if the request was successful
        if response.status_code == 200:
            print(f"Successfully fetched data for Week {week} of Year {year}!")
        else:
            print(f"Failed to retrieve data for Week {week} of Year {year} with status code {response.status_code}")
            continue  # Skip this week if there's an issue
        
        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table containing the player stats (you may need to adjust the selector)
        table = soup.find('table')  # Modify this selector based on the page structure

        # If the table exists, proceed with scraping the rows
        if table:
            rows = table.find_all('tr')

            # Iterate through each row (skip header row) and extract the required data
            for row in rows[1:]:  # Skip the first row (header row)
                cols = row.find_all('td')

                # Check if there are enough columns in the row (to avoid index errors)
                if len(cols) > 1:  # Ensure there's at least a Player and Position column
                    # Extract the player name, position, and points (adjust index based on table structure)
                    player_name = cols[0].text.strip()   # Player Name in first column
                    position = cols[1].text.strip()      # Position in second column
                    points = cols[2].text.strip() if len(cols) > 2 else None  # Points in third column (adjust index)

                    # Store player info in a dictionary, including Year and Week
                    player_info = {
                        "Year": year,
                        "Week": week,
                        "Player": player_name,
                        "Position": position,
                        "Points": points  
                    }

                    # Append player info to all_player_data list
                    all_player_data.append(player_info)
        else:
            print(f"Table not found for Week {week} of Year {year}")

# Convert the list of player data into a pandas DataFrame
df = pd.DataFrame(all_player_data)

# Display the first few rows of the combined data
print(df.head())

# Optionally, save the data to a CSV file
df.to_csv('player_stats_1996_to_2024.csv', index=False)

NameError: name 'requests' is not defined

In [55]:
print(df)

                                Position  Points
Player Year Week                                
1      2024 18          Jahmyr Gibbs DET    22.0
10     2024 18         De'Von Achane MIA    23.0
100    2024 18           Chris Conley SF    32.0
101    2024 18       KaVontae Turpin DAL    28.0
102    2024 18        Dallas Goedert PHI    29.0
...                                  ...     ...
95     2024 18          Tony Pollard TEN    27.0
96     2024 18         Antonio Gibson NE    26.0
97     2024 18    Travis Etienne Jr. JAX    25.0
98     2024 18       Brenton Strange JAX    24.0
99     2024 18           Chris Brooks GB    24.0

[344 rows x 2 columns]


In [57]:
print(df.columns)

Index(['Position', 'Points'], dtype='object')


### 3.0  Data Transformation - Time Series Data for Players

In [50]:
# Set the multi-index with 'Player', 'Year', and 'Week'
df.set_index(['Player', 'Year', 'Week'], inplace=True)

# Sort the DataFrame by the new multi-index
df.sort_index(inplace=True)

# Display the DataFrame to confirm the changes
print(df.head())

                            Position Points
Player Year Week                           
1      1996 1        Terry Allen WAS   28.0
            2       Eddie George HOU   23.0
            3       Curtis Martin NE   23.0
            4     LeShon Johnson ARI   25.0
            5        Terry Allen WAS   28.0


In [51]:
# Initialize an empty list to hold player data
all_player_data = []

# Example loop to add player data
for row in rows[1:]:
    cols = row.find_all('td')
    if len(cols) > 1:
        player_info = {
            "Year": year,
            "Week": week,
            "Player": cols[0].text.strip(),
            "Position": cols[1].text.strip(),
            "Points": cols[2].text.strip() if len(cols) > 2 else None
        }
        # Append to the list (not DataFrame directly)
        all_player_data.append(player_info)

# Convert list of player data to DataFrame at the end
df = pd.DataFrame(all_player_data)

# Display the DataFrame
print(df.head())

   Year  Week Player             Position Points
0  2024    18      1     Jahmyr Gibbs DET   22.0
1  2024    18      2     Drake London ATL   23.0
2  2024    18      3   Bijan Robinson ATL   22.0
3  2024    18      4    Derrick Henry BAL   30.0
4  2024    18      5  Jonathan Taylor IND   25.0


In [52]:
print(player_data)

                         Position  Points
Player Year Week                         
99     2024 18    Chris Brooks GB    24.0


In [53]:
df['Points'] = pd.to_numeric(df['Points'], errors='coerce')

# Step 2: Set multi-index with 'Player', 'Year', and 'Week'
df.set_index(['Player', 'Year', 'Week'], inplace=True)

# Sort the DataFrame for better visualization and chronological order
df.sort_index(inplace=True)

# Display the first few rows to verify the DataFrame structure
print(df.head())

# Step 3: Loop through the data to create input-output pairs
input_features = []
output_labels = []

# Loop through each player and year to create time-series pairs
for player, player_data in df.groupby('Player'):
    for year, year_data in player_data.groupby('Year'):
        # Sort by week to ensure the data is in chronological order
        year_data = year_data.sort_values('Week')

        # Create input-output pairs for consecutive weeks
        for i in range(len(year_data) - 1):
            # Current week's points as input feature
            input_row = year_data.iloc[i]['Points']

            # Next week's points as output label
            output_row = year_data.iloc[i + 1]['Points']

            # Append the features and labels
            input_features.append(input_row)
            output_labels.append(output_row)

# Step 4: Combine input features and output labels into a final DataFrame
features_df = pd.DataFrame(input_features, columns=['Week_Points'])
labels_df = pd.DataFrame(output_labels, columns=['Next_Week_Points'])

# Combine both features and labels into a final DataFrame
final_df = pd.concat([features_df, labels_df], axis=1)

# Display the first few rows of the final dataset
print(final_df.head())

# Optionally, save the data to a CSV file
final_df.to_csv('qb_player_stats_2024_input_output.csv', index=False)

                             Position  Points
Player Year Week                             
1      2024 18       Jahmyr Gibbs DET    22.0
10     2024 18      De'Von Achane MIA    23.0
100    2024 18        Chris Conley SF    32.0
101    2024 18    KaVontae Turpin DAL    28.0
102    2024 18     Dallas Goedert PHI    29.0
Empty DataFrame
Columns: [Week_Points, Next_Week_Points]
Index: []


In [54]:
# If needed, convert to numpy arrays for machine learning models
import numpy as np
features_array = np.array(features_df)
labels_array = np.array(labels_df)

# Check the shapes of the arrays
print(features_array.shape, labels_array.shape)

(0, 1) (0, 1)


#### 2.0 EDA 

### 2.1 Missing and/or extreme values 