<h1 style="text-align:center; font-size:36px; font-weight:bold;">Comprehensive Data-Driven Analysis NBA Players</h1>


<h3 style="text-align:center; font-size:22px; font-weight:bold; font-family: 'Arial', sans-serif; color:#BBBBBB;">
    Created with passion by: Hirad Pejman, Mehran Mahdiani, Raana Fatahi, Sama Zohari 🚀🔥
</h3>


<div style="text-align:center;">
    <img src="project_cover.png" alt="Project Cover" style="width:60%; border-radius:10px;">
</div>

# import libraries 

In [16]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import re
import time
import math
from datetime import datetime

# aval az hame bayad list e baikon haro be tafkik har fasl dar biarim darbiarim

In [46]:
class NBAScraper:
    """
    A class for scraping NBA player statistics from Basketball Reference.
    """
    BASE_URL = "https://www.basketball-reference.com"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }
    
    def __init__(self):
        """
        Initializes the scraper and finds the most recent "Player Stats" page.
        """
        self.current_page = self.get_player_stats_url()
        self.year_links = []
        self.player_links = []
    
    def get_player_stats_url(self):
        """
        Extracts the URL of the "Player Stats" page from the main site.
        """
        response = requests.get(self.BASE_URL, headers=self.HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the "Player Stats" link
        for link in soup.find_all('a'):
            if link.text == 'Player Stats':
                return f"{self.BASE_URL}{link['href']}"
        
        raise ValueError("Player Stats page not found")
    
    def get_yearly_links(self, num_years=5):
        """
        Retrieves links for the past `num_years` seasons.
        """
        current_url = self.current_page
        
        for _ in range(num_years):
            response = requests.get(current_url, headers=self.HEADERS)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the "Previous Season" button to get past season links
            prev_button = soup.find(class_="button2 prev")
            if prev_button and 'href' in prev_button.attrs:
                current_url = f"{self.BASE_URL}{prev_button['href']}"
                self.year_links.append(current_url)
            else:
                break  # Stop if no previous season link is found
    
    def extract_player_links(self):
        """
        Extracts player profile links from the stored yearly season pages.
        """
        for link in self.year_links:
            response = requests.get(link, headers=self.HEADERS)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            table_rows = soup.select('table.stats_table tbody tr')
            player_per_year = []
            
            for row in table_rows:
                player_cell = row.find('td', class_='left')
                if player_cell and player_cell.find('a'):
                    href = player_cell.find('a')['href']
                    player_per_year.append(f"{self.BASE_URL}{href}")
            
            self.player_links.append(self.remove_duplicates(player_per_year))
    
    @staticmethod
    def remove_duplicates(lst):
        """
        Removes duplicate URLs from the list while preserving order.
        """
        seen = set()
        return [x for x in lst if not (x in seen or seen.add(x))]
    
    def process_data(self):
        """
        Runs the complete scraping process and refines extracted data.
        """
        self.get_yearly_links()
        self.extract_player_links()
        
        # Adjust the extracted data based on manual filtering (removing last few entries for some years)
        if len(self.player_links) >= 5:
            self.player_links[1] = self.player_links[1][:-2]  # Trim last 2 players for 2023
            self.player_links[2] = self.player_links[2][:-1]  # Trim last player for 2022
            self.player_links[4] = self.player_links[4][:-1]  # Trim last player for 2020
    
    def display_results(self):
        """
        Prints the number of players extracted per year.
        """
        years = [2024, 2023, 2022, 2021, 2020]
        for i, year in enumerate(years):
            print(f"{year}: {len(self.player_links[i])} players")
    
# Run the scraper
scraper = NBAScraper()
scraper.process_data()
scraper.display_results()

2024: 572 players
2023: 539 players
2022: 605 players
2021: 540 players
2020: 529 players


In [13]:
## khob ma midhnunim tanha chiazyi k mikhaym az har fasl inas: 1- 50 taye aval, 2- nafarate team 2 ghahreman, 3- azaye list e michael jordan trophy
## midhunim azaye list e michael jordan trophy ghatan too 650 taye aval hastan tanha tarsi ke darim ine ke adamaye team e ghahreman nakhan bashan 
## bara hamin mitunim nehine konim!!

In [14]:
## aval lazem darim ta bbinim ch team hayi dar che sali ghahreman shodan!!!

## Champ Team Lists

In [34]:
class BasketballChampionsScraper:
    def __init__(self, url):
        self.url = url
        self.soup = None
        self.season_data = []
        self.filtered_years = ['2019-20', '2020-21', '2021-22', '2022-23', '2023-24']
        self.df = None
        self.champ_teams = []

    def fetch_data(self):
        """Sends a request to the webpage and parses the HTML content."""
        response = requests.get(self.url)
        if response.status_code == 200:
            self.soup = BeautifulSoup(response.content, 'html.parser')
        else:
            raise Exception(f"Failed to fetch data. Status Code: {response.status_code}")

    def extract_data(self):
        """Extracts champion team data from the HTML content."""
        stats_table = self.soup.find('table', {'id': 'stats'})
        if not stats_table:
            raise Exception("Could not find the stats table on the page.")

        rows = stats_table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if cols:
                season = row.find('th').text.strip()
                champion_cell = cols[1].find('a')

                if champion_cell:
                    champion = champion_cell.text.strip()
                    team_abbr = champion_cell['href'].split('/')[-2]
                    season_year = str(int(season[:4]) + 1)  # Adjusting year by +1
                    champion_link = f"https://www.basketball-reference.com/teams/{team_abbr}/{season_year}.html"
                else:
                    champion = cols[1].text.strip()
                    champion_link = None

                self.season_data.append({'Season': season, 'Champion': champion, 'Champion Link': champion_link})

    def filter_data(self):
        """Filters the data for specific seasons."""
        self.df = pd.DataFrame(self.season_data)
        self.df = self.df[self.df['Season'].isin(self.filtered_years)]
        self.champ_teams = self.df['Champion Link'].dropna().tolist()

    def save_to_csv(self, filename='Champ_of_each_season_with_links.csv'):
        """Saves the filtered data to a CSV file."""
        if self.df is not None:
            self.df.to_csv(filename, index=False)

    def display_results(self):
        """Prints the filtered DataFrame and champion team links."""
        print(self.df)
        print(self.champ_teams)

    def run(self):
        """Executes all steps in sequence."""
        self.fetch_data()
        self.extract_data()
        self.filter_data()
        self.save_to_csv()
        self.display_results()

# Running the scraper
scraper = BasketballChampionsScraper('https://www.basketball-reference.com/leagues/')
scraper.run()


    Season               Champion  \
1  2023-24         Boston Celtics   
2  2022-23         Denver Nuggets   
3  2021-22  Golden State Warriors   
4  2020-21        Milwaukee Bucks   
5  2019-20     Los Angeles Lakers   

                                       Champion Link  
1  https://www.basketball-reference.com/teams/BOS...  
2  https://www.basketball-reference.com/teams/DEN...  
3  https://www.basketball-reference.com/teams/GSW...  
4  https://www.basketball-reference.com/teams/MIL...  
5  https://www.basketball-reference.com/teams/LAL...  
['https://www.basketball-reference.com/teams/BOS/2024.html', 'https://www.basketball-reference.com/teams/DEN/2023.html', 'https://www.basketball-reference.com/teams/GSW/2022.html', 'https://www.basketball-reference.com/teams/MIL/2021.html', 'https://www.basketball-reference.com/teams/LAL/2020.html']


## Champ team Players Link!

In [42]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# List of URLs for scraping
urls = champ_teams  

# HTTP headers for requests
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def scrape_totals_table(url):
    """Scrapes the 'Totals' table from the given URL and saves it as a CSV file."""
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the "Totals" table
    table = next((t for t in soup.find_all('table') if t.find('thead') and 'Totals' in t.text), None)
    if not table:
        print(f"Table containing 'Totals' not found for {url}.")
        return

    # Extract headers
    table_headers = [th.text.strip() for th in table.find('thead').find_all('th') if th.text.strip()]

    # Extract rows
    rows = table.find('tbody').find_all('tr')
    data = []
    for row in rows:
        if row.get('class') and 'thead' in row.get('class'):
            continue  # Skip internal header rows
        row_data = [td.text.strip() for td in row.find_all('td')]

        # Ensure row_data matches header length
        row_data = (row_data + [None] * len(table_headers))[:len(table_headers)]

        data.append(row_data)

    # Create DataFrame and save it as CSV
    team_year = url.split('/')[-1].replace('.html', '')
    output_file_path = f'{team_year}_Totals.csv'
    pd.DataFrame(data, columns=table_headers).to_csv(output_file_path, index=False, encoding='utf-8')
    print(f"Data saved to {output_file_path}")

def scrape_champ_player_links():
    """Scrapes player profile links from the champion teams' pages."""
    champ_player_links = {}

    for url in urls:
        response = requests.get(url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('tbody')
        if not table:
            print(f"No player data found for {url}")
            continue

        player_links = [
            'https://www.basketball-reference.com' + a['href']
            for a in table.find_all('a') if 'players' in a['href']
        ]

        team_year = url.split('/')[-1].replace('.html', '')
        champ_player_links[team_year] = player_links

    return champ_player_links

# Scrape player links
champ_player_links = scrape_champ_player_links()

# Example: Print links for 2024 champion players
print(champ_player_links.get('2024', []))


['https://www.basketball-reference.com/players/b/bantoda01.html', 'https://www.basketball-reference.com/players/b/brissos01.html', 'https://www.basketball-reference.com/players/b/brownja02.html', 'https://www.basketball-reference.com/players/d/davisjd01.html', 'https://www.basketball-reference.com/players/h/hausesa01.html', 'https://www.basketball-reference.com/players/h/holidjr01.html', 'https://www.basketball-reference.com/players/h/horfoal01.html', 'https://www.basketball-reference.com/players/k/kornelu01.html', 'https://www.basketball-reference.com/players/m/mykhasv01.html', 'https://www.basketball-reference.com/players/p/peterdr01.html', 'https://www.basketball-reference.com/players/p/porzikr01.html', 'https://www.basketball-reference.com/players/p/pritcpa01.html', 'https://www.basketball-reference.com/players/q/quetane01.html', 'https://www.basketball-reference.com/players/s/sprinja01.html', 'https://www.basketball-reference.com/players/s/stevela01.html', 'https://www.basketball-

In [43]:
## Clean Data

In [47]:
# Extracting the first 60 player links from each year using NBAScraper data
links_60 = {year: scraper.player_links[i][:60] for i, year in enumerate([2024, 2023, 2022, 2021, 2020])}

# Combining all the first 60 player links
all_60players = sum(links_60.values(), [])

# Combining with champion team player links
champion_links = [
    champ_player_links.get(str(year), []) for year in years
]
all_links = all_60players + sum(champion_links, [])

# Removing duplicate links
unique_players_links = list(set(all_links))

# Final result
print(len(unique_players_links))


193


In [49]:
## natije migirm kole bazikon hayi ke ma bahashoon kar darim 193 ta bazikon haatsand!

In [53]:
## hala shoro be scrap kardane in ha mikonim!
## in code beyne hardota bazikon 2s time sleep darad va bad azinke har 50 ta ro scrap mikone 20s time sleep mide!

In [52]:
class PlayerScraper:
    def __init__(self, url):
        self.url = url
        self.soup = None
        self.data = {}

    def fetch_data(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        response = requests.get(self.url, headers=headers)
        if response.status_code == 200:
            self.soup = BeautifulSoup(response.text, 'html.parser')
        else:
            raise Exception(f"Failed to fetch data. Status code: {response.status_code}")

    def extract_name_and_id(self):
        name_tag = self.soup.find('h1').find('span')
        self.data['Name'] = name_tag.text.strip() if name_tag else "Not Listed"
        self.data['P_ID'] = self.url.split('/')[-1].replace('.html', '')

    def extract_team_and_id(self):
        team_tag = self.soup.find('strong', string='Team')
        self.data['Team'] = team_tag.find_next('a').text.strip() if team_tag else "Not Listed"
        team_link = team_tag.find_next('a')['href'] if team_tag and team_tag.find_next('a') else "No link available"
        self.data['T_ID'] = team_link.split('/')[-2] if team_link != "No link available" else "Not Listed"

    def extract_physical_stats(self):
        height, weight = "None", "None"
        for tag in self.soup.find_all('p'):
            text = tag.get_text(strip=True)
            match = re.search(r'\((\d+)cm,\s*(\d+)kg\)', text)
            if match:
                height, weight = int(match.group(1)), int(match.group(2))
                break
        self.data['H(cm)'] = height
        self.data['W(Kg)'] = weight

    def extract_age(self):
        age_tag = self.soup.find('span', {'id': 'necro-birth'})
        if age_tag:
            birth_date = age_tag.get('data-birth')
            if birth_date:
                birth_year = int(birth_date.split('-')[0])
                current_year = datetime.now().year
                self.data['Age'] = current_year - birth_year
                return
        self.data['Age'] = "None"

    def extract_experience(self):
        experience_tag = self.soup.find('strong', string='Experience:')
        experience_years = experience_tag.next_sibling.strip() if experience_tag else "0"
        experience_years = experience_years.replace('\xa0', ' ').strip()  # Remove nbsp
        if "years" in experience_years.lower():
            experience_years = experience_years.split()[0]
        self.data['Exp_Yrs'] = int(experience_years) if experience_years.isdigit() else 0

    def extract_entry_age(self):
        age = self.data.get('Age', "None")
        experience = self.data.get('Exp_Yrs', 0)
        if age != "None" and experience != "None" and isinstance(age, int) and isinstance(experience, int):
            self.data['Entry_Age'] = age - experience
        else:
            self.data['Entry_Age'] = "None"

class BasicPlayerScraper:
    def __init__(self, url):
        self.scraper = PlayerScraper(url)

    def scrape_basic_info(self):
        self.scraper.fetch_data()
        self.scraper.extract_name_and_id()
        self.scraper.extract_team_and_id()
        self.scraper.extract_physical_stats()
        self.scraper.extract_age()
        self.scraper.extract_experience()
        self.scraper.extract_entry_age()
        return {
            'Name': self.scraper.data.get('Name', "None"),
            'P_ID': self.scraper.data.get('P_ID', "None"),
            'Team': self.scraper.data.get('Team', "None"),
            'T_ID': self.scraper.data.get('T_ID', "None"),
            'H(cm)': self.scraper.data.get('H(cm)', "None"),
            'W(Kg)': self.scraper.data.get('W(Kg)', "None"),
            'Entry_Age': self.scraper.data.get('Entry_Age', "None"),
        }

def process_player_links(player_links, initial_sleep_time=2, max_retries=5):
    players_data = []
    sleep_time = initial_sleep_time

    for index, url in enumerate(player_links):
        retries = 0
        while retries < max_retries:
            try:
                print(f"Processing {index + 1}/{len(player_links)}: {url}")
                scraper = BasicPlayerScraper(url)
                player_data = scraper.scrape_basic_info()
                players_data.append(player_data)
                time.sleep(sleep_time)

                # Check if we've processed 50 players, then pause for 40 seconds
                if (index + 1) % 50 == 0:
                    print("Pausing for 20 seconds...")
                    time.sleep(20)
                
                break
            except Exception as e:
                if "429" in str(e):
                    retries += 1
                    sleep_time += 2  # Increase sleep time dynamically
                    print(f"429 Too Many Requests. Retrying {retries}/{max_retries} after {sleep_time} seconds...")
                    time.sleep(sleep_time)
                else:
                    print(f"Error processing {url}: {e}")
                    break

    # Save to CSV
    file_name = "players_data.csv"
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Name', 'P_ID', 'Team', 'T_ID', 'H(cm)', 'W(Kg)', 'Entry_Age'])
        writer.writerows([[p['Name'], p['P_ID'], p['Team'], p['T_ID'], p['H(cm)'], p['W(Kg)'], p['Entry_Age']] for p in players_data])

    print(f"Data saved to {file_name}")
    return players_data



process_player_links(unique_players_links)


Processing 1/193: https://www.basketball-reference.com/players/a/aytonde01.html
Processing 2/193: https://www.basketball-reference.com/players/s/simonan01.html
Processing 3/193: https://www.basketball-reference.com/players/m/mykhasv01.html
Processing 4/193: https://www.basketball-reference.com/players/g/georgpa01.html
Processing 5/193: https://www.basketball-reference.com/players/o/oubreke01.html
Processing 6/193: https://www.basketball-reference.com/players/h/howardw01.html
Processing 7/193: https://www.basketball-reference.com/players/h/hardeja01.html
Processing 8/193: https://www.basketball-reference.com/players/w/whiteha01.html
Processing 9/193: https://www.basketball-reference.com/players/k/kornelu01.html
Processing 10/193: https://www.basketball-reference.com/players/b/butleji01.html
Processing 11/193: https://www.basketball-reference.com/players/i/ingrabr01.html
Processing 12/193: https://www.basketball-reference.com/players/c/craigto01.html
Processing 13/193: https://www.basket

[{'Name': 'Deandre Ayton',
  'P_ID': 'aytonde01',
  'Team': 'Portland Trail Blazers',
  'T_ID': 'POR',
  'H(cm)': 213,
  'W(Kg)': 113,
  'Entry_Age': 21},
 {'Name': 'Anfernee Simons',
  'P_ID': 'simonan01',
  'Team': 'Portland Trail Blazers',
  'T_ID': 'POR',
  'H(cm)': 190,
  'W(Kg)': 82,
  'Entry_Age': 20},
 {'Name': 'Svi Mykhailiuk',
  'P_ID': 'mykhasv01',
  'Team': 'Utah Jazz',
  'T_ID': 'UTA',
  'H(cm)': 201,
  'W(Kg)': 92,
  'Entry_Age': 22},
 {'Name': 'Paul George',
  'P_ID': 'georgpa01',
  'Team': 'Philadelphia 76ers',
  'T_ID': 'PHI',
  'H(cm)': 203,
  'W(Kg)': 99,
  'Entry_Age': 21},
 {'Name': 'Kelly Oubre Jr.',
  'P_ID': 'oubreke01',
  'Team': 'Philadelphia 76ers',
  'T_ID': 'PHI',
  'H(cm)': 203,
  'W(Kg)': 92,
  'Entry_Age': 21},
 {'Name': 'Dwight Howard',
  'P_ID': 'howardw01',
  'Team': 'Not Listed',
  'T_ID': 'Not Listed',
  'H(cm)': 208,
  'W(Kg)': 120,
  'Entry_Age': 40},
 {'Name': 'James Harden',
  'P_ID': 'hardeja01',
  'Team': 'Los Angeles Clippers',
  'T_ID': 'LAC

In [54]:
## hala ma kole team hayi ke bahashoon kar darim, team haye hamin 193 ta bazikone

In [55]:
def extract_unique_team_ids(players_data):
    t_ids = list(set([player['T_ID'] for player in players_data if player['T_ID'] != "Not Listed"]))
    return t_ids

# Load data from CSV and extract unique T_IDs
def load_and_extract_t_ids(file_name):
    with open(file_name, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        t_ids = list(set(row['T_ID'] for row in reader if row['T_ID'] != "Not Listed"))
    return t_ids
file_name = "players_data.csv"
unique_team_ids = load_and_extract_t_ids(file_name)
print(unique_team_ids)
print(len(unique_team_ids))

['TOR', 'PHI', 'WAS', 'MIL', 'NYK', 'CHO', 'HOU', 'BOS', 'BRK', 'IND', 'SAC', 'LAC', 'MIA', 'MIN', 'DAL', 'CLE', 'DET', 'SAS', 'DEN', 'CHI', 'NOP', 'ORL', 'GSW', 'MEM', 'ATL', 'OKC', 'LAL', 'UTA', 'PHO', 'POR']
30


In [57]:
## alan faghat kafie in ID haye bala ro be code zir bedim!

In [58]:
class BasketballTeamScraper:
    def __init__(self, T_ID):
        self.T_ID = T_ID
        self.base_url = f"https://www.basketball-reference.com/teams/{self.T_ID}/"
        self.soup = None
        self.team_info = None

    def fetch_data(self):
        """Fetch the webpage content and parse it with BeautifulSoup."""
        response = requests.get(self.base_url)
        if response.status_code == 200:
            self.soup = BeautifulSoup(response.text, 'html.parser')
        else:
            raise Exception(f"Failed to fetch the webpage. Status code: {response.status_code}")

    def extract_team_info(self):
        """Extract information about the team."""
        if not self.soup:
            raise Exception("No HTML content found. Call fetch_data() first.")

        # Extract Location
        location_tag = self.soup.find("strong", string="Location:")
        location = location_tag.next_sibling.strip() if location_tag else "N/A"

        # Extract Team Name
        team_name_tag = self.soup.find("h1").find("span")
        team_name = team_name_tag.text.strip() if team_name_tag else "N/A"

        # Extract Active Seasons and Established Year
        seasons_tag = self.soup.find("strong", string="Seasons:")
        if seasons_tag:
            seasons_text = seasons_tag.parent.get_text(strip=True).replace("Seasons:", "").strip()
            active_seasons = seasons_text.split(";")[0].strip()
            established_year = seasons_text.split(";")[1].strip().split("-")[0]
        else:
            active_seasons = "N/A"
            established_year = "N/A"

        # Extract Playoff Appearances
        playoff_appearances_tag = self.soup.find("strong", string="Playoff Appearances:")
        playoff_appearances = playoff_appearances_tag.next_sibling.strip() if playoff_appearances_tag else "N/A"

        # Extract Championships
        championships_tag = self.soup.find("strong", string="Championships:")
        championships = championships_tag.next_sibling.strip() if championships_tag else "N/A"

        # Store the data as a dictionary
        self.team_info = {
            "Team ID": self.T_ID,
            "Location": location,
            "Team Name": team_name,
            "Active Seasons": active_seasons,
            "Established Year": established_year,
            "Playoff Appearances": playoff_appearances,
            "Championships": championships
        }

    def save_to_csv(self, filename):
        """Save the extracted team info to a CSV file."""
        if not self.team_info:
            raise Exception("No data to save. Call extract_team_info() first.")

        # Convert the dictionary to a DataFrame
        df = pd.DataFrame([self.team_info])
        df.to_csv(filename, index=False)

        return df

# Example Usage
if __name__ == "__main__":
    T_ID = "LAL"  # Example Team ID for Los Angeles Lakers
    scraper = BasketballTeamScraper(T_ID)

    try:
        scraper.fetch_data()
        scraper.extract_team_info()

        # Save to CSV and display as a table
        output_filename = f"{T_ID}_team_info.csv"
        team_info_df = scraper.save_to_csv(output_filename)

        print("Team Information Table:")
        print(team_info_df)

    except Exception as e:
        print("Error:", e)

Team Information Table:
  Team ID                 Location           Team Name Active Seasons  \
0     LAL  Los Angeles, California  Los Angeles Lakers             77   

  Established Year Playoff Appearances Championships  
0             1948                  64            17  


In [60]:
# Initialize an empty list to store all team data
all_team_data = []

# Iterate through each team ID and scrape its data
for T_ID in unique_team_ids:
    scraper = BasketballTeamScraper(T_ID)
    try:
        scraper.fetch_data()
        scraper.extract_team_info()
        all_team_data.append(scraper.team_info)
        print(f"Successfully scraped data for team: {T_ID}")
        
        # Add a delay to slow down the scraping process
        time.sleep(2)  # Wait 5 seconds between each request
    except Exception as e:
        print(f"Error scraping data for team {T_ID}: {e}")

# Save all team data to a CSV file
output_filename = "all_teams_data.csv"
df = pd.DataFrame(all_team_data)
df.to_csv(output_filename, index=False)

print(f"All team data saved to {output_filename}")

Successfully scraped data for team: TOR
Successfully scraped data for team: PHI
Successfully scraped data for team: WAS
Successfully scraped data for team: MIL
Successfully scraped data for team: NYK
Error scraping data for team CHO: 'NoneType' object has no attribute 'find'
Successfully scraped data for team: HOU
Successfully scraped data for team: BOS
Error scraping data for team BRK: 'NoneType' object has no attribute 'find'
Successfully scraped data for team: IND
Successfully scraped data for team: SAC
Successfully scraped data for team: LAC
Successfully scraped data for team: MIA
Successfully scraped data for team: MIN
Successfully scraped data for team: DAL
Successfully scraped data for team: CLE
Successfully scraped data for team: DET
Successfully scraped data for team: SAS
Successfully scraped data for team: DEN
Successfully scraped data for team: CHI
Error scraping data for team NOP: 'NoneType' object has no attribute 'find'
Successfully scraped data for team: ORL
Successfully

In [61]:
## Link of MVP Players

In [62]:
# Finding the NBA MVP page
mvp_page_url = None
for i in s:
    if i.text == 'NBA MVP':
        mvp_page_url = 'https://www.basketball-reference.com' + i['href']
        break

# Fetch the MVP voting page
if mvp_page_url:
    page = requests.get(mvp_page_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Extract the first 5 MVP voting links
    mvp_links = [
        'https://www.basketball-reference.com' + i['href']
        for i in soup.find_all('a') if i.text == 'V'
    ][:5]  # Keep only the first 5 links

# Extract MVP player profile links per year
mvp_player_links = {}

for i, link in enumerate(mvp_links):
    page = requests.get(link, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find player links from the first table
    tables = soup.find_all('tbody')
    if tables:
        mvp_player_links[2024 - i] = [
            'https://www.basketball-reference.com' + a['href']
            for a in tables[0].find_all('a') if 'players' in a['href']
        ]

# Assigning MVP player links for each year
mvp2024_link = mvp_player_links.get(2024, [])
mvp2023_link = mvp_player_links.get(2023, [])
mvp2022_link = mvp_player_links.get(2022, [])
mvp2021_link = mvp_player_links.get(2021, [])
mvp2020_link = mvp_player_links.get(2020, [])

# Print the number of MVP player links found per year
for year in range(2024, 2019, -1):
    print(f"{year}: {len(mvp_player_links.get(year, []))} players")


2024: 9 players
2023: 13 players
2022: 12 players
2021: 15 players
2020: 12 players


## Jordan Scrapper

In [73]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_mvp_tables(urls):
    all_data = []

    for url in urls:
        try:
            # Send a request to the webpage with timeout
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the MVP table
            mvp_table = soup.find('table', {'id': 'mvp'})

            if not mvp_table:
                print(f"MVP table not found on {url}")
                continue

            # Extract all rows from the table
            rows = mvp_table.find_all('tr')

            # Extract data from each row
            for row in rows:
                cols = row.find_all(['th', 'td'])  # Include header cells for correct indexing
                if len(cols) >= 8:  # Ensure enough columns are present
                    rank = cols[0].text.strip()
                    player = cols[1].text.strip()
                    team = cols[3].text.strip()
                    points_won = cols[5].text.strip()
                    points_max = cols[6].text.strip()
                    share = cols[7].text.strip()
                    all_data.append({
                        'Rank': rank,
                        'Player': player,
                        'Tm': team,
                        'Pts Won': points_won,
                        'Pts Max': points_max,
                        'Share': share,
                        'Source URL': url
                    })

            # Add a delay between requests
            time.sleep(2)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    # Save to CSV
    df.to_csv('mvp_data_combined.csv', index=False)

    # Print the extracted data
    print(df)

# Example usage
urls = list(mvp_links)  # Use dynamically extracted MVP voting page URLs
scrape_mvp_tables(urls)


    Rank                   Player   Tm  Pts Won  Pts Max  Share  \
0   Rank                   Player   Tm  Pts Won  Pts Max  Share   
1      1             Nikola Jokić  DEN      926      990  0.935   
2      2  Shai Gilgeous-Alexander  OKC      640      990  0.646   
3      3              Luka Dončić  DAL      566      990  0.572   
4      4    Giannis Antetokounmpo  MIL      192      990  0.194   
..   ...                      ...  ...      ...      ...    ...   
61     8           Damian Lillard  POR       23     1010  0.023   
62     9             Nikola Jokić  DEN       18     1010  0.018   
63    10            Pascal Siakam  TOR       17     1010  0.017   
64    11             Jimmy Butler  MIA        9     1010  0.009   
65    12             Jayson Tatum  BOS        1     1010  0.001   

                                           Source URL  
0   https://www.basketball-reference.com/awards/aw...  
1   https://www.basketball-reference.com/awards/aw...  
2   https://www.basketball-

In [74]:
## Hala man faghat bayad bazikon haye har fasl ro stats eshoono Scrap konam!

In [None]:
def scrape_nba_totals(url, season):
    # Send a request to the webpage
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page for {season}. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Find the table containing player totals
    table = soup.find("table", {"id": "totals_stats"})
    if not table:
        print(f"Table with id 'totals_stats' not found for {season}.")
        return

    # Extract rows from the table
    rows = table.tbody.find_all("tr")
    if not rows:
        print(f"No rows found in the table for {season}.")
        return

    # Initialize an empty list to store player data
    player_data = []

    # Loop through all rows and extract data
    for i, row in enumerate(rows):
        cols = row.find_all("td")
        if cols:  # Skip rows without data
            try:
                player_info = {
                    "Player": cols[0].text.strip(),
                    "Age": cols[1].text.strip(),
                    "Team": cols[2].text.strip(),
                    "Position": cols[3].text.strip(),
                    "Games Played": cols[4].text.strip(),
                    "Games Started": cols[5].text.strip(),
                    "Minutes Played": cols[6].text.strip(),
                    "Field Goals": cols[7].text.strip(),
                    "Field Goal Attempts": cols[8].text.strip(),
                    "Field Goal Percentage": cols[9].text.strip(),
                    "Three Pointers": cols[10].text.strip(),
                    "Three Point Attempts": cols[11].text.strip(),
                    "Three Point Percentage": cols[12].text.strip(),
                    "Two Pointers": cols[13].text.strip(),
                    "Two Point Attempts": cols[14].text.strip(),
                    "Two Point Percentage": cols[15].text.strip(),
                    "Effective Field Goal Percentage": cols[16].text.strip(),
                    "Free Throws": cols[17].text.strip(),
                    "Free Throw Attempts": cols[18].text.strip(),
                    "Free Throw Percentage": cols[19].text.strip(),
                    "Offensive Rebounds": cols[20].text.strip(),
                    "Defensive Rebounds": cols[21].text.strip(),
                    "Total Rebounds": cols[22].text.strip(),
                    "Assists": cols[23].text.strip(),
                    "Steals": cols[24].text.strip(),
                    "Blocks": cols[25].text.strip(),
                    "Turnovers": cols[26].text.strip(),
                    "Personal Fouls": cols[27].text.strip(),
                    "Points": cols[28].text.strip(),
                }
                player_data.append(player_info)
            except IndexError:
                print(f"Skipping row {i + 1} due to missing data for {season}.")

        # Pause for 10 seconds after every 50 players
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1} players for {season}. Pausing for 10 seconds...")
            time.sleep(10)

    # Convert the list to a DataFrame
    df = pd.DataFrame(player_data)

    if df.empty:
        print(f"No data was extracted from the table for {season}.")
        return

    # Save the DataFrame to a CSV file
    filename = f"nba_totals_{season}.csv"
    df.to_csv(filename, index=False)

    print(f"Data extraction for {season} complete. Saved to '{filename}'.")

# Scrape data for seasons dynamically
base_url = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html"

for year in range(2020, 2025):  # Loop through years 2020 to 2024
    scrape_nba_totals(base_url.format(year), str(year))
