In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import numpy as np
from typing import Dict, List, Optional, Tuple
import re
import warnings
warnings.filterwarnings('ignore')

In [None]:
class FBrefBaseScraper:
    """Base scraper class with common functionality.

    Owns one ``requests.Session`` carrying a browser-like User-Agent and the
    site root (``base_url``). Subclasses build page URLs from ``base_url`` and
    use ``get_soup`` / ``extract_table_data`` to turn a page into a DataFrame.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.base_url = "https://fbref.com"

    def get_soup(self, url: str, delay: float = 1.0) -> "Optional[BeautifulSoup]":
        """Fetch *url* and parse it, sleeping *delay* seconds first.

        Args:
            url: Absolute URL to request.
            delay: Seconds to sleep before the request (simple rate limiting).

        Returns:
            The parsed document, or ``None`` when the request or parsing
            failed (the error is printed, not raised).
        """
        # NOTE(review): the site may enforce a stricter request budget than
        # one request per second -- confirm the default delay is sufficient.
        time.sleep(delay)
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not abort a
            # multi-season run, so report the failure and let callers skip it.
            print(f"Error fetching {url}: {e}")
            return None

    @staticmethod
    def _unique_headers(headers: List[str]) -> List[str]:
        """Return *headers* with repeated labels renamed ``X.1``, ``X.2``, ...

        The first occurrence keeps its name. The suffix scheme is the one
        pandas applies to duplicate column names when reading a CSV.
        """
        seen = set()
        repeats = {}
        unique = []
        for name in headers:
            candidate = name
            while candidate in seen:
                repeats[name] = repeats.get(name, 0) + 1
                candidate = f"{name}.{repeats[name]}"
            seen.add(candidate)
            unique.append(candidate)
        return unique

    def extract_table_data(self, soup: "Optional[BeautifulSoup]", table_selector: str) -> pd.DataFrame:
        """Convert one HTML table from *soup* into a DataFrame of strings.

        Table lookup order: the table whose ``id`` equals *table_selector*,
        then the first table with class ``stats_table``, then the first table
        in the document.

        Args:
            soup: Parsed page, or ``None`` (as returned by a failed
                ``get_soup``).
            table_selector: ``id`` attribute of the wanted table.

        Returns:
            One row per body row and one column per header cell, all values
            being the stripped cell text. Empty when *soup* is falsy, no
            table exists, or the table has no header or no usable rows.
            Repeated header texts are made unique (``X``, ``X.1``, ...).
        """
        if not soup:
            return pd.DataFrame()

        # Try different ways to find the table
        table = (
            soup.find('table', {'id': table_selector})
            or soup.find('table', class_='stats_table')
            or soup.find('table')
        )
        if not table:
            return pd.DataFrame()

        # Use the widest header row; scanning bottom-up makes the last row
        # win a tie, so a narrower grouping row above it is ignored.
        headers: List[str] = []
        thead = table.find('thead')
        if thead:
            for row in reversed(thead.find_all('tr')):
                row_headers = [th.get_text(strip=True) for th in row.find_all(['th', 'td'])]
                if len(row_headers) > len(headers):
                    headers = row_headers

        # Collect body rows, skipping spacer rows and header rows repeated
        # inside the body, plus any row with fewer than two cells.
        skipped_classes = {'spacer', 'thead'}
        rows_data = []
        tbody = table.find('tbody')
        if tbody:
            for row in tbody.find_all('tr'):
                if skipped_classes.intersection(row.get('class') or ()):
                    continue
                row_data = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
                if len(row_data) > 1:
                    rows_data.append(row_data)

        if not rows_data or not headers:
            return pd.DataFrame()

        # Pad headers and rows to a common width so the frame is rectangular.
        max_cols = max(len(headers), max(len(row) for row in rows_data))
        headers = headers + [f'Col_{i}' for i in range(len(headers), max_cols)]
        rows_data = [row + [''] * (max_cols - len(row)) for row in rows_data]

        # Duplicate labels make df['name'] ambiguous and stop pd.concat from
        # aligning frames whose column sets differ, so make them unique.
        return pd.DataFrame(rows_data, columns=self._unique_headers(headers))

In [None]:
class SeasonTablesScraper(FBrefBaseScraper):
    """Collects one table per Premier League season over a run of recent seasons."""

    def get_season_urls(self, years_back: int = 10) -> List[Tuple[str, str]]:
        """Return ``(season, url)`` pairs for *years_back* seasons, newest first.

        Seasons are labelled ``"YYYY-YYYY"``; the newest one starts in the
        calendar year before the current one.
        """
        newest_start = datetime.now().year - 1
        pairs = []
        for offset in range(years_back):
            first_year = newest_start - offset
            label = f"{first_year}-{first_year + 1}"
            page_url = f"{self.base_url}/en/comps/9/{label}/stats/{label}-Premier-League-Stats"
            pairs.append((label, page_url))
        return pairs

    def scrape_season_table(self, season: str, url: str) -> pd.DataFrame:
        """Fetch *url* and return its table tagged with *season* and today's date.

        The result is empty when the page could not be fetched or parsed; in
        that case no ``season`` / ``scraped_date`` columns are added.
        """
        print(f"Scraping league table for {season}...")
        table = self.extract_table_data(self.get_soup(url), 'stats_standard_combined')
        if table.empty:
            return table

        table['season'] = season
        table['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return table

    def scrape_all_seasons(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape every season from ``get_season_urls`` and stack the results.

        Seasons that yield an empty frame are dropped; if none remain an
        empty DataFrame is returned.
        """
        scraped = (
            self.scrape_season_table(label, page_url)
            for label, page_url in self.get_season_urls(years_back)
        )
        frames = [frame for frame in scraped if not frame.empty]
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)


# Example usage for Block 2:
if __name__ == "__main__":
    # Scrape league tables only
    table_scraper = SeasonTablesScraper()
    league_tables = table_scraper.scrape_all_seasons(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if league_tables.empty:
        print("No league table records scraped; nothing saved")
    else:
        league_tables.to_csv('league_tables.csv', index=False)
        print(f"Saved {len(league_tables)} league table records")

In [None]:
class FixturesScraper(FBrefBaseScraper):
    """Scraper for match fixtures and results"""

    def get_fixtures_urls(self, years_back: int = 10) -> List[Tuple[str, str]]:
        """Return ``(season, url)`` pairs for *years_back* seasons, newest first.

        Seasons are labelled ``"YYYY-YYYY"``; the newest one starts in the
        calendar year before the current one.
        """
        current_year = datetime.now().year
        fixture_urls = []

        for i in range(years_back):
            season_start = current_year - i - 1
            season = f"{season_start}-{season_start + 1}"
            fixtures_url = f"{self.base_url}/en/comps/9/{season}/schedule/{season}-Premier-League-Scores-and-Fixtures"
            fixture_urls.append((season, fixtures_url))

        return fixture_urls

    def scrape_season_fixtures(self, season: str, url: str) -> pd.DataFrame:
        """Scrape the fixtures table at *url* and tag it with *season*.

        Args:
            season: Season label (``"YYYY-YYYY"``) stored in the ``season``
                column and used to build the first candidate table id.
            url: Fixtures page for that season.

        Returns:
            The fixtures with ``season`` and ``scraped_date`` columns added,
            or an empty DataFrame when the page could not be fetched or no
            candidate produced any rows.
        """
        print(f"Scraping fixtures for {season}...")
        soup = self.get_soup(url)
        if soup is None:
            # Fetch failed (already reported by get_soup); no id can match.
            return pd.DataFrame()

        # Candidate table ids, most specific first.
        # NOTE(review): the season-derived id assumes the schedule table is
        # named sched_<season>_9_1 (9 being the competition id used in the
        # URL) -- confirm against a live page. The remaining ids are the
        # original hard-coded fallbacks.
        table_ids = [
            f'sched_{season}_9_1',
            'sched_ks_17358_1',
            'sched_2025_17358_1',
            'fixtures',
        ]
        df = pd.DataFrame()

        for table_id in table_ids:
            df = self.extract_table_data(soup, table_id)
            if not df.empty:
                break

        if not df.empty:
            df['season'] = season
            df['scraped_date'] = datetime.now().strftime('%Y-%m-%d')

        return df

    def scrape_all_fixtures(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape fixtures for every season and stack them into one frame.

        Seasons that yield an empty frame are dropped; if none remain an
        empty DataFrame is returned.
        """
        all_fixtures = []

        for season, url in self.get_fixtures_urls(years_back):
            fixtures_data = self.scrape_season_fixtures(season, url)
            if not fixtures_data.empty:
                all_fixtures.append(fixtures_data)

        return pd.concat(all_fixtures, ignore_index=True) if all_fixtures else pd.DataFrame()


# Example usage for Block 3:
if __name__ == "__main__":
    # Scrape fixtures only
    fixtures_scraper = FixturesScraper()
    fixtures = fixtures_scraper.scrape_all_fixtures(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if fixtures.empty:
        print("No fixture records scraped; nothing saved")
    else:
        fixtures.to_csv('fixtures.csv', index=False)
        print(f"Saved {len(fixtures)} fixture records")


In [None]:
class ShootingStatsScraper(FBrefBaseScraper):
    """Scraper for the team shooting statistics page of each season."""

    def get_shooting_url(self, season: str) -> str:
        """Return the shooting-stats page URL for *season* (``"YYYY-YYYY"``)."""
        page_name = f"{season}-Premier-League-Stats"
        return f"{self.base_url}/en/comps/9/{season}/shooting/{page_name}"

    def scrape_team_shooting(self, season: str) -> pd.DataFrame:
        """Fetch one season's shooting table, tagged with season and scrape date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping shooting stats for {season}...")
        page = self.get_soup(self.get_shooting_url(season))
        stats = self.extract_table_data(page, 'stats_shooting')
        if stats.empty:
            return stats

        stats['season'] = season
        stats['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return stats

    def scrape_all_shooting_stats(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        newest_start = datetime.now().year - 1
        collected = []
        for offset in range(years_back):
            first_year = newest_start - offset
            frame = self.scrape_team_shooting(f"{first_year}-{first_year + 1}")
            if frame.empty:
                continue
            collected.append(frame)

        if collected:
            return pd.concat(collected, ignore_index=True)
        return pd.DataFrame()


# Example usage for Block 4:
if __name__ == "__main__":
    # Scrape shooting stats only
    shooting_scraper = ShootingStatsScraper()
    shooting_stats = shooting_scraper.scrape_all_shooting_stats(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if shooting_stats.empty:
        print("No shooting stat records scraped; nothing saved")
    else:
        shooting_stats.to_csv('team_shooting_stats.csv', index=False)
        print(f"Saved {len(shooting_stats)} shooting stat records")

In [None]:
class PassingStatsScraper(FBrefBaseScraper):
    """Scraper for the team passing statistics page of each season."""

    def get_passing_url(self, season: str) -> str:
        """Return the passing-stats page URL for *season* (``"YYYY-YYYY"``)."""
        page_name = f"{season}-Premier-League-Stats"
        return f"{self.base_url}/en/comps/9/{season}/passing/{page_name}"

    def scrape_team_passing(self, season: str) -> pd.DataFrame:
        """Fetch one season's passing table, tagged with season and scrape date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping passing stats for {season}...")
        page = self.get_soup(self.get_passing_url(season))
        stats = self.extract_table_data(page, 'stats_passing')
        if not stats.empty:
            stamp = datetime.now().strftime('%Y-%m-%d')
            stats['season'] = season
            stats['scraped_date'] = stamp
        return stats

    def scrape_all_passing_stats(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        this_year = datetime.now().year
        # Season k steps back runs from (this_year - k - 1) to (this_year - k).
        seasons = [f"{this_year - k - 1}-{this_year - k}" for k in range(years_back)]

        frames = []
        for label in seasons:
            frame = self.scrape_team_passing(label)
            if not frame.empty:
                frames.append(frame)

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)


# Example usage for Block 5:
if __name__ == "__main__":
    # Scrape passing stats only
    passing_scraper = PassingStatsScraper()
    passing_stats = passing_scraper.scrape_all_passing_stats(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if passing_stats.empty:
        print("No passing stat records scraped; nothing saved")
    else:
        passing_stats.to_csv('team_passing_stats.csv', index=False)
        print(f"Saved {len(passing_stats)} passing stat records")

In [None]:
class DefensiveStatsScraper(FBrefBaseScraper):
    """Scraper for the team defensive statistics page of each season."""

    def get_defense_url(self, season: str) -> str:
        """Return the defensive-stats page URL for *season* (``"YYYY-YYYY"``)."""
        segments = (
            self.base_url,
            "en/comps/9",
            season,
            "defense",
            f"{season}-Premier-League-Stats",
        )
        return "/".join(segments)

    def scrape_team_defense(self, season: str) -> pd.DataFrame:
        """Fetch one season's defensive table, tagged with season and scrape date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping defensive stats for {season}...")
        target = self.get_defense_url(season)
        parsed = self.get_soup(target)
        result = self.extract_table_data(parsed, 'stats_defense')
        if result.empty:
            return result

        result['season'] = season
        result['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return result

    def scrape_all_defensive_stats(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        end_year = datetime.now().year
        kept = []
        remaining = years_back
        while remaining > 0:
            # end_year is the closing year of the season being scraped.
            frame = self.scrape_team_defense(f"{end_year - 1}-{end_year}")
            if not frame.empty:
                kept.append(frame)
            end_year -= 1
            remaining -= 1

        return pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()


# Example usage for Block 6:
if __name__ == "__main__":
    # Scrape defensive stats only
    defense_scraper = DefensiveStatsScraper()
    defense_stats = defense_scraper.scrape_all_defensive_stats(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if defense_stats.empty:
        print("No defensive stat records scraped; nothing saved")
    else:
        defense_stats.to_csv('team_defensive_stats.csv', index=False)
        print(f"Saved {len(defense_stats)} defensive stat records")

In [None]:
class PossessionStatsScraper(FBrefBaseScraper):
    """Scraper for the team possession statistics page of each season."""

    def get_possession_url(self, season: str) -> str:
        """Return the possession-stats page URL for *season* (``"YYYY-YYYY"``)."""
        page_name = f"{season}-Premier-League-Stats"
        return f"{self.base_url}/en/comps/9/{season}/possession/{page_name}"

    def scrape_team_possession(self, season: str) -> pd.DataFrame:
        """Fetch one season's possession table, tagged with season and scrape date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping possession stats for {season}...")
        document = self.get_soup(self.get_possession_url(season))
        frame = self.extract_table_data(document, 'stats_possession')
        if frame.empty:
            return frame

        frame['season'] = season
        frame['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return frame

    def scrape_all_possession_stats(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        base_year = datetime.now().year
        start_years = [base_year - step - 1 for step in range(years_back)]

        # The generator is consumed lazily, so seasons are still scraped one
        # at a time in newest-first order.
        scraped = (self.scrape_team_possession(f"{year}-{year + 1}") for year in start_years)
        non_empty = [frame for frame in scraped if not frame.empty]

        if non_empty:
            return pd.concat(non_empty, ignore_index=True)
        return pd.DataFrame()


# Example usage for Block 7:
if __name__ == "__main__":
    # Scrape possession stats only
    possession_scraper = PossessionStatsScraper()
    possession_stats = possession_scraper.scrape_all_possession_stats(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if possession_stats.empty:
        print("No possession stat records scraped; nothing saved")
    else:
        possession_stats.to_csv('team_possession_stats.csv', index=False)
        print(f"Saved {len(possession_stats)} possession stat records")

In [None]:
class GoalkeepingStatsScraper(FBrefBaseScraper):
    """Scraper for the goalkeeping statistics page of each season."""

    def get_goalkeeping_url(self, season: str) -> str:
        """Return the goalkeeping-stats page URL for *season* (``"YYYY-YYYY"``)."""
        page_name = f"{season}-Premier-League-Stats"
        return f"{self.base_url}/en/comps/9/{season}/goalkeeping/{page_name}"

    def scrape_team_goalkeeping(self, season: str) -> pd.DataFrame:
        """Fetch one season's goalkeeping table, tagged with season and scrape date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping goalkeeping stats for {season}...")
        page = self.get_soup(self.get_goalkeeping_url(season))
        keeper_stats = self.extract_table_data(page, 'stats_keeper')
        if keeper_stats.empty:
            return keeper_stats

        keeper_stats['season'] = season
        keeper_stats['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return keeper_stats

    def scrape_all_goalkeeping_stats(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        latest_start = datetime.now().year - 1
        frames = []
        for first_year in range(latest_start, latest_start - years_back, -1):
            season_frame = self.scrape_team_goalkeeping(f"{first_year}-{first_year + 1}")
            if season_frame.empty:
                continue
            frames.append(season_frame)

        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)


# Example usage for Block 8:
if __name__ == "__main__":
    # Scrape goalkeeping stats only
    gk_scraper = GoalkeepingStatsScraper()
    gk_stats = gk_scraper.scrape_all_goalkeeping_stats(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if gk_stats.empty:
        print("No goalkeeping stat records scraped; nothing saved")
    else:
        gk_stats.to_csv('team_goalkeeping_stats.csv', index=False)
        print(f"Saved {len(gk_stats)} goalkeeping stat records")

In [None]:
class PlayerStatsScraper(FBrefBaseScraper):
    """Scraper for individual player statistics"""

    def get_player_stats_url(self, season: str) -> str:
        """Return the standard-stats page URL for *season* (``"YYYY-YYYY"``)."""
        page_name = f"{season}-Premier-League-Stats"
        return f"{self.base_url}/en/comps/9/{season}/stats/{page_name}"

    def scrape_player_stats(self, season: str) -> pd.DataFrame:
        """Fetch one season's ``stats_standard`` table, tagged with season and date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping player stats for {season}...")
        page = self.get_soup(self.get_player_stats_url(season))

        # NOTE(review): this is the same page SeasonTablesScraper requests;
        # only the table id differs. If the id is not found, the inherited
        # extract_table_data falls back to the first table it can find, so
        # confirm this really yields player rows.
        players = self.extract_table_data(page, 'stats_standard')
        if players.empty:
            return players

        players['season'] = season
        players['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return players

    def scrape_all_player_stats(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        reference_year = datetime.now().year
        frames = []
        for steps_back in range(years_back):
            opening_year = reference_year - steps_back - 1
            label = "{}-{}".format(opening_year, opening_year + 1)
            season_frame = self.scrape_player_stats(label)
            if not season_frame.empty:
                frames.append(season_frame)

        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame()


# Example usage for Block 9:
if __name__ == "__main__":
    # Scrape player stats only
    player_scraper = PlayerStatsScraper()
    player_stats = player_scraper.scrape_all_player_stats(years_back=3)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if player_stats.empty:
        print("No player stat records scraped; nothing saved")
    else:
        player_stats.to_csv('player_stats.csv', index=False)
        print(f"Saved {len(player_stats)} player stat records")

In [None]:
class TransferDataScraper(FBrefBaseScraper):
    """Scraper for transfer market data"""

    def get_transfers_url(self, season: str) -> str:
        """Return the transfers page URL for *season* (``"YYYY-YYYY"``)."""
        page_name = f"{season}-Premier-League-Transfers"
        return f"{self.base_url}/en/comps/9/{season}/transfers/{page_name}"

    def scrape_season_transfers(self, season: str) -> pd.DataFrame:
        """Fetch one season's ``transfers`` table, tagged with season and date.

        Returns an empty DataFrame, without the extra columns, when the page
        could not be fetched or parsed.
        """
        print(f"Scraping transfers for {season}...")
        page = self.get_soup(self.get_transfers_url(season))
        moves = self.extract_table_data(page, 'transfers')
        if not moves.empty:
            moves['season'] = season
            moves['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
        return moves

    def scrape_all_transfers(self, years_back: int = 10) -> pd.DataFrame:
        """Scrape *years_back* seasons, newest first, and stack the results.

        The newest season starts in the calendar year before the current
        one. Seasons that yield nothing are skipped.
        """
        now_year = datetime.now().year
        labels = [f"{now_year - n - 1}-{now_year - n}" for n in range(years_back)]

        gathered = []
        for label in labels:
            season_moves = self.scrape_season_transfers(label)
            if season_moves.empty:
                continue
            gathered.append(season_moves)

        if not gathered:
            return pd.DataFrame()
        return pd.concat(gathered, ignore_index=True)


# Example usage for Block 10:
if __name__ == "__main__":
    # Scrape transfer data only
    transfer_scraper = TransferDataScraper()
    transfer_data = transfer_scraper.scrape_all_transfers(years_back=5)
    # An empty frame means no season produced data; skip the write so a blank
    # CSV neither replaces an earlier good file nor gets reported as saved
    # (same rule as ComprehensiveDataScraper.save_all_data).
    if transfer_data.empty:
        print("No transfer records scraped; nothing saved")
    else:
        transfer_data.to_csv('transfers.csv', index=False)
        print(f"Saved {len(transfer_data)} transfer records")

In [None]:
class ComprehensiveDataScraper:
    """Facade that owns one instance of every scraper and runs them together."""

    def __init__(self):
        # Core match data
        self.season_tables_scraper = SeasonTablesScraper()
        self.fixtures_scraper = FixturesScraper()
        # Team-level statistics
        self.shooting_scraper = ShootingStatsScraper()
        self.passing_scraper = PassingStatsScraper()
        self.defense_scraper = DefensiveStatsScraper()
        self.possession_scraper = PossessionStatsScraper()
        self.goalkeeping_scraper = GoalkeepingStatsScraper()
        # Optional extras
        self.player_scraper = PlayerStatsScraper()
        self.transfer_scraper = TransferDataScraper()

    def scrape_all_data(
        self,
        years_back: int = 10,
        include_players: bool = True,
        include_transfers: bool = True,
    ) -> Dict[str, pd.DataFrame]:
        """Run every scraper for *years_back* seasons and collect the frames.

        Args:
            years_back: Number of seasons passed to each scraper.
            include_players: Also scrape player stats (key ``player_stats``).
            include_transfers: Also scrape transfers (key ``transfers``).

        Returns:
            Mapping of dataset name to DataFrame, in the order scraped.
        """
        print(f"Starting comprehensive data scraping for last {years_back} seasons...")

        # Each stage is (banner, [(result key, scrape method), ...]); stages
        # and their jobs run strictly in the order listed.
        stages = [
            ("\n=== SCRAPING CORE DATA ===", [
                ('season_tables', self.season_tables_scraper.scrape_all_seasons),
                ('fixtures', self.fixtures_scraper.scrape_all_fixtures),
            ]),
            ("\n=== SCRAPING TEAM STATS ===", [
                ('team_shooting', self.shooting_scraper.scrape_all_shooting_stats),
                ('team_passing', self.passing_scraper.scrape_all_passing_stats),
                ('team_defense', self.defense_scraper.scrape_all_defensive_stats),
                ('team_possession', self.possession_scraper.scrape_all_possession_stats),
                ('team_goalkeeping', self.goalkeeping_scraper.scrape_all_goalkeeping_stats),
            ]),
        ]
        if include_players:
            stages.append(("\n=== SCRAPING PLAYER DATA ===", [
                ('player_stats', self.player_scraper.scrape_all_player_stats),
            ]))
        if include_transfers:
            stages.append(("\n=== SCRAPING TRANSFER DATA ===", [
                ('transfers', self.transfer_scraper.scrape_all_transfers),
            ]))

        collected = {}
        for banner, jobs in stages:
            print(banner)
            for key, scrape in jobs:
                collected[key] = scrape(years_back)
        return collected

    def save_all_data(self, data: Dict[str, pd.DataFrame], output_dir: str = "data/"):
        """Write each non-empty frame in *data* to ``<output_dir>/<name>.csv``.

        The directory is created if missing. Empty frames are skipped
        without writing a file.
        """
        import os

        os.makedirs(output_dir, exist_ok=True)

        for name, frame in data.items():
            if frame.empty:
                continue
            frame.to_csv(os.path.join(output_dir, f"{name}.csv"), index=False)
            print(f"Saved {name}: {len(frame)} records")