In [1]:
import csv
import datetime
import logging
import time
from io import StringIO

import requests
from bs4 import BeautifulSoup
from google.cloud import dns
from google.cloud import storage
from google.oauth2 import service_account

# League table page; the squad links are scraped from its first stats table.
_STANDINGS_URL = "https://fbref.com/en/comps/20/Bundesliga-Stats"
_SITE_ROOT = "https://fbref.com"
# Upper bound (seconds) for any single HTTP request so the handler cannot hang.
_REQUEST_TIMEOUT_SECONDS = 30
# Pause after each team page request (presumably to stay under the site's
# rate limit -- TODO confirm the required interval).
_REQUEST_DELAY_SECONDS = 20
# Only matches dated within this many days before today are kept.
_LOOKBACK_DAYS = 7
_BUCKET_NAME = 'bundesliga_0410'
_BLOB_NAME = 'matches.csv'
_CSV_HEADER = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'GF', 'GA', 'Opponent', 'Poss', 'Sh', 'Save%']

_logger = logging.getLogger(__name__)


def _fetch_html(url):
    """Return the body of *url* as text; raise on timeout or HTTP error status."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.text


def _team_urls(standings_html):
    """Return absolute squad URLs linked from the first stats table of the page."""
    soup = BeautifulSoup(standings_html, features='html.parser')
    standings_table = soup.select('table.stats_table')[0]
    hrefs = (anchor.get("href") for anchor in standings_table.find_all('a'))
    # Anchors without an href yield None; skip them instead of failing on `in`.
    return [f"{_SITE_ROOT}{href}" for href in hrefs if href and '/squads/' in href]


def _recent_section_rows(page_text, start_marker, stop_markers, today,
                         header_lines, skip_marker=None):
    """Return comma-split rows of one page section dated within the lookback window.

    Lines are captured after a line containing *start_marker* until a line
    containing any of *stop_markers*; marker lines themselves are never kept.
    The first *header_lines* captured lines and the last one are discarded.
    A row is kept only if its first field parses as ``YYYY-MM-DD`` and falls
    in ``(today - _LOOKBACK_DAYS, today]``; rows without a date are skipped.
    """
    captured = []
    capturing = False
    for text_line in page_text.split('\n'):
        if start_marker in text_line:
            capturing = True
        elif any(marker in text_line for marker in stop_markers):
            capturing = False
        elif capturing and (skip_marker is None or skip_marker not in text_line):
            captured.append(text_line)

    window_start = today - datetime.timedelta(days=_LOOKBACK_DAYS)
    recent = []
    for text_line in captured[header_lines:-1]:
        fields = text_line.split(',')
        try:
            played_on = datetime.datetime.strptime(fields[0], '%Y-%m-%d').date()
        except ValueError:
            # Not a data row (blank line, markup, ...): ignore it.
            continue
        if window_start < played_on <= today:
            recent.append([played_on.strftime('%Y-%m-%d')] + fields[1:])
    return recent


def _team_match_rows(page_text, today):
    """Combine the recent scores, shooting and goalkeeping rows of one team page.

    The three row lists are paired positionally; the date column of the
    shooting and goalkeeping rows is dropped before concatenation.
    """
    # NOTE(review): the team page is HTML, so this line/comma based extraction
    # only yields rows if date-prefixed, comma-separated lines really occur
    # between the markers. Confirm against a live page; parsing the tables
    # with BeautifulSoup is probably required to get real match data.
    scores = _recent_section_rows(
        page_text, 'Scores & Fixtures', ('Shooting', 'Goalkeeping'), today,
        header_lines=5,
    )
    shooting = _recent_section_rows(
        page_text, 'Shooting', ('Goalkeeping',), today,
        header_lines=4, skip_marker='op_dif',
    )
    goalkeeping = _recent_section_rows(
        page_text, 'Goalkeeping', ('player_stats_all_comps',), today,
        header_lines=4, skip_marker='op_dif',
    )
    return [
        score_row + shooting_row[1:] + keeper_row[1:]
        for score_row, shooting_row, keeper_row in zip(scores, shooting, goalkeeping)
    ]


def _merge_unique(existing_rows, new_rows):
    """Return existing rows followed by new rows, dropping exact duplicates in order."""
    seen = set()
    merged = []
    for row in [*existing_rows, *new_rows]:
        key = tuple(row)
        if key not in seen:
            seen.add(key)
            merged.append(row)
    return merged


def my_handler(event, context):
    """Scrape recent Bundesliga match rows and merge them into the bucket CSV.

    Fetches the league table page, visits every linked squad page, extracts
    the rows dated within the last seven days, appends the ones not already
    present to ``matches.csv`` in the ``bundesliga_0410`` bucket and uploads
    the result back to the same object.

    Args:
        event: Trigger payload; not used.
        context: Trigger metadata; not used.

    Returns:
        A ``(body, status, headers)`` tuple reporting success.

    Raises:
        requests.RequestException: A page request timed out or returned an
            HTTP error status.
        IndexError: The league page contains no ``table.stats_table``.
    """
    today = datetime.date.today()

    all_matches = []
    for team_url in _team_urls(_fetch_html(_STANDINGS_URL)):
        all_matches.extend(_team_match_rows(_fetch_html(team_url), today))
        time.sleep(_REQUEST_DELAY_SECONDS)

    if not all_matches:
        _logger.warning(
            "No match rows from the last %d days were extracted; "
            "the stored CSV will be re-uploaded without additions.",
            _LOOKBACK_DAYS,
        )

    # Credentials are resolved by the client library (Application Default
    # Credentials); key material must never be embedded in source code.
    client = storage.Client()
    bucket = client.get_bucket(_BUCKET_NAME)
    blob = bucket.blob(_BLOB_NAME)

    # Work in memory: the working directory may not be writable where this runs.
    if blob.exists():
        existing_rows = list(csv.reader(StringIO(blob.download_as_text(), newline='')))
    else:
        # First run: start the file with just the column header.
        existing_rows = [_CSV_HEADER]

    merged_rows = _merge_unique(existing_rows, all_matches)

    buffer = StringIO(newline='')
    csv.writer(buffer).writerows(merged_rows)
    blob.upload_from_string(buffer.getvalue(), content_type='text/csv')

    return "Data uploaded successfully to Google Cloud Storage", 200, {"Content-Type": "text/plain"}


In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import datetime
import time
from google.cloud import storage
from google.oauth2 import service_account
from google.cloud import dns
from io import StringIO

In [3]:
    url = "https://fbref.com/en/comps/20/Bundesliga-Stats"
    # A timeout keeps the cell from hanging if the site stops responding.
    response = requests.get(url, timeout=30)
    # Fail here on an HTTP error instead of parsing an error page below.
    response.raise_for_status()
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all h1 tags
    h1_tags = soup.find_all("h1")
    
    # Take the first h1 that mentions the competition. The for/else raises when
    # none matches, so `line` is never left unbound or stale from an earlier run.
    for h1_tag in h1_tags:
        if "Bundesliga Stats" in h1_tag.text:
            line = h1_tag.text.strip()
            break
    else:
        raise ValueError(f"No 'Bundesliga Stats' heading found at {url}")
    
    # The season's end year is the first word after the first hyphen
    # (presumably the heading reads like "2023-2024 Bundesliga Stats" -- confirm).
    year = line.split("-")[1].split()[0]
    all_matches = []

In [4]:
year

'2024'

In [5]:
    # Download and parse the league table page.
    standings_url = "https://fbref.com/en/comps/20/Bundesliga-Stats"
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text, features='html.parser')
    standings_table = soup.select('table.stats_table')[0]

    # Keep only the squad links found in the first stats table and make them absolute.
    links = [
        href
        for href in (anchor.get("href") for anchor in standings_table.find_all('a'))
        if '/squads/' in href
    ]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Link to the previous season; computed here but not used by the loop below.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    # Visit every team page and collect the text lines that follow a
    # 'Scores & Fixtures' marker, stopping at a 'Shooting' or 'Goalkeeping' marker.
    # The collectors are reset per team, so after the loop they describe the
    # last team only.
    for team_url in team_urls:
        team_name = team_url.rsplit("/", 1)[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        html_data = StringIO(data.text)
        scores_fixtures = []
        is_scores_fixtures = False
        for line in html_data.getvalue().split('\n'):
            if 'Scores & Fixtures' in line:
                is_scores_fixtures = True
                continue
            if any(marker in line for marker in ('Shooting', 'Goalkeeping')):
                is_scores_fixtures = False
                continue
            if is_scores_fixtures:
                scores_fixtures.append(line)

        # Drop the first four captured lines and the last one.
        scores_fixtures = scores_fixtures[4:-1]

        scores_fixtures_data = []

In [6]:
scores_fixtures

['<div id="content" role="main">',
 '',
 '',
 '',
 '          ',
 '      ',
 '      ',
 '      ',
 '      ',
 '',
 '<!-- fs_general_header -->',
 '<div class="adblock">',
 '<!-- div#fs_fs_general_header  -->',
 '<style>',
 '    #srcom   .adblock.primis,',
 '    #content .adblock.primis { height: auto; width: 728px; max-width:100%; aspect-ratio: 1.677419;  margin: auto; }',
 '</style>',
 '<div class="adblock primis">',
 '   <div id="FreeStarVideoAdContainer">',
 '      <div id="freestar-video-parent">',
 '         <div id="freestar-video-child"></div>',
 '      </div>',
 '   </div>',
 '</div>',
 '<!-- /div.#fs_fs_general_header -->',
 '',
 '',
 '</div>',
 '',
 '',
 '      ',
 '         ',
 '            ',
 '<div class="filter">',
 '<h4>2023-2024 Competitions</h4>',
 '<div class="">',
 '\t<a href="/en/squads/6a6967fc/2023-2024/all_comps/Darmstadt-98-Stats-All-Competitions">All Competitions</a>',
 '</div><div class=" current">',
 '\t<a>Bundesliga</a>',
 '</div><div class="">',
 '\t<a href

In [7]:
        # Split every captured line on commas, then discard the first entry.
        # NOTE(review): the captured lines are raw HTML markup, so the comma
        # split does not produce table columns -- the table needs HTML parsing.
        scores_fixtures_data.extend(entry.split(',') for entry in scores_fixtures)

        scores_fixtures_data = scores_fixtures_data[1:]

In [8]:
scores_fixtures_data

[[''],
 [''],
 [''],
 ['          '],
 ['      '],
 ['      '],
 ['      '],
 ['      '],
 [''],
 ['<!-- fs_general_header -->'],
 ['<div class="adblock">'],
 ['<!-- div#fs_fs_general_header  -->'],
 ['<style>'],
 ['    #srcom   .adblock.primis', ''],
 ['    #content .adblock.primis { height: auto; width: 728px; max-width:100%; aspect-ratio: 1.677419;  margin: auto; }'],
 ['</style>'],
 ['<div class="adblock primis">'],
 ['   <div id="FreeStarVideoAdContainer">'],
 ['      <div id="freestar-video-parent">'],
 ['         <div id="freestar-video-child"></div>'],
 ['      </div>'],
 ['   </div>'],
 ['</div>'],
 ['<!-- /div.#fs_fs_general_header -->'],
 [''],
 [''],
 ['</div>'],
 [''],
 [''],
 ['      '],
 ['         '],
 ['            '],
 ['<div class="filter">'],
 ['<h4>2023-2024 Competitions</h4>'],
 ['<div class="">'],
 ['\t<a href="/en/squads/6a6967fc/2023-2024/all_comps/Darmstadt-98-Stats-All-Competitions">All Competitions</a>'],
 ['</div><div class=" current">'],
 ['\t<a>Bundesliga