In [None]:
import requests
import json
import pandas as pd

# Extracting the average

In [122]:
# Define the URL of the page containing the salary data
url = "https://www.capology.com/uk/premier-league/salaries/2022-2023"

# Send a GET request to the URL to retrieve the page's HTML content
req = requests.get(url)

# Extract the text content of the page (HTML source)
page_src = req.text

# Find the starting index of the average salary section in the HTML content
# The salary information is located within a specific element containing 'accounting.formatMoney("'
start_index = page_src.index('$(\'#salaries-avg\').html(accounting.formatMoney("')

# Slice the page source starting from the location of the average salary element
page_src = page_src[start_index:]

# Split the sliced HTML content by double quotes to isolate the salary value
page_src = page_src.split('"')

# Extract the average salary value (second item in the split list) and assign it to 'avg'
avg = page_src[1]

# Print or return the average salary value
avg

'3523096'

# Parsing the Player Table

This code pulls player salary data from a webpage, isolating the relevant section in the HTML. It then splits the data by players and loops through each one to extract details like name, age, position, country, and salary. Each player’s information is stored in a dictionary, which is added to a list. This results in a structured list of player data for easy access and analysis.

In [123]:
# Extract the HTML content of the page as text
page_src = req.text

# Split the page source to isolate the JSON-like data within 'var data = ...'
# This step assumes there's only one occurrence of 'var data =' in the page source
page_src = page_src.split("var data =")[1]

# Further split the result to exclude any trailing characters after the JSON data
# We're aiming to end the data at the first occurrence of ";\n"
page_src = page_src.split(";\n")[0]

# Clean the extracted data by removing the outer square brackets to prepare it for parsing
page_src = page_src.replace("[", "").replace("]", "")

# Display the cleaned data
page_src

# Split the data into individual player records, using '{' as the delimiter
# Each entry in 'players' represents data for one player
players = page_src.split("{")

# Initialize an empty list to store each player's parsed data as dictionaries
players_data = []

# Loop through each player entry in 'players' to parse their information
for player_text in players:
    # Create an empty dictionary to store parsed fields for this player
    parsed_player_data = {}

    # Remove the trailing "}," from the player data for easier parsing
    player_text = player_text.replace("},", "")

    # Replace any double newlines with single newlines to standardize formatting
    player_text = player_text.replace("\n\n", "\n")

    # Split the player's data into individual fields based on newlines
    player_fields = player_text.split("\n")

    # Uncomment the following line to print each player's text if needed
    # print(player_text)

    # Loop through each field in the player's data to extract key-value pairs
    for player_field in player_fields:
        # Skip empty lines
        if len(player_field) == 0:
            continue
        # Extract name, URL, and flag image URL if the field contains 'name'
        elif 'name' in player_field:
            # Get the link
            parsed_player_data['url'] = player_field.split("href='")[1].split("'")[0]
            # Get the flag image URL
            parsed_player_data['flag_img_url'] = player_field.split("<img src='")[1].split("'")[0]
            # Extract the player's name
            parsed_player_data['name'] = player_field.split("loading='lazy'>")[1].split("<")[0]
        # Extract monetary values if the field contains 'accounting.formatMoney'
        elif "accounting.formatMoney" in player_field:
            # Get the field key and corresponding salary value
            field_key = player_field.split("'")[1].split("'")[0]
            value = player_field.split('"')[1].split('"')[0]
            parsed_player_data[field_key] = value
        # Extract position if 'position' is in the field
        elif "position" in player_field:
            parsed_player_data['position'] = player_field.split('"')[1].split('"')[0]
        # Extract age if 'age' is in the field
        elif "age" in player_field:
            parsed_player_data["age"] = player_field.split('"')[1].split('"')[0]
        # Extract country if 'country' is in the field
        elif "country" in player_field:
            parsed_player_data["country"] = player_field.split('"')[1].split('"')[0]

    # Append the parsed player data dictionary to the list of all players' data
    players_data.append(parsed_player_data)

# Display the final parsed data for the players
players_data

[{},
 {'url': '/player/cristiano-ronaldo-31083/',
  'flag_img_url': 'https://capology-e6a3.kxcdn.com/static/images/flags/portugal.svg',
  'name': 'Cristiano Ronaldo',
  'weekly_gross_eur': '31898925',
  'annual_gross_eur': '31898925',
  'adjusted_gross_eur': '31898925',
  'weekly_gross_gbp': '26780000',
  'annual_gross_gbp': '26780000',
  'adjusted_gross_gbp': '26780000',
  'weekly_gross_usd': '34693019',
  'annual_gross_usd': '34693019',
  'adjusted_gross_usd': '34693019',
  'weekly_net_eur': '29766771',
  'annual_net_eur': '29766771',
  'adjusted_net_eur': '29766771',
  'weekly_net_gbp': '24990000',
  'annual_net_gbp': '24990000',
  'adjusted_net_gbp': '24990000',
  'weekly_net_usd': '32374105',
  'annual_net_usd': '32374105',
  'adjusted_net_usd': '32374105',
  'position': 'F',
  'age': '37',
  'country': 'Portugal'},
 {'url': '/player/kevin-de-bruyne-33417/',
  'flag_img_url': 'https://capology-e6a3.kxcdn.com/static/images/flags/belgium.svg',
  'name': 'Kevin De Bruyne',
  'weekly_

# Loading it with Pandas

In [126]:

df = pd.DataFrame(players_data)
df = df[["name", "weekly_gross_eur"]]

df['weekly_gross_eur'] = pd.to_numeric(df["weekly_gross_eur"], errors="coerce")

df = df.query("weekly_gross_eur > 0")
df.to_csv("premier_league_salaries.csv", index=False)

df

Unnamed: 0,name,weekly_gross_eur
1,Cristiano Ronaldo,31898925.0
2,Kevin De Bruyne,24775864.0
3,Erling Haaland,23227372.0
4,David de Gea,23227372.0
5,Casemiro,21678881.0
...,...,...
587,Chem Campbell,154849.0
588,Jordan Smith,154849.0
589,Cody Drameh,154849.0
590,Leo Fuhr Hjelde,123879.0


In [None]:
xa23159@bristol.ac.uk

In [67]:
d = """
Fruit: Apple
Color: Red
Price: 2

Fruit: Orange
Color: Orange
Price: 1
"""

entries = d.split("\n\n")

parsed_data = []
for entry in entries:
    data_rows = entry.split("\n")
    parsed_entry = {}
    for row in data_rows:
        if len(row)==0:
            continue
        k = row.split(":")[0]
        v = row.split(":")[1]
        parsed_entry[k]=v
    parsed_data.append(parsed_entry)
        
parsed_data



[{'Fruit': ' Apple', 'Color': ' Red', 'Price': ' 2'},
 {'Fruit': ' Orange', 'Color': ' Orange', 'Price': ' 1'}]

In [32]:
page_src

'var data = [{\n            \'name\': "<a class=\'firstcol\' href=\'/player/cristiano-ronaldo-31083/\'><img src=\'https://capology-e6a3.kxcdn.com/static/images/flags/portugal.svg\' class=\'table-logo\' height=\'20\' width=\'20\' loading=\'lazy\'>Cristiano Ronaldo</a>",\n            \'weekly_gross_eur\': accounting.formatMoney("31898925"/52, "€ ", 0),\n            \'annual_gross_eur\': accounting.formatMoney("31898925", "€ ", 0),\n            \'adjusted_gross_eur\': accounting.formatMoney("31898925", "€ ", 0),\n            \'weekly_gross_gbp\': accounting.formatMoney("26780000"/52, "£ ", 0),\n            \'annual_gross_gbp\': accounting.formatMoney("26780000", "£ ", 0),\n            \'adjusted_gross_gbp\': accounting.formatMoney("26780000", "£ ", 0),\n            \'weekly_gross_usd\': accounting.formatMoney("34693019"/52, "$ ", 0),\n            \'annual_gross_usd\': accounting.formatMoney("34693019", "$ ", 0),\n            \'adjusted_gross_usd\': accounting.formatMoney("34693019", "$ ",

In [52]:
string = "Hello World fdsfsdf"

string.replace("World", "fdfs ")

'Hello fdfs  fdsfsdf'

In [37]:
string.split("World")

['Hello ', ' fdsfsdf']

In [None]:
"Apples var data = Bannanas, Pears".split("var data =")

['Apples ', ' Bannanas, Pears']

646345

In [17]:
page_src



In [11]:
len(page_src)

2038320

In [5]:
"My name is Finn".split("My name is")

['', ' Finn']