In [1]:
import requests
import random
import gzip
from io import StringIO, BytesIO

from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
user_agent_list = [ 
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 
	'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15', 
]
user_agent = None

for i in range(1, 4):
    user_agent = random.choice(user_agent_list)

In [3]:
headers = {
    'User-Agent': user_agent
}

In [4]:
link_and_description = {
    "https://www.sofascore.com/sitemaps/hi_sitemap_tournaments_football.xml.gz": "Download all the tournaments.",
    "https://www.sofascore.com/api/v1/event/12173473": "Match Details",
    "https://www.sofascore.com/api/v1/event/12173473/statistics": "Consists of both team statistics.",
    "https://www.sofascore.com/api/v1/event/12173473/shotmap": "Shots of both team players.",
    "https://www.sofascore.com/api/v1/event/12173473/average-positions": "Average positions of the player",
    "https://www.sofascore.com/api/v1/event/12173473/lineups": "Team lineup with formations and Player statistics.",
    "https://www.sofascore.com/api/v1/event/fan-rating/ranking/season/53654": "Stages of the Tournament and corresponding Matches",
    "http://api.sofascore.com/api/v1/unique-tournament/17/season/52186/team-events": "Tournament matches.",
    "http://api.sofascore.com/api/v1/category/1/unique-tournaments": "All the tournaments of the country.",
    "/api/v1/event/12173473/player/824200/heatmap": "Heatmap of the player",
    "http://api.sofascore.com/api/v1/sport/football/scheduled-events/2024-05-23": "Scheduled Events based on the date."
}

## Loading In The Tournaments.

In [5]:
req = requests.get('https://www.sofascore.com/sitemaps/hi_sitemap_tournaments_football.xml.gz', headers=headers)
# sitemap = gzip.GzipFile(fileobj=StringIO(req.content)).read()

In [6]:
sitemap = gzip.decompress(req.content)

In [7]:
soup = BeautifulSoup(sitemap, 'lxml')

In [8]:
tournament_list = soup.findAll('xhtml:link')

In [9]:
' '.join(tournament_list[200].attrs['href'].split('/')[-2].split('-')).capitalize()

'Int friendly games women'

In [10]:
tournaments = list()

In [11]:
for tournament in tournament_list:
    tmp_data = {}
    lang = tournament.attrs['hreflang']
    if lang != 'en':
        continue
    a =  tournament.attrs['href'].split('/')
    tmp_data['id'] = a[-1]
    tmp_data['tournament_name'] = ' '.join(a[-2].split('-')).capitalize()
    tmp_data['tournament_slug'] = a[-2]
    tmp_data['tournament_category'] = a[-3]
    tournaments.append(tmp_data)

In [12]:
tournaments

[{'id': '804',
  'tournament_name': 'U20 world championship women',
  'tournament_slug': 'u20-world-championship-women',
  'tournament_category': 'world'},
 {'id': '842',
  'tournament_name': 'Waff championship',
  'tournament_slug': 'waff-championship',
  'tournament_category': 'asia'},
 {'id': '1337',
  'tournament_name': 'International champions cup',
  'tournament_slug': 'international-champions-cup',
  'tournament_category': 'world'},
 {'id': '22527',
  'tournament_name': 'Af guarda 1a divisao',
  'tournament_slug': 'af-guarda-1a-divisao',
  'tournament_category': 'portugal-amateur'},
 {'id': '22536',
  'tournament_name': 'Regionalna liga rs jug',
  'tournament_slug': 'regionalna-liga-rs-jug',
  'tournament_category': 'bosnia-and-herzegovina-amateur'},
 {'id': '22537',
  'tournament_name': 'Af guarda taca distrital 2a divisao',
  'tournament_slug': 'af-guarda-taca-distrital-2a-divisao',
  'tournament_category': 'portugal-amateur'},
 {'id': '18934',
  'tournament_name': 'All russia

## Extracting the Tournament Stages data.

In [None]:
data_keys = {
    'tournament': ['name', 'slug', 'category', 'id' ],
    'uniqueTournament': ['name', 'slug', 'id'],
    'status': ['code', 'description', 'type'],
    'homeTeam': ['name', 'slug', 'shortName', 'gender', 'nameCode', 'id'],
    'awayTeam': ['name', 'slug', 'shortName', 'gender', 'nameCode', 'id'],
    'homeScore': ['current','display', 'period1', 'period2', 'normaltime', 'aggregated'],
    'awayScore': ['current','display', 'period1', 'period2', 'normaltime', 'aggregated'],
}

In [None]:
league_data = []

In [None]:
for event_data in data['events']:
    temp_data = {}
    temp_data['league_stage'] = {
        k: v for k, v in event_data['tournament'].items() if k in data_keys['tournament']}
    temp_data['league'] = {k: v for k, v in event_data['tournament']
                           ['uniqueTournament'].items() if k in data_keys['uniqueTournament']}
    temp_data['league_stage_status'] = {
        k: v for k, v in event_data['status'].items() if k in data_keys['status']}
    temp_data['home_team'] = {
        k: v for k, v in event_data['homeTeam'].items() if k in data_keys['homeTeam']}
    temp_data['away_team'] = {
        k: v for k, v in event_data['awayTeam'].items() if k in data_keys['awayTeam']}
    temp_data['home_score'] = {
        k: v for k, v in event_data['homeScore'].items() if k in data_keys['homeScore']}
    temp_data['away_score'] = {
        k: v for k, v in event_data['awayScore'].items() if k in data_keys['awayScore']}
    temp_data['match_id'] = event_data['id']
    temp_data['start_time_stamp'] = datetime.fromtimestamp(
        event_data['startTimestamp'])
    temp_data['match_slug'] = event_data['slug']
    league_data.append(temp_data)

In [None]:
league_data[0]

In [None]:
league_data

In [None]:
with open("fixtures/country-by-continent.json", 'r') as file:
    data = file.read()
    countries_by_continent = json.loads(data)

In [None]:
continents = {
    "Asia": 0,
    "Africa": 1,
    "Europe": 2,
    "North America": 3,
    "South America": 4,
    "Oceania": 5,
    'Antarctica': 6
}

fixture_data = []
for i, data in enumerate(countries_by_continent):
    tmp_data = dict()
    country, continent = data.values()
    tmp_data['model'] = 'stats.country'
    tmp_data['pk'] = i + 1
    tmp_data['fields'] = {
        'country': country,
        'continent': continents[continent]
    }
    fixture_data.append(tmp_data)

In [None]:
fixture_data