## Premier League Data Pipeline

In [3]:
# Definicion de Librerias

import requests
import pandas as pd
from configparser import ConfigParser
from utils import get_data
from deltalake import DeltaTable,write_deltalake
from datetime import datetime,timedelta
import os
import pyarrow as pa
import json


In [4]:
# Configuration: API credentials and base URL
config_path = 'pipeline.conf'
parser = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail early with a clear error instead of hitting a
# NoSectionError on the next line.
if not parser.read(config_path):
    raise FileNotFoundError(f"Configuration file not found: {config_path}")
# The API key is the value of the first entry in the [football-data] section.
api_key = parser.items('football-data')[0][1]
headers = {'X-AUTH-TOKEN': api_key}

url_base = 'https://api.football-data.org/v4/'

In [6]:
# Connectivity check: display the HTTP status code returned by the matches endpoint
requests.get(url_base + 'matches', headers=headers).status_code

200

### Extraccion full de los equipos de la **Premier League** de la temporada 2024/2025

In [8]:
# Request the Premier League teams from the competition's teams endpoint
endpoint_teams = 'competitions/PL/teams'
data_PL = get_data(url_base, endpoint_teams, headers=headers)

# Display the top-level keys of the returned payload
data_PL.keys()

dict_keys(['count', 'filters', 'competition', 'season', 'teams'])

In [9]:
# Keep only the list of teams and inspect the fields available on the first one
teams_data = data_PL['teams']
teams_data[0].keys()

dict_keys(['area', 'id', 'name', 'shortName', 'tla', 'crest', 'address', 'website', 'founded', 'clubColors', 'venue', 'runningCompetitions', 'coach', 'squad', 'staff', 'lastUpdated'])

In [10]:
# Build one flat record per team with the fields we want to keep
teams = []
for team in teams_data:
    coach_info = team.get('coach') or {}
    # Join only the name parts that are present, so a missing first or last
    # name does not end up as the literal text "None" in the table.
    # The name is built without nesting quotes inside an f-string, which is
    # only valid syntax from Python 3.12 on.
    name_parts = (coach_info.get('firstName'), coach_info.get('lastName'))
    coach = ' '.join(part for part in name_parts if part) or None
    teams.append(dict(id=team['id'],
                      team=team['name'],
                      Stadium=team['venue'],
                      dt=coach,
                      lastUpdated=team['lastUpdated']
                      )
                 )

In [11]:
# Column order of the teams table: id, team, Stadium, dt, lastUpdated
columns = ['id', 'team', 'Stadium', 'dt', 'lastUpdated']
df_2024_2025_PL = pd.DataFrame(teams, columns=columns)

# Preview of the DataFrame holding the Premier League teams
df_2024_2025_PL.head()

Unnamed: 0,id,team,Stadium,dt,lastUpdated
0,57,Arsenal FC,Emirates Stadium,Mikel Arteta,2022-02-10T19:48:56Z
1,58,Aston Villa FC,Villa Park,Unai Emery,2022-04-03T16:22:14Z
2,61,Chelsea FC,Stamford Bridge,Enzo Maresca,2022-02-10T19:24:40Z
3,62,Everton FC,Goodison Park,Sean Dyche,2022-02-10T19:47:42Z
4,63,Fulham FC,Craven Cottage,Marco Silva,2024-07-29T17:16:11Z


In [12]:
# Full load: every run replaces the whole teams table
write_deltalake("data_lake/teams", df_2024_2025_PL, mode="overwrite")

In [13]:
# Read the teams table back from the Delta table to check what was written
teams_table = DeltaTable("data_lake/teams")
teams_dt = teams_table.to_pandas()
teams_dt.head()

Unnamed: 0,id,team,Stadium,dt,lastUpdated
0,57,Arsenal FC,Emirates Stadium,Mikel Arteta,2022-02-10T19:48:56Z
1,58,Aston Villa FC,Villa Park,Unai Emery,2022-04-03T16:22:14Z
2,61,Chelsea FC,Stamford Bridge,Enzo Maresca,2022-02-10T19:24:40Z
3,62,Everton FC,Goodison Park,Sean Dyche,2022-02-10T19:47:42Z
4,63,Fulham FC,Craven Cottage,Marco Silva,2024-07-29T17:16:11Z


### Extraccion full de las plantillas de los equipos de la premier league de la temporada 2024/2025

Todavía no tienen un uso definido, pero sirven como datos estáticos (o de actualización poco frecuente), ya que las plantillas solo cambian en el mercado de invierno y en el mercado de verano.

In [14]:
# Keep only the fields we need for every player of every team
squads = [
    dict(
        id_team=team['id'],
        squad=[
            dict(
                id=player_data['id'],
                name=player_data['name'],
                position=player_data['position'],
                nationality=player_data['nationality'],
            )
            for player_data in team['squad']
        ],
    )
    for team in teams_data
]

# Each entry of this list holds (team id, list of players);
# show two players of the first team as a sample
squads[0]['squad'][5:7]

[{'id': 112948,
  'name': 'Fabio Vieira',
  'position': 'Attacking Midfield',
  'nationality': 'Portugal'},
 {'id': 99813,
  'name': 'Bukayo Saka',
  'position': 'Right Winger',
  'nationality': 'England'}]

In [15]:
# One row per team: the team id plus the list of its players
df_squads = pd.DataFrame(squads)
df_squads.head(10)


Unnamed: 0,id_team,squad
0,57,"[{'id': 4832, 'name': 'David Raya', 'position'..."
1,58,"[{'id': 3141, 'name': 'Emiliano Martínez', 'po..."
2,61,"[{'id': 3189, 'name': 'Kepa Arrizabalaga', 'po..."
3,62,"[{'id': 3309, 'name': 'Jordan Pickford', 'posi..."
4,63,"[{'id': 3174, 'name': 'Bernd Leno', 'position'..."
5,64,"[{'id': 1795, 'name': 'Alisson', 'position': '..."
6,65,"[{'id': 3222, 'name': 'Ederson', 'position': '..."
7,66,"[{'id': 7544, 'name': 'André Onana', 'position..."
8,67,"[{'id': 3310, 'name': 'Nick Pope', 'position':..."
9,73,"[{'id': 3086, 'name': 'Guglielmo Vicario', 'po..."


In [16]:
# Flatten the nested records so that every player becomes one row carrying its id_team
df_squads = pd.json_normalize(squads, record_path='squad', meta='id_team')
df_squads.head()

Unnamed: 0,id,name,position,nationality,id_team
0,4832,David Raya,Goalkeeper,Spain,57
1,5530,Aaron Ramsdale,Goalkeeper,England,57
2,153843,Karl Jakob Hein,Goalkeeper,Estonia,57
3,147286,Jakub Kiwior,Left-Back,Poland,57
4,133512,Riccardo Calafiori,Centre-Back,Italy,57


In [48]:
# Persist the flattened squads, replacing any previous contents of the table
write_deltalake("data_lake/squads", df_squads, mode="overwrite")

### Extraccion full de los partidos de la premier league de la temporada 2024/2025
Para tener un dataset completo de la temporada 2024/2025 de la Premier League, se extraen todos los partidos de la temporada.

In [18]:
# Request the matches of the competition and show the first one
endpoint_matches = 'competitions/PL/matches'
data_matches = get_data(url_base, endpoint_matches, headers=headers)
data_matches['matches'][0]


{'area': {'id': 2072,
  'name': 'England',
  'code': 'ENG',
  'flag': 'https://crests.football-data.org/770.svg'},
 'competition': {'id': 2021,
  'name': 'Premier League',
  'code': 'PL',
  'type': 'LEAGUE',
  'emblem': 'https://crests.football-data.org/PL.png'},
 'season': {'id': 2287,
  'startDate': '2024-08-16',
  'endDate': '2025-05-25',
  'currentMatchday': 1,
  'winner': None},
 'id': 497410,
 'utcDate': '2024-08-16T19:00:00Z',
 'status': 'TIMED',
 'matchday': 1,
 'stage': 'REGULAR_SEASON',
 'group': None,
 'lastUpdated': '2024-07-23T10:21:24Z',
 'homeTeam': {'id': 66,
  'name': 'Manchester United FC',
  'shortName': 'Man United',
  'tla': 'MUN',
  'crest': 'https://crests.football-data.org/66.png'},
 'awayTeam': {'id': 63,
  'name': 'Fulham FC',
  'shortName': 'Fulham',
  'tla': 'FUL',
  'crest': 'https://crests.football-data.org/63.png'},
 'score': {'winner': None,
  'duration': 'REGULAR',
  'fullTime': {'home': None, 'away': None},
  'halfTime': {'home': None, 'away': None}},


In [19]:
# One flat record per match: matchday, id, status, kickoff date,
# home/away team ids and full-time goals
matches_data = pd.DataFrame(
    [
        {
            'matchday': match['matchday'],
            'id': match['id'],
            'status': match['status'],
            'date': match['utcDate'],
            'home': match['homeTeam']['id'],
            'away': match['awayTeam']['id'],
            'goals_home': match['score']['fullTime']['home'],
            'goals_away': match['score']['fullTime']['away'],
        }
        for match in data_matches['matches']
    ]
)
matches_data.head()


Unnamed: 0,matchday,id,status,date,home,away,goals_home,goals_away
0,1,497410,TIMED,2024-08-16T19:00:00Z,66,63,,
1,1,497411,TIMED,2024-08-17T11:30:00Z,349,64,,
2,1,497412,TIMED,2024-08-17T14:00:00Z,57,76,,
3,1,497413,TIMED,2024-08-17T14:00:00Z,62,397,,
4,1,497414,TIMED,2024-08-17T14:00:00Z,67,340,,


In [20]:
# Matches without a result come back with empty scores; store them as 0-0.
# Only the two goal columns are filled, and they are converted to numbers
# first so the result is an explicit int64 column instead of depending on
# fillna changing the dtype of an object column implicitly.
for goal_column in ('goals_home', 'goals_away'):
    matches_data[goal_column] = (
        pd.to_numeric(matches_data[goal_column]).fillna(0).astype('int64')
    )
matches_data

  matches_data.fillna(0,inplace=True)


Unnamed: 0,matchday,id,status,date,home,away,goals_home,goals_away
0,1,497410,TIMED,2024-08-16T19:00:00Z,66,63,0,0
1,1,497411,TIMED,2024-08-17T11:30:00Z,349,64,0,0
2,1,497412,TIMED,2024-08-17T14:00:00Z,57,76,0,0
3,1,497413,TIMED,2024-08-17T14:00:00Z,62,397,0,0
4,1,497414,TIMED,2024-08-17T14:00:00Z,67,340,0,0
...,...,...,...,...,...,...,...,...
375,38,497785,TIMED,2025-05-25T15:00:00Z,67,62,0,0
376,38,497786,TIMED,2025-05-25T15:00:00Z,351,61,0,0
377,38,497787,TIMED,2025-05-25T15:00:00Z,340,57,0,0
378,38,497788,TIMED,2025-05-25T15:00:00Z,73,397,0,0


In [21]:

# Full load of every match of the season, replacing the previous table contents
write_deltalake("data_lake/all_matches", matches_data, mode="overwrite")

In [22]:
# Delta table with all the Premier League matches, read back as a DataFrame
all_matches_table = DeltaTable("data_lake/all_matches")
all_matches_data_pq = all_matches_table.to_pandas()
all_matches_data_pq

Unnamed: 0,matchday,id,status,date,home,away,goals_home,goals_away
0,1,497410,TIMED,2024-08-16T19:00:00Z,66,63,0,0
1,1,497411,TIMED,2024-08-17T11:30:00Z,349,64,0,0
2,1,497412,TIMED,2024-08-17T14:00:00Z,57,76,0,0
3,1,497413,TIMED,2024-08-17T14:00:00Z,62,397,0,0
4,1,497414,TIMED,2024-08-17T14:00:00Z,67,340,0,0
...,...,...,...,...,...,...,...,...
375,38,497785,TIMED,2025-05-25T15:00:00Z,67,62,0,0
376,38,497786,TIMED,2025-05-25T15:00:00Z,351,61,0,0
377,38,497787,TIMED,2025-05-25T15:00:00Z,340,57,0,0
378,38,497788,TIMED,2025-05-25T15:00:00Z,73,397,0,0


## Extraccion incremental de los partidos de la premier league de la temporada 2024/2025
El archivo metadata.json guarda la fecha de la última carga de partidos finalizados, para poder hacer una extracción incremental de los partidos de la Premier League de la temporada 2024/2025.

In [23]:
# Helpers to read and update the pipeline metadata

def update_last_update_in_json(path, new_value):
    """Overwrite the JSON file at ``path`` with ``new_value``.

    Args:
        path: Location of the metadata file to write.
        new_value: JSON-serializable value to store as the file's content.

    Raises:
        TypeError: If ``new_value`` cannot be serialized to JSON. In that case
            the existing file is left untouched.
    """
    # Serialize before opening: opening with 'w' truncates the file, so a
    # serialization error raised afterwards would wipe the stored value.
    serialized = json.dumps(new_value)
    with open(path, 'w') as file:
        file.write(serialized)

def get_metadata_from_json(path):
    """Return the value stored in the JSON file at ``path``.

    Args:
        path: Location of the metadata file to read.

    Returns:
        The decoded JSON content of the file.
    """
    with open(path) as metadata_file:
        raw_content = metadata_file.read()
    return json.loads(raw_content)

# To reset the stored date to the day before the season starts, run:
# update_last_update_in_json("metadata/metadata.json","2024-08-15T00:00:00Z")

# The value is read into a variable first: reusing double quotes inside a
# double-quoted f-string is only valid syntax from Python 3.12 on.
last_update = get_metadata_from_json("metadata/metadata.json")
print(f"Ultima actualizacion {last_update}")

Ultima actualizacion 2024-08-15T00:00:00Z


In [24]:
# RUN THIS CELL TO REFRESH THE DATA

# Request the matches between the date stored in metadata.json and dateTo.
# Only the year-month-day part of the stored timestamp is sent as dateFrom.
last_date = get_metadata_from_json("metadata/metadata.json")[:10]

# WARNING: dateTo should be datetime.now().strftime('%Y-%m-%d'), but the season
# had not started yet, so a simulated date is used for testing. Matches from
# dateFrom up to dateTo are loaded.
params = dict(dateFrom=last_date, dateTo="2024-09-12")

played_matches = get_data(url_base, endpoint_matches, headers=headers, params=params)
played_matches

{'filters': {'season': '2024'},
 'resultSet': {'count': 30,
  'first': '2024-08-16',
  'last': '2024-09-01',
  'played': 0},
 'competition': {'id': 2021,
  'name': 'Premier League',
  'code': 'PL',
  'type': 'LEAGUE',
  'emblem': 'https://crests.football-data.org/PL.png'},
 'matches': [{'area': {'id': 2072,
    'name': 'England',
    'code': 'ENG',
    'flag': 'https://crests.football-data.org/770.svg'},
   'competition': {'id': 2021,
    'name': 'Premier League',
    'code': 'PL',
    'type': 'LEAGUE',
    'emblem': 'https://crests.football-data.org/PL.png'},
   'season': {'id': 2287,
    'startDate': '2024-08-16',
    'endDate': '2025-05-25',
    'currentMatchday': 1,
    'winner': None},
   'id': 497410,
   'utcDate': '2024-08-16T19:00:00Z',
   'status': 'TIMED',
   'matchday': 1,
   'stage': 'REGULAR_SEASON',
   'group': None,
   'lastUpdated': '2024-07-23T10:21:24Z',
   'homeTeam': {'id': 66,
    'name': 'Manchester United FC',
    'shortName': 'Man United',
    'tla': 'MUN',
    

In [25]:
# Flatten the matches returned by the incremental request.
# The columns are passed explicitly so the frame keeps its layout (and the
# 'date' column exists) even when the request returns no matches.
match_columns = ['matchday', 'id', 'status', 'date', 'home', 'away', 'goals_home', 'goals_away']
played_matches_df = pd.DataFrame(
    data=[
        dict(matchday=match['matchday'],
             id=match['id'],
             status=match['status'],
             date=match['utcDate'],
             home=match['homeTeam']['id'],
             away=match['awayTeam']['id'],
             goals_home=match['score']['fullTime']['home'],
             goals_away=match['score']['fullTime']['away'])
        for match in played_matches['matches']
    ],
    columns=match_columns,
)

# DISABLED FOR TESTING: matches should only be loaded once status == "FINISHED"
# played_matches_df = played_matches_df.query('status == "FINISHED"')

# Only move the stored date forward when the request returned something;
# otherwise max() of an empty column would overwrite it with NaN.
# NOTE(review): the date is persisted before the merge into the Delta table
# runs in a later cell; if that merge fails, some of these matches may not be
# requested again. Consider moving this call after the merge.
if not played_matches_df.empty:
    update_last_update_in_json("metadata/metadata.json", played_matches_df['date'].max())

# Matches without a result have empty scores; store them as 0 and make the
# goal columns explicit int64 so they agree with the played_matches schema.
for goal_column in ('goals_home', 'goals_away'):
    played_matches_df[goal_column] = (
        pd.to_numeric(played_matches_df[goal_column]).fillna(0).astype('int64')
    )

# Matches fetched by this run
played_matches_df.head()


  played_matches_df.fillna(0,inplace=True)


Unnamed: 0,matchday,id,status,date,home,away,goals_home,goals_away
0,1,497410,TIMED,2024-08-16T19:00:00Z,66,63,0,0
1,1,497411,TIMED,2024-08-17T11:30:00Z,349,64,0,0
2,1,497412,TIMED,2024-08-17T14:00:00Z,57,76,0,0
3,1,497413,TIMED,2024-08-17T14:00:00Z,62,397,0,0
4,1,497414,TIMED,2024-08-17T14:00:00Z,67,340,0,0


In [26]:
played_matches_df['date'].max() # Fecha maxima deL df, osea de los partidos jugados

'2024-09-01T14:00:00Z'

In [27]:
played_matches_df.head()

Unnamed: 0,matchday,id,status,date,home,away,goals_home,goals_away
0,1,497410,TIMED,2024-08-16T19:00:00Z,66,63,0,0
1,1,497411,TIMED,2024-08-17T11:30:00Z,349,64,0,0
2,1,497412,TIMED,2024-08-17T14:00:00Z,57,76,0,0
3,1,497413,TIMED,2024-08-17T14:00:00Z,62,397,0,0
4,1,497414,TIMED,2024-08-17T14:00:00Z,67,340,0,0


<p style="color:red;">NO VOLVER A EJECUTAR LA CELDA DE ABAJO SI YA SE CREO LA TABLA</p>

In [28]:

# Empty DataFrame with the columns of the played_matches table
empty_df = pd.DataFrame(columns=['matchday','id','status','date','home','away','goals_home','goals_away'])

# deltalake did not allow creating a table from an empty DataFrame, so the
# table is created from an empty Arrow table with an explicit schema instead
# (ChatGPT helped with the logic).
schema = pa.schema([
    pa.field('matchday',pa.int64()),
    pa.field('id',pa.int64()),
    pa.field('status',pa.string()),
    pa.field('date',pa.string()),
    pa.field('home',pa.int64()),
    pa.field('away',pa.int64()),
    pa.field('goals_home',pa.int64()),
    pa.field('goals_away',pa.int64())
])

# One empty array per schema field
empty_df_pa = pa.Table.from_arrays([[]] * len(schema), schema=schema)

# Create the Delta table partitioned by matchday; it is empty at creation time.
# The write is skipped when the table already has a transaction log, so
# re-running this cell does not wipe matches that were already loaded.
played_matches_path = "data_lake/played_matches"
if os.path.isdir(os.path.join(played_matches_path, "_delta_log")):
    print(f"La tabla {played_matches_path} ya existe, no se vuelve a crear")
else:
    write_deltalake(
        played_matches_path,
        data=empty_df_pa,
        mode="overwrite",
        partition_by=["matchday"]
    )

In [29]:
# Insert into the played_matches table the fetched matches whose id is not there yet
dt = DeltaTable("data_lake/played_matches")

# preserve_index=False keeps the DataFrame index out of the Arrow table.
# With a filtered frame (e.g. once the FINISHED filter is enabled) the index is
# no longer a plain range and would otherwise be serialized as an extra column
# that the target table does not have.
new_data_pa = pa.Table.from_pandas(played_matches_df, preserve_index=False)
(
      dt.merge(
          source=new_data_pa,
          source_alias="source",
          target_alias="target",
          predicate="source.id = target.id"
      )
      # Rows whose id already exists in the target are left as they are
      .when_not_matched_insert_all()
      .execute()
)

{'num_source_rows': 30,
 'num_target_rows_inserted': 30,
 'num_target_rows_updated': 0,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 30,
 'num_target_files_added': 3,
 'num_target_files_removed': 0,
 'execution_time_ms': 39,
 'scan_time_ms': 0,
 'rewrite_time_ms': 14}

In [44]:
# Preview the matches stored for a given matchday
matchday = 2 # example value
partition_dir = f"data_lake/played_matches/matchday={matchday}/"
try:
    # Keep only parquet files and sort them: os.listdir returns names in
    # arbitrary order, so sorting makes the choice of "last" file deterministic.
    ls = sorted(name for name in os.listdir(partition_dir) if name.endswith(".parquet"))
except FileNotFoundError:
    ls = []
    print("No se encontro el directorio")

# Show the last parquet file of the partition. Fall back to an empty frame so
# the cell never displays a stale or undefined df when nothing was found.
if ls:
    df = pd.read_parquet(f"{partition_dir}{ls[-1]}")
else:
    df = pd.DataFrame()

df



Unnamed: 0,id,status,date,home,away,goals_home,goals_away
0,497424,TIMED,2024-08-24T14:00:00Z,63,338,0,0
1,497420,TIMED,2024-08-25T13:00:00Z,1044,67,0,0
2,497425,TIMED,2024-08-25T15:30:00Z,64,402,0,0
3,497428,TIMED,2024-08-24T14:00:00Z,73,62,0,0
4,497421,TIMED,2024-08-24T16:30:00Z,58,57,0,0
5,497422,TIMED,2024-08-24T11:30:00Z,397,66,0,0
6,497423,TIMED,2024-08-24T14:00:00Z,354,563,0,0
7,497429,TIMED,2024-08-25T13:00:00Z,76,61,0,0
8,497426,TIMED,2024-08-24T14:00:00Z,65,349,0,0
9,497427,TIMED,2024-08-24T14:00:00Z,340,351,0,0


In [47]:
# Count the loaded matches through the Delta table, like the other read cells,
# so only the files referenced by the current table version are read. A raw
# pd.read_parquet over the directory reads every parquet file found on disk.
df = DeltaTable("data_lake/played_matches").to_pandas()
print(f"Se cargaron {len(df)} partidos jugados")

Se cargaron 30 partidos jugados
