# Recolección de Datos
## Importar Librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsbombpy import sb
import json 
import html5lib 
import bs4
import time
from bs4 import BeautifulSoup
import requests
import os
import LanusStats as ls
import requests
from difflib import get_close_matches
# One LanusStats client object per data source, created once at module level.
# `fotmob` and `fbref` are used by the download helpers in this notebook; the
# other three are not referenced in the visible cells.
fotmob = ls.FotMob()
fbref = ls.Fbref()
threesixfivescores = ls.ThreeSixFiveScores()
sofascore = ls.SofaScore()
transfermarkt = ls.Transfermarkt()


In [2]:
# Ask LanusStats which league names it accepts for the "Sofascore" source.
ls.get_available_leagues("Sofascore")

['Argentina Liga Profesional',
 'Argentina Copa de la Liga Profesional',
 'Argentina Primera Nacional',
 'Brasileirão Série A',
 'Bolivia Division Profesional',
 'Chile Primera Division',
 'Colombia Primera A Apertura',
 'Colombia Primera A Clausura',
 'Ecuador LigaPro',
 'Mexico LigaMX Apertura',
 'Mexico LigaMX Clausura',
 'Peru Liga 1',
 'Uruguay Primera Division',
 'Venezuela Primera Division',
 'World Cup',
 'Euros',
 'Copa America',
 'Premier League',
 'La Liga',
 'Bundesliga',
 'Serie A',
 'Ligue 1',
 'Copa Libertadores',
 'Copa Sudamericana',
 'MLS',
 'Saudi Pro League',
 'J1 League',
 'NSWL',
 'USL Championship',
 'La Liga 2']

## Funciones Extra

In [3]:
def match_players(df_fbref, df_fotmob, position=None, season=None):
    """Fuzzy-match FBref players to FotMob players and summarise their FBref rows.

    Args:
        df_fbref: DataFrame with at least the columns ``Player``, ``Pos``,
            ``Season`` and ``MP``. It is not modified.
        df_fotmob: DataFrame with at least the columns ``player_name`` and
            ``id``.
        position: Optional category (``'Forward'``, ``'Midfielder'``,
            ``'Defender'`` or ``'Goalkeeper'``). When given, only FBref rows
            whose ``Pos`` maps to that category are kept.
        season: Optional last season (int). When given, only FBref rows whose
            ``Season`` is between 2021 and ``season`` inclusive are kept.

    Returns:
        DataFrame with the columns ``player_name``, ``id``, ``Pos``,
        ``last_season``, ``seasons_played`` and ``matches_played``; one row
        per FBref player that has a FotMob name with a similarity ratio of at
        least 0.8. The columns are present even when nothing matched.
    """
    result_columns = [
        'player_name', 'id', 'Pos',
        'last_season', 'seasons_played', 'matches_played',
    ]
    # Checked in this order, so a multi-position value such as "FW,MF" is a Forward.
    position_codes = (
        ('FW', 'Forward'),
        ('MF', 'Midfielder'),
        ('DF', 'Defender'),
        ('GK', 'Goalkeeper'),
    )

    def map_pos(pos):
        # A missing position arrives as NaN (a float), which does not support
        # the ``in`` operator; treat it as "no category".
        if not isinstance(pos, str):
            return None
        for code, category in position_codes:
            if code in pos:
                return category
        return None

    df_fbref = df_fbref.copy()
    df_fbref['PositionCategory'] = df_fbref['Pos'].apply(map_pos)

    # Filter by position
    if position:
        df_fbref = df_fbref[df_fbref['PositionCategory'] == position]

    # Filter by season
    if season:
        wanted_seasons = {str(year) for year in range(2021, season + 1)}
        df_fbref = df_fbref[df_fbref['Season'].astype(str).isin(wanted_seasons)]

    # Get summary stats per player
    summary_df = df_fbref.groupby('Player').agg(
        last_season=('Season', 'max'),
        seasons_played=('Season', 'nunique'),
        matches_played=('MP', 'sum'),
        Pos=('Pos', 'first')
    ).reset_index()

    # Build the name -> id lookup once instead of scanning df_fotmob for every
    # player. The first id seen for a name wins, and non-string names (NaN /
    # None) are skipped because difflib cannot compare them.
    fotmob_ids = {}
    for name, player_id in zip(df_fotmob['player_name'], df_fotmob['id']):
        if isinstance(name, str) and name not in fotmob_ids:
            fotmob_ids[name] = player_id
    candidate_names = list(fotmob_ids)

    matched_rows = []
    for row in summary_df.itertuples(index=False):
        if not isinstance(row.Player, str):
            continue
        possible_matches = get_close_matches(row.Player, candidate_names, n=1, cutoff=0.8)
        if not possible_matches:
            continue

        matched_name = possible_matches[0]
        matched_rows.append({
            'player_name': matched_name,
            'id': fotmob_ids[matched_name],
            'Pos': row.Pos,
            'last_season': row.last_season,
            'seasons_played': row.seasons_played,
            'matches_played': row.matches_played
        })

    # Passing the columns explicitly keeps the schema stable when nothing matched.
    return pd.DataFrame(matched_rows, columns=result_columns)

## Funciones FBREF

In [4]:
def all_year_team_stats_fbref(stats_list, league, seasons, vs=None):
    """Download FBref team statistics for several seasons, grouped by statistic.

    Args:
        stats_list: Iterable of statistic names passed to
            ``fbref.get_teams_season_stats``.
        league: League name; forwarded to the download call and stored in a
            ``League`` column.
        seasons: Iterable of seasons. Each one is forwarded unchanged to the
            download call; its starting year is stored in a ``Season`` column
            (``2023``, ``"2023"`` and ``"2023-2024"`` all give ``2023``).
        vs: Pass ``"vs"`` to forward ``stats_vs=True`` to the download call.
            Any other value leaves that argument out.

    Returns:
        dict mapping each statistic name to the concatenation of its
        per-season DataFrames. A statistic for which no season could be
        downloaded is left out of the dict.
    """
    def season_start_year(season):
        # int("2023-2024") raises ValueError, so use the part before the
        # first separator when the season is written as a range.
        if isinstance(season, str):
            for separator in ("-", "/"):
                first, found, _ = season.partition(separator)
                if found and first.strip():
                    return int(first)
        return int(season)

    versus = vs == "vs"
    label = " VS" if versus else ""
    # Only pass stats_vs when it was asked for, so the library default
    # applies otherwise.
    extra_kwargs = {"stats_vs": True} if versus else {}

    stat_dataframes = {}

    for stat in stats_list:
        print(f"📊 Descargando estadísticas{label} para: {stat}")
        all_data = []
        for season in seasons:
            print(f"   ⏳ Temporada {season}...")
            # A failing season is reported and skipped so the seasons already
            # downloaded are not lost (same policy as the player download).
            try:
                df = fbref.get_teams_season_stats(
                    stat, league, season, change_columns_names=True, **extra_kwargs
                )
                df["League"] = league
                df["Season"] = season_start_year(season)
                all_data.append(df)
            except Exception as e:
                print(f"   ⚠️ Error en {league} {season} para {stat}: {e}")

        # pd.concat raises on an empty list, so skip statistics with no data.
        if all_data:
            stat_dataframes[stat] = pd.concat(all_data, ignore_index=True)
            print(f"✅ Completado: {stat}{label}\n")
        else:
            print(f"⚠️ No se encontró información para {stat}.\n")

    print("🎯 Descarga finalizada para todas las estadísticas.")
    return stat_dataframes


def save_stats_to_csv(stat_dataframes, league, vs=None, output_path="./"):
    """Write every DataFrame of ``stat_dataframes`` to its own CSV file.

    Files are named ``<stat>_<league>.csv``, or ``<stat>_<league>_vs.csv``
    when ``vs == "vs"``. Spaces in ``league`` become ``_`` and slashes become
    ``-``. ``output_path`` is created if it does not exist. Returns ``None``.
    """
    # Make the league name usable as part of a file name.
    safe_league = league.translate(str.maketrans({" ": "_", "/": "-"}))
    suffix = "_vs" if vs == "vs" else ""

    os.makedirs(output_path, exist_ok=True)

    for stat_name, frame in stat_dataframes.items():
        target = os.path.join(output_path, f"{stat_name}_{safe_league}{suffix}.csv")
        frame.to_csv(target, index=False)
        print(f"✅ Archivo guardado: {target}")








def all_year_player_stats_fbref(stats_list, league, seasons):
    """Download FBref player statistics for several seasons, grouped by statistic.

    Args:
        stats_list: Iterable of statistic names passed to
            ``fbref.get_player_season_stats``.
        league: League name; forwarded to the download call and stored in a
            ``League`` column.
        seasons: Iterable of seasons. Each one is forwarded unchanged to the
            download call; its starting year is stored in a ``Season`` column
            (``2023``, ``"2023"`` and ``"2023-2024"`` all give ``2023``).

    Returns:
        dict mapping each statistic name to the concatenation of its
        per-season DataFrames. Seasons that raise are reported and skipped;
        a statistic with no successful season is left out of the dict.
    """
    def season_start_year(season):
        # int("2023-2024") raises ValueError, which the except below would
        # report as a download error and drop the season. Use the part before
        # the first separator when the season is written as a range.
        if isinstance(season, str):
            for separator in ("-", "/"):
                first, found, _ = season.partition(separator)
                if found and first.strip():
                    return int(first)
        return int(season)

    stat_dataframes = {}

    for stat in stats_list:
        print(f"📊 Descargando estadísticas de jugadores para: {stat}")
        all_data = []
        for season in seasons:
            print(f"   ⏳ Temporada {season}...")
            try:
                df = fbref.get_player_season_stats(stat, league, season)
                df["League"] = league
                df["Season"] = season_start_year(season)
                all_data.append(df)
            except Exception as e:
                # Best effort: keep going so one bad season does not discard
                # the seasons that were already downloaded.
                print(f"   ⚠️ Error en {league} {season} para {stat}: {e}")
        if all_data:
            combined_df = pd.concat(all_data, ignore_index=True)
            stat_dataframes[stat] = combined_df
            print(f"✅ Completado: {stat}\n")
        else:
            print(f"⚠️ No se encontró información para {stat}.\n")

    print("🎯 Descarga finalizada para todas las estadísticas de jugadores.")
    return stat_dataframes


def save_player_stats_to_csv(stat_dataframes, league, output_path="./"):
    """Write every player DataFrame of ``stat_dataframes`` to its own CSV file.

    Files are named ``<stat>_<league>_players.csv``. Spaces in ``league``
    become ``_`` and slashes become ``-``. ``output_path`` is created if it
    does not exist. Returns ``None``.
    """
    # Make the league name usable as part of a file name.
    safe_league = league.translate(str.maketrans({" ": "_", "/": "-"}))

    os.makedirs(output_path, exist_ok=True)

    for stat_name, frame in stat_dataframes.items():
        target = os.path.join(output_path, f"{stat_name}_{safe_league}_players.csv")
        frame.to_csv(target, index=False)
        print(f"✅ Archivo guardado: {target}")


## Funciones Fotmob

In [5]:
# Statistic names the FotMob client lists for teams and for players.
stats_fotmob_teams = list(fotmob.team_possible_stats)
stats_fotmob_players = list(fotmob.player_possible_stats)
# Keep only the entries at positions 5 and 6 of the FotMob league list.
# NOTE(review): presumably the two Argentine competitions named below — confirm.
leagues_fotmob = list(ls.get_available_leagues("Fotmob"))[5:7]
# First four season keys available for each of the two competitions.
seasons_fotmob_liga = list((ls.get_available_season_for_leagues("Fotmob", "Argentina Primera Division"))["seasons"].keys())[0:4]
seasons_fotmob_copa = list((ls.get_available_season_for_leagues("Fotmob", "Argentina Copa de la Liga"))["seasons"].keys())[0:4]
# Bare expressions with no assignment.
# NOTE(review): presumably left for notebook display — confirm they are still needed.
leagues_fotmob[1]
stats_fotmob_players



['goals',
 'goal_assist',
 '_goals_and_goal_assist',
 'rating',
 'goals_per_90',
 'expected_goals',
 'expected_goals_per_90',
 'expected_goalsontarget',
 'ontarget_scoring_att',
 'total_scoring_att',
 'accurate_pass',
 'big_chance_created',
 'total_att_assist',
 'accurate_long_balls',
 'expected_assists',
 'expected_assists_per_90',
 '_expected_goals_and_expected_assists_per_90',
 'won_contest',
 'big_chance_missed',
 'penalty_won',
 'won_tackle',
 'interception',
 'effective_clearance',
 'outfielder_block',
 'penalty_conceded',
 'poss_won_att_3rd',
 'clean_sheet',
 '_save_percentage',
 'saves',
 '_goals_prevented',
 'goals_conceded',
 'fouls',
 'yellow_card',
 'red_card']

In [6]:
def all_year_team_stats_fotmob(stats_list, league, seasons):
    """Download FotMob team statistics for several seasons, grouped by statistic.

    For every statistic in ``stats_list`` and every season in ``seasons`` the
    table is fetched with ``fotmob.get_teams_stats_season``. Each table gets a
    ``League`` column and a ``Season`` column holding the season's first year.
    Seasons that return nothing or raise are reported and skipped.

    Returns:
        dict mapping each statistic name to the concatenation of its
        per-season DataFrames; statistics with no usable season are left out.
    """
    results = {}

    for stat in stats_list:
        print(f"📊 Descargando estadísticas para: {stat}")
        season_frames = []

        for season in seasons:
            try:
                print(f"   ⏳ Temporada {season}...")
                frame = fotmob.get_teams_stats_season(league, season, stat)

                if frame is not None and not frame.empty:
                    frame["League"] = league
                    # "2023/2024"-style seasons are labelled by their first year.
                    is_range = isinstance(season, str) and "/" in season
                    frame["Season"] = int(season.split("/")[0]) if is_range else int(season)
                    season_frames.append(frame)
                else:
                    print(f"   ⚠️ Sin datos para {stat} en temporada {season}.")
            except Exception as err:
                # Best effort: report the failure and move on to the next season.
                print(f"   ❌ Error en {stat} ({season}): {err}")

        if not season_frames:
            print(f"⚠️ Todas las temporadas fallaron para {stat}. Se ignora esta estadística.\n")
            continue

        results[stat] = pd.concat(season_frames, ignore_index=True)
        print(f"✅ Completado: {stat} ({len(season_frames)} temporadas descargadas)\n")

    print("🎯 Descarga finalizada para todas las estadísticas.")
    return results


def all_year_player_stats_fotmob(stats_list, league, seasons):
    """Download FotMob player statistics for several seasons, grouped by statistic.

    For every statistic in ``stats_list`` and every season in ``seasons`` the
    table is fetched with ``fotmob.get_players_stats_season``. Each table gets
    a ``League`` column and a ``Season`` column holding the season's first
    year. Seasons that return nothing or raise are reported and skipped.

    Returns:
        dict mapping each statistic name to the concatenation of its
        per-season DataFrames; statistics with no usable season are left out.
    """
    results = {}

    for stat in stats_list:
        print(f"📊 Descargando estadísticas de jugadores para: {stat}")
        season_frames = []

        for season in seasons:
            try:
                print(f"   ⏳ Temporada {season}...")
                frame = fotmob.get_players_stats_season(league, season, stat)

                if frame is not None and not frame.empty:
                    frame["League"] = league
                    # "2023/2024"-style seasons are labelled by their first year.
                    is_range = isinstance(season, str) and "/" in season
                    frame["Season"] = int(season.split("/")[0]) if is_range else int(season)
                    season_frames.append(frame)
                else:
                    print(f"   ⚠️ Sin datos para {stat} en temporada {season}.")
            except Exception as err:
                # Best effort: report the failure and move on to the next season.
                print(f"   ❌ Error en {stat} ({season}): {err}")

        if not season_frames:
            print(f"⚠️ Todas las temporadas fallaron para {stat}. Se ignora esta estadística.\n")
            continue

        results[stat] = pd.concat(season_frames, ignore_index=True)
        print(f"✅ Completado: {stat} ({len(season_frames)} temporadas descargadas)\n")

    print("🎯 Descarga finalizada para todas las estadísticas de jugadores.")
    return results




## Testing

In [7]:
# Load four player CSVs from disk: the FotMob "rating" table and the FBref
# "stats" table, each for the league and for the cup.
# The paths are absolute Windows paths, so this cell only runs on a machine
# that has this folder layout.
dataframe_fotmob = pd.read_csv(r"c:\Tobias\Docs\StrikerP\Fotmob\players\Liga\rating_Argentina_Primera_Division_players.csv")
dataframe_fotmob_copaligarcha = pd.read_csv(r"c:\Tobias\Docs\StrikerP\Fotmob\players\Copa Ligarcha\rating_Argentina_Copa_de_la_Liga_players.csv")
dataframe_fbref = pd.read_csv(r"C:\Tobias\Docs\StrikerP\Fbref\Players\Liga\stats_Primera_Division_Argentina_players.csv")
dataframe_fbref_copaligarcha = pd.read_csv(r"c:\Tobias\Docs\StrikerP\Fbref\Players\Copa de la Liga\stats_Copa_de_la_Liga_players.csv")

In [8]:
# Match FBref players to FotMob players separately for the league and for the
# cup, with no position or season filter.
nuevo_df_liga = match_players(dataframe_fbref,dataframe_fotmob)
nuevo_df_copa = match_players(dataframe_fbref_copaligarcha, dataframe_fotmob_copaligarcha)

In [12]:
def unir_torneos_sumando_partidos(df_a: pd.DataFrame, df_b: pd.DataFrame) -> pd.DataFrame:
    """Stack two player tables and collapse them to one row per ``id``.

    ``matches_played`` is summed across the rows that share an ``id``. Every
    other column keeps the first non-null value found, with the rows of
    ``df_a`` coming before those of ``df_b``.

    Returns:
        DataFrame with the columns ``player_name``, ``id``, ``Pos``,
        ``last_season``, ``seasons_played`` and ``matches_played``, in that
        order.
    """
    column_order = ['player_name', 'id', 'Pos', 'last_season', 'seasons_played', 'matches_played']

    stacked = pd.concat([df_a, df_b], ignore_index=True)

    per_player = stacked.groupby('id', as_index=False).agg(
        player_name=('player_name', 'first'),
        Pos=('Pos', 'first'),
        last_season=('last_season', 'first'),
        seasons_played=('seasons_played', 'first'),  # or 'sum' to accumulate these as well
        matches_played=('matches_played', 'sum'),
    )

    # groupby puts 'id' first; restore the original column order.
    return per_player[column_order]

In [13]:
# Combine the league and cup tables into one row per FotMob id, adding up the
# matches played in both tournaments.
df_completo = unir_torneos_sumando_partidos(nuevo_df_liga, nuevo_df_copa)

In [21]:
# Save the combined table without the index column; the path is relative, so
# the file is written to the current working directory.
df_completo.to_csv("matches_completo.csv", index=False)

Unnamed: 0,player_name,id,Pos,last_season,seasons_played,matches_played
621,Luis Advincula,190522,DF,2024,4,69
