In [3]:
import pandas as pd

from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from pmdarima import auto_arima

import numpy as np
import warnings

warnings.filterwarnings("ignore")



In [4]:
# cargar dataset
df = pd.read_csv("processed\data_processed_final.csv")


# Lista para guardar los resultados
results = []

# Iterar por jugador
for player in df["Player"].unique():
    player_data = df[df["Player"] == player]
    performance_series = player_data["last_performance"].values
    
    # Realizar el test de Dickey-Fuller
    adf_result = adfuller(performance_series)
    
    # Guardar los resultados
    result_dict = {
        "Player": player,
        "ADF Statistic": adf_result[0],
        "p-value": adf_result[1],
        "Critical Values": adf_result[4],
        "Is Stationary": adf_result[1] < 0.05  # Estacionariedad si p-valor < 0.05
    }
    results.append(result_dict)

# Convertir los resultados a un DataFrame
results_df = pd.DataFrame(results)

# Mostrar resultados iterando por jugador
for idx, row in results_df.iterrows():
    print(f"Player: {row['Player']}")
    print(f"ADF Statistic: {row['ADF Statistic']}")
    print(f"p-value: {row['p-value']}")
    print(f"Critical Values: {row['Critical Values']}")
    print(f"Is Stationary: {row['Is Stationary']}")
    print("\n")

Player: Ferran
ADF Statistic: -3.8305736839098357
p-value: 0.0026095474934785296
Critical Values: {'1%': -3.4521902441030963, '5%': -2.871158406898617, '10%': -2.5718948388228586}
Is Stationary: True


Player: Gerard Moreno
ADF Statistic: -3.74230768820806
p-value: 0.0035559182538531732
Critical Values: {'1%': -3.453922368485787, '5%': -2.871918329081633, '10%': -2.5723001147959184}
Is Stationary: True


Player: Griezmann
ADF Statistic: -4.510458732804784
p-value: 0.00018825808448329946
Critical Values: {'1%': -3.4514843502727306, '5%': -2.8708485956333556, '10%': -2.571729625657462}
Is Stationary: True


Player: Koundé
ADF Statistic: -4.638748328778486
p-value: 0.00010951481266284085
Critical Values: {'1%': -3.4513486122290717, '5%': -2.870789013306053, '10%': -2.5716978530569192}
Is Stationary: True


Player: Kubo
ADF Statistic: -3.9273604351576896
p-value: 0.0018424663164317203
Critical Values: {'1%': -3.453342167806272, '5%': -2.871663828287282, '10%': -2.572164381381345}
Is Statio

# TOPICS DINAMICOS

In [1]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
from gensim.models import LdaSeqModel
from gensim.corpora import Dictionary
import nltk
from nltk.stem import WordNetLemmatizer
import ast



def preprocess_text(text):
        """
        Preprocesa el texto para análisis de tópicos
        """
        # Inicializar herramientas
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        # Convertir a minúsculas
        text = text.lower()

        # Eliminar caracteres especiales
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenizar
        tokens = word_tokenize(text)

        # Eliminar stopwords y lematizar
        tokens = [lemmatizer.lemmatize(token) for token in tokens
                if token not in stop_words and len(token) > 2]

        return tokens


def print_topic_evolution(model, periods=None):
        """
        Imprime la evolución de tópicos a lo largo del tiempo
        """
        num_time_slices = len(time_slices)
        for t in range(num_time_slices):
            print(f"\nPeríodo {t}:")
            for topic in range(model.num_topics):
                # La nueva sintaxis no usa el parámetro 'topics'
                top_terms = model.print_topics(time=t)[topic]
                print(f"Tópico {topic}: {top_terms[:5]}")

In [None]:
# Descargar recursos necesarios de NLTK español
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


df = pd.read_csv("processed\data_processed_final.csv")

dtm_results = []

for player in df["Player"].unique():
    print(f"\nJugador: {player}------------------------------------------------------")
    df = pd.read_csv("processed\data_processed_final.csv")

    df = df[df["Player"] == player]
    df['Fecha'] = pd.to_datetime(df['Fecha'])




    # Crear un nuevo dataframe vacío
    noticias_expandidas = []

    # Iterar sobre cada fila del dataframe original
    for _, row in df.iterrows():
        fecha = row['Fecha']
        # Convertir el string que representa una lista a una lista real
        lista_noticias = ast.literal_eval(row['Noticias'])
        for noticia in lista_noticias:
            noticias_expandidas.append([fecha, noticia])

    # Crear un nuevo dataframe con las noticias expandidas
    df = pd.DataFrame(noticias_expandidas, columns=['Fecha', 'Noticia'])





    


    df['processed_text'] = df['Noticia'].apply(preprocess_text)


    # Crear diccionario
    dictionary = Dictionary(df['processed_text'])

    # Filtrar términos poco frecuentes o muy comunes
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    print("\nTamaño del vocabulario:", len(dictionary))


    # Crear corpus
    corpus = [dictionary.doc2bow(doc) for doc in df['processed_text']]

    # Preparar time_slices (documentos por mes)
    df['month'] = df['Fecha'].dt.to_period('M')
    time_slices = df.groupby('month').size().tolist()
    print("\nNúmero de periodos temporales:", len(time_slices))


    # Configurar y entrenar modelo
    num_topics = 3  # Número de tópicos a detectar
    chain_variance = 0.100  # Controla la suavidad de la evolución temporal

    dtm = LdaSeqModel(
        corpus=corpus,
        time_slice=time_slices,
        num_topics=num_topics,
        id2word=dictionary,
        chain_variance=chain_variance,
        passes=10
    )


    dtm_results.append({player: dtm})

    

In [18]:
# hacer una visualización en grafica de los tópicos para el jugador seleccionado
player = "Ferran"
for result in dtm_results:
    if player in result:
        dtm = result[player]
        print_topic_evolution(dtm)
        break



Período 0:
Tópico 0: [('joao', 0.011452531570590249), ('gavi', 0.010397731312906097), ('serum', 0.008932352193813876), ('espaa', 0.008382813987414493), ('tras', 0.007705662048906297)]
Tópico 1: [('temporada', 0.011240950466003716), ('real', 0.009393674629182358), ('madrid', 0.008626188936843483), ('city', 0.008463644251155757), ('han', 0.008444272555013586)]
Tópico 2: [('bien', 0.01230096862714593), ('gol', 0.011712434289084965), ('partido', 0.011171854329666464), ('equipo', 0.008868352136292036), ('est', 0.008675488067905752)]

Período 1:
Tópico 0: [('primera', 0.009383615187351056), ('serum', 0.009294614339672432), ('joao', 0.008898722557306744), ('gavi', 0.007882489991343325), ('est', 0.007572806050086076)]
Tópico 1: [('uno', 0.010413296695534015), ('temporada', 0.00966600325014369), ('city', 0.00960916714319273), ('bara', 0.00862713949255298), ('real', 0.008439991365932513)]
Tópico 2: [('partido', 0.012028801812199575), ('bien', 0.01054097185205901), ('gol', 0.010031464003933798),