# LIBRARIES

In [52]:
import pandas as pd 
import pandas as pd
import gensim
import nltk 
import seaborn as sns
sns.set()
import spacy
import pyLDAvis.gensim_models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem.wordnet import WordNetLemmatizer as lemma
import pickle
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
from dictionaries import topic_word_dictionary
texts = pd.read_json('file.json',lines=True)
import diccionaries


In [53]:
test = texts[:20]

# PRE-TRAINED MODEL : FASTTEXT WORD2VEC

In [6]:
import fasttext.util
#fasttext.util.download_model('es', if_exists='ignore')  # Spanish
ft = fasttext.load_model('cc.es.300.bin')



# AIM OF THE DOCUMENT
The aim of this document is to implement simple topic modeling algorithms based on the results obtained from the Latent Dirichlet Allocation (LDA) proposal. During the LDA analysis, it was observed that a range of 8 to 10 topics seemed to be reasonably effective. Therefore, the objective here is to build upon those insights and develop straightforward topic modeling techniques that align with the identified number of topics.

In [50]:
def word_freq(token_list):
    """

    Calculate the frequency of each word in the given list of tokens.

    Parameters:
        token_list (list): A list of tokens representing words.

    Returns:
        dict: A dictionary where keys are words and values are their frequencies.

    """
    word_freq = {}
    for token in token_list:
        if token in word_freq:
            word_freq[token] += 1
        else:
            word_freq[token] = 1
    return word_freq

def top_words(word_freq_dict, n=10):
    """
    Get the top 'n' words with the highest frequencies from a word frequency dictionary.

    Parameters:
        word_freq_dict (dict): A dictionary where keys are words and values are their frequencies.
        n (int): Number of top words to return. Default is 10.

    Returns:
        list: A list of the top 'n' words, sorted by frequency.
    """
    # Ordenar el diccionario por valor (frecuencia) en orden descendente
    sorted_word_freq = sorted(word_freq_dict.items(), key=lambda x: x[1], reverse=True)
    
    # Obtener las primeras 'n' palabras
    top_n_words = [word for word, freq in sorted_word_freq[:n]]
    
    return top_n_words


def get_word_embeddings(word_list, model_path):

    """
    Get word embeddings for a list of words using FastText.

    Parameters:
        word_list (list): List of words for which embeddings are required.
        model_path (str): Path to the FastText model file.

    Returns:
        dict: A dictionary mapping words to their corresponding word embeddings.
    
    """
    # Load FastText model
    ft_model = fasttext.load_model(model_path)

    # Initialize an empty dictionary to store word embeddings
    embeddings = {}

    # Iterate through each word in the word list
    for word in word_list:
        # Get the word embedding vector for the current word
        embedding_vector = ft_model.get_word_vector(word)
        
        # Store the word embedding vector in the embeddings dictionary
        embeddings[word] = embedding_vector

    return embeddings



def average_distance(embeddings1, embeddings2):
    """
    Calculate the average cosine distance between corresponding word embeddings in two dictionaries.

    Parameters:
        embeddings1 (dict): Dictionary mapping words to their embeddings.
        embeddings2 (dict): Dictionary mapping words to their embeddings.

    Returns:
        float: The average cosine distance.
    """
    # Extract vectors from dictionaries
    vectors1 = list(embeddings1.values())
    vectors2 = list(embeddings2.values())
    
    # Calculate pairwise cosine distances
    distances = cosine_distances(vectors1, vectors2)
    
    # Calculate the average distance
    avg_distance = np.mean(distances)
    
    return avg_distance

def classify(embeddings_words, embeddings_topics, original_topic_dictionary):
    """
    Creates a dictionary of distances to the topics, and returns the topic with the least cosine distance.

    Parameters:
        embeddings_words (dict): Dictionary mapping words to their embeddings.
        embeddings_topics (dict): Dictionary mapping topics to their embeddings.
        original_topic_dictionary (dict): Original mapping of topic indices to topic names.

    Returns:
        string : The closest topic.
        dict : A dictionary mapping topic names to their distances to words.
    """
    # Initialize a dictionary to store distances to topics
    topic_distances = {}

    # Iterate through each topic embedding
    for topic_index, topic_embedding in embeddings_topics.items():
        # Calculate the average cosine distance between word embeddings in embeddings_words and the topic embedding
        avg_distance = average_distance(embeddings_words, {0: topic_embedding})
        # Get the topic name from the original dictionary
        topic_name = original_topic_dictionary.get(topic_index)
        # Store the average distance for the topic name
        topic_distances[topic_name] = avg_distance

    # Find the topic with the smallest average distance
    min_distance_topic = min(topic_distances, key=topic_distances.get)
    
    return min_distance_topic, topic_distances


def topic_clusterer(articles_column,cosinedistance_dictionary) : 
    
    """ 
    The aim of this function is to predict the topic of an article. Preprocessing is done before classification.
    Input : 
        articles_columns : Df column containing articles
        word_dictionary  : diccionary of the cosine distance of the words related to the topics
    Output : 
        Dataframe containing the topic predicted as well as the distances to other topics.

    """
    # Our spaCy model:
    nlp = spacy.load("es_core_news_lg")
    # Path to the FastText model file
    model_path = 'cc.es.300.bin'

    # Tags I want to remove from the text
    removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']
    tokens = []
    for summary in nlp.pipe(articles_column):
        proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
        tokens.append(proj_tok)

    texts = pd.DataFrame()
    texts['Tokens'] = tokens
    texts['Word_Frequencies'] = texts['Tokens'].apply(word_freq)
    texts['Top_n_words']      = texts['Word_Frequencies'].apply(top_words)
    texts['embeddings']       = texts['Top_n_words'].apply(get_word_embeddings, args=(model_path,))
    texts['predicted_topic'], _ = zip(*texts['embeddings'].apply(classify,args=(embedding_topics,original_dict))) # embedding_topics será un diccionario de los word embeddings a los topics, que guardaremos en un pickle, y original_dict será el diccionario que contenga el nombre de los temas originales
    return texts

# Example Code
Here are examples demonstrating the usage of the functions described earlier:

In [6]:
import pandas as pd 
# Supongamos que tienes un DataFrame llamado df y quieres aplicar la función word_freq a cada fila de la columna 'Tokens'
# Creamos un DataFrame de ejemplo
data = {'Tokens': [['this', 'is', 'a', 'test'], ['another', 'test'], ['yet', 'another', 'test']]}
df = pd.DataFrame(data)

# Aplicar la función word_freq a cada fila de la columna 'Tokens'
df['Word_Frequencies'] = df['Tokens'].apply(word_freq)

# Mostrar el DataFrame resultante
print(df)

                 Tokens                         Word_Frequencies
0   [this, is, a, test]  {'this': 1, 'is': 1, 'a': 1, 'test': 1}
1       [another, test]                {'another': 1, 'test': 1}
2  [yet, another, test]      {'yet': 1, 'another': 1, 'test': 1}


In [10]:
# Ejemplo de uso:
word_frequencies = {'hello': 5, 'world': 3, 'python': 10, 'programming': 8, 'language': 7}
top_10_words = top_words(word_frequencies)
print(top_10_words)


['python', 'programming', 'language', 'hello', 'world']


In [55]:
# Example usage:


# List of words for which embeddings are required
word_list = ["guerra,ucrania,misil,muertos,rusia"]
word_list_futbol = top_words(word_freq(topic_clusterer(df111.loc[3, "Tokens"],dict())))


# Path to the FastText model file
model_path = 'cc.es.300.bin'

# Get word embeddings for the word list using FastText
word_embeddings =  get_word_embeddings(word_list_futbol , model_path)

word_embeddings_dict = {}
for i in range(10):  # Assuming you have 10 topics
    word_embeddings_dict[i] = get_word_embeddings(topic_word_dictionary[i], model_path)


# Print the word embeddings
#for word, embedding in word_embeddings2.items():
#    print(word, embedding)



In [20]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

def average_distance(embeddings1, embeddings2):
    """
    Calculate the average cosine distance between corresponding word embeddings in two dictionaries.

    Parameters:
    embeddings1 (dict): Dictionary mapping words to their embeddings.
    embeddings2 (dict): Dictionary mapping words to their embeddings.

    Returns:
    float: The average cosine distance.
    """
    # Extract vectors from dictionaries
    vectors1 = list(embeddings1.values())
    vectors2 = list(embeddings2.values())
    
    # Calculate pairwise cosine distances
    distances = cosine_distances(vectors1, vectors2)
    
    # Calculate the average distance
    avg_distance = np.mean(distances)
    
    return avg_distance

# Example usage:
for i in range (10) : 

    # Calculate the average cosine distance
    avg_distance = average_distance(word_embeddings_dict[i], word_embeddings)

    print(f"Average cosine distance with topic {i} :", avg_distance)

Average cosine distance with topic 0 : 0.8231431
Average cosine distance with topic 1 : 0.82431126
Average cosine distance with topic 2 : 0.9571792
Average cosine distance with topic 3 : 0.8269824
Average cosine distance with topic 4 : 0.83639395
Average cosine distance with topic 5 : 0.8159949
Average cosine distance with topic 6 : 0.8134227
Average cosine distance with topic 7 : 0.8082148
Average cosine distance with topic 8 : 0.86202157
Average cosine distance with topic 9 : 0.74208677
