# Topic Modeling

## Imports de llibreries

In [157]:
import pandas as pd
import numpy as np
import sklearn
from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
import spacy

## Càrrega del dataset

In [158]:
# Load the preprocessed proposals table from a path relative to the working directory.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the
# CSV was written together with its index — confirm whether that column is wanted.
df_proposals = pd.read_csv("../data/processed/processed_proposals.csv")

## Títol de les propostes

Primer, creem el diccionari de stopwords en llengua catalana per poder tokenitzar bé els títols i no incloure-hi paraules comunes com ara connectors, articles, pronoms, etc.

In [159]:
# stop_words = get_stop_words('catalan')
# stopword_custom = set(stop_words)

In [160]:
# # Funció per preprocessar els títols i retornar només els tokens que no estiguin al diccionari de stopwords.
# def preprocess_title(title):
#     tokens = preprocess_string(title)
#     return [token for token in tokens if token not in stopword_custom]

# df_proposals['title/ca_preprocessed'] = df_proposals['title/ca'].apply(preprocess_title)

# # Diccionari de paraules pels títols
# dictionary = Dictionary(df_proposals['title/ca_preprocessed'])

# topics_by_title = []
# for i, (title, preprocessed_title) in enumerate(zip(df_proposals['title/ca'], df_proposals['title/ca_preprocessed'])):

#     # Crear corpus pel títol actual
#     corpus = [dictionary.doc2bow(preprocessed_title)]
    
#     lda_model = LdaModel(corpus, id2word=dictionary, num_topics=7, passes=10)
    
#     # Obtenir paraules principals dels tòpics principals, limitat a 5 paraules
#     title_topics = set(word for topic_id, topic in lda_model.print_topics(num_words=5) for word, _ in lda_model.show_topic(topic_id, topn=3))

#     # Append a la columna de topics_by_title
#     topics_by_title.append(list(title_topics))

# df_proposals['topic_title'] = topics_by_title
# # df_proposals = df_proposals.drop(columns=['body/preprocessed', 'topic_body'])

In [161]:
# df_proposals.head(25)

## Text de les propostes

Primer carreguem la llista de stopwords en català perquè no es tinguin en compte a l'hora de fer topic modeling. Després processem el text amb spaCy: el tokenitzem, lematitzem cada paraula (per reduir-la a la seva forma base: singular, masculí, etc.) i descartem les stopwords i les paraules de tres caràcters o menys.

In [162]:

# Catalan spaCy pipeline; clean_text relies on it for tokenisation and lemmas.
nlp = spacy.load("ca_core_news_sm")

# NLTK's Catalan stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('catalan'))

def clean_text(headline):
    """Lemmatise a text and drop stop words and short tokens.

    The text is run through the notebook-level spaCy pipeline ``nlp``. A token
    is kept only when spaCy does not flag it as a stop word, its lower-cased
    text is absent from the notebook-level ``stop_words`` set, and its text is
    longer than three characters. Each kept token is replaced by its lemma.

    Parameters
    ----------
    headline : str
        Raw text to clean. Non-string values (``None``, or the float ``NaN``
        that pandas uses for missing cells) are treated as empty text.

    Returns
    -------
    str
        Space-separated lemmas of the kept tokens; ``""`` when the input is
        not a string.
    """
    # The spaCy pipeline only accepts text; a missing cell would otherwise raise.
    if not isinstance(headline, str):
        return ""

    doc = nlp(headline)
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop
        and token.text.lower() not in stop_words
        and len(token.text) > 3
    ]
    return " ".join(lemmas)

# Clean every proposal body element-wise and store the result in a new column.
df_proposals['body_preprocessed'] = df_proposals['body/ca'].map(clean_text)
df_proposals.head()

Unnamed: 0,id,title/ca,body/ca,endorsements/total_count,comments,attachments,followers,published_at,is_amend,published_at_dies,body_preprocessed
0,87446,Participar Assemblea Ciutadana pel Clima de Ca...,"Buenos días, me gustaría participar en la asam...",4,0,0,2,2023-10-09 11:27:07+00:00,False,123.93735,Buenos días gustaríar participar asamblea ciud...
1,87447,Assemblea ciutadana pel clima de Catalunya,M'agradaria participar en aquesta assemblea pe...,1,1,0,3,2023-10-10 08:38:57+00:00,False,123.054132,agradar participar assemblea clima rebar carta...
2,87452,"Transició a producció, comerç i consum ecològi...","La producció, comerç i consum d'aliments no ec...",1,0,0,2,2023-10-13 10:32:35+00:00,False,119.97522,producció comerç consum aliment ecològic local...
3,87453,"L'aigua, el principal aliment.",La major part dels pous catalans ni són legals...,3,0,0,3,2023-10-13 10:40:31+00:00,False,119.969711,major part pou català legal monitoratge contin...
4,87454,Prohibir construccions no bioclimàtiques,La normativa no hauria de permetre cap constru...,3,0,0,3,2023-10-13 10:50:31+00:00,False,119.962766,normativa haver permetre construcció edificaci...


### Model LSA (Latent Semantic Analysis o TruncatedSVD)

In [163]:
# TF-IDF document-term matrix over the cleaned bodies, capped at 1000 features.
vect = TfidfVectorizer(
    stop_words=list(stop_words),
    max_features=1000,
)
vect_text = vect.fit_transform(df_proposals['body_preprocessed'])

# The parameters worth tuning are n_components and n_iter.
lsa_model = TruncatedSVD(
    n_components=25,
    n_iter=20,
    algorithm='randomized',
    random_state=42,
)
lsa_top = lsa_model.fit_transform(vect_text)


# Term behind each column of the matrix; used to name the top words per topic.
vocab = vect.get_feature_names_out()

In [164]:
# Function to get the top words for the top topic for each document
def get_top_words_for_document(doc_index, lsa_model, vocab, num_top_words=10, doc_term_matrix=None):
    """Return the highest-weighted terms of a document's dominant LSA topic.

    The document row is projected with ``lsa_model.transform``, the topic with
    the largest projected value is selected, and that topic's row of
    ``lsa_model.components_`` is ranked to pick the terms.

    Parameters
    ----------
    doc_index : int
        Row of the document inside ``doc_term_matrix``.
    lsa_model : object
        Fitted model exposing ``transform`` and ``components_``.
    vocab : sequence of str
        Term for each column of ``doc_term_matrix``.
    num_top_words : int, default 10
        Number of terms to return.
    doc_term_matrix : 2-D matrix, optional
        Document-term matrix to read the row from. When omitted, the
        notebook-level ``vect_text`` is used, as before.

    Returns
    -------
    list
        Up to ``num_top_words`` entries of ``vocab``, ordered from highest to
        lowest weight in the selected topic.
    """
    # Fall back to the notebook-level matrix so existing three-argument calls still work.
    if doc_term_matrix is None:
        doc_term_matrix = vect_text

    doc_row = doc_term_matrix[doc_index]
    # Indexing a dense 2-D array yields a 1-D row; transform() needs a 2-D input.
    if isinstance(doc_row, np.ndarray) and doc_row.ndim == 1:
        doc_row = doc_row.reshape(1, -1)

    # Topic scores for this single document.
    doc_topic_distribution = lsa_model.transform(doc_row)

    # NOTE(review): argmax selects the largest signed value; SVD projections can be
    # negative, so confirm that signed (not absolute) strength is what is wanted.
    top_topic_index = np.argmax(doc_topic_distribution)

    # argsort is ascending, so the reversed tail holds the largest weights first.
    top_words_indices = lsa_model.components_[top_topic_index].argsort()[:-num_top_words-1:-1]
    return [vocab[j] for j in top_words_indices]

# Compute the dominant-topic words for every row label of the dataset.
df_proposals['top_topic_words_lsa'] = [
    get_top_words_for_document(row_label, lsa_model, vocab)
    for row_label in df_proposals.index
]


In [165]:
# Display the per-document LSA topic words stored in the dataset.
df_proposals['top_topic_words_lsa']

0     [parar, días, buenos, hablar, projecte, tema, ...
1     [participar, clima, agradar, assemblea, carta,...
2     [local, territori, aigua, participació, pagès,...
3     [aigua, deixar, sequera, disposar, procés, res...
4     [mesura, legal, servei, híbrid, inspecció, con...
5     [participar, clima, agradar, assemblea, carta,...
6     [ciutadà, supermercat, llauna, local, comerç, ...
7     [participar, clima, agradar, assemblea, carta,...
8     [aigua, deixar, sequera, disposar, procés, res...
9     [energia, caldre, instal, industrial, solar, p...
10    [participar, clima, agradar, assemblea, carta,...
11    [treball, nou, energia, residu, deixar, mobili...
12    [gran, col, voluntari, necessari, afrontar, sa...
13    [aigua, vehicle, xarxa, parar, elèctric, mobil...
14    [pagès, taxa, caldre, bosc, propietat, product...
15    [energia, caldre, instal, industrial, solar, p...
16    [participar, clima, agradar, assemblea, carta,...
17    [participar, clima, agradar, assemblea, ca

### Model LDA (Latent Dirichlet Allocation)

In [167]:
# NOTE(review): this LDA model is fitted on the TF-IDF matrix `vect_text`; LDA is
# usually fitted on raw term counts (CountVectorizer is imported but not used in
# the visible cells) — confirm the TF-IDF input is intended.
lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=42,
)
lda_top = lda_model.fit_transform(vect_text)

# Function to get the top words for the top topic for each document
def get_top_words_for_document_lda(doc_index, lda_model, vect, num_top_words=10, doc_term_matrix=None):
    """Return the highest-weighted terms of a document's dominant LDA topic.

    The document row is passed to ``lda_model.transform``, the topic with the
    largest value is selected, and that topic's row of
    ``lda_model.components_`` is ranked to pick the terms.

    Parameters
    ----------
    doc_index : int
        Row of the document inside ``doc_term_matrix``.
    lda_model : object
        Fitted model exposing ``transform`` and ``components_``.
    vect : sequence of str, or fitted vectorizer
        Either the term for each matrix column, or an object exposing
        ``get_feature_names_out()`` that returns those terms.
    num_top_words : int, default 10
        Number of terms to return.
    doc_term_matrix : 2-D matrix, optional
        Document-term matrix to read the row from. When omitted, the
        notebook-level ``vect_text`` is used, as before.

    Returns
    -------
    list
        Up to ``num_top_words`` terms, ordered from highest to lowest weight
        in the selected topic.
    """
    # Resolve terms from the argument instead of relying on a global `vocab`;
    # both a plain vocabulary and a fitted vectorizer are accepted.
    if hasattr(vect, "get_feature_names_out"):
        terms = vect.get_feature_names_out()
    else:
        terms = vect

    # Fall back to the notebook-level matrix so existing three-argument calls still work.
    if doc_term_matrix is None:
        doc_term_matrix = vect_text

    doc_row = doc_term_matrix[doc_index]
    # Indexing a dense 2-D array yields a 1-D row; transform() needs a 2-D input.
    if isinstance(doc_row, np.ndarray) and doc_row.ndim == 1:
        doc_row = doc_row.reshape(1, -1)

    # Topic distribution for this single document.
    doc_topic_distribution = lda_model.transform(doc_row)
    top_topic_index = np.argmax(doc_topic_distribution)

    # argsort is ascending, so the reversed tail holds the largest weights first.
    top_words_indices = lda_model.components_[top_topic_index].argsort()[:-num_top_words-1:-1]
    return [terms[j] for j in top_words_indices]

# Compute the dominant-topic words for every row label of the dataset.
df_proposals['top_topic_words_lda'] = [
    get_top_words_for_document_lda(row_label, lda_model, vocab)
    for row_label in df_proposals.index
]
# Show the raw body of the row labelled 0 next to the resulting column.
print(df_proposals.loc[0, 'body/ca'])
df_proposals['top_topic_words_lda']

Buenos días, me gustaría participar en la asamblea ciudadana para
el clima de Cataluña.

Es un tema de mucho interés y actual, que me interesa bastante, y
así poder aportar mi granito de arena.

Muchas gracias,


0     [clima, arbre, aigua, espai, formigó, element,...
1     [participar, aleatori, projecte, assemblea, cl...
2     [automòbil, resultar, majoria, solució, energè...
3     [idea, participar, híbrid, gran, gent, aixì, e...
4     [aigua, condicionat, paper, planeta, treball, ...
5     [participar, aleatori, projecte, assemblea, cl...
6     [públic, aliment, transport, instal, trajecte,...
7     [idea, participar, híbrid, gran, gent, aixì, e...
8     [aigua, condicionat, paper, planeta, treball, ...
9     [caldre, energia, industrial, emissió, impleme...
10    [participar, aleatori, projecte, assemblea, cl...
11    [aigua, condicionat, paper, planeta, treball, ...
12    [caldre, energia, industrial, emissió, impleme...
13    [públic, aliment, transport, instal, trajecte,...
14    [caldre, energia, industrial, emissió, impleme...
15    [valor, canvi, aparir, matriu, cosa, geotermia...
16    [participar, aleatori, projecte, assemblea, cl...
17    [producte, cost, catalunya, mercaderia, mo