# Topic Modeling

## Imports de llibreries

In [13]:
import pandas as pd
import sklearn
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import STOPWORDS
from stop_words import get_stop_words

## Càrrega del dataset

In [4]:
df_proposals = pd.read_csv("../data/processed/processed_proposals.csv")

## Títol de les propostes

In [15]:
stop_words = get_stop_words('catalan')
stopword_custom = set(stop_words)

In [42]:
# Set de stopwords en català i castellà que el model no té en compte.
# stopword_custom = set([
#     "el", "la", "los", "las", "un", "una", "unos", "unas",
#     "ell", "ella", "ells", "elles", "un", "una", "uns", "unes",
#     "lo", "la", "los", "las", "un", "una", "unos", "unas",
#     "yo", "tú", "él", "ella", "nosotros", "nosotras", "vosotros", "vosotras", "ellos", "ellas",
#     "me", "te", "se", "nos", "os",
#     "mí", "conmigo", "ti", "contigo", "sí", "consigo", "nosotros", "nosotras", "vosotros", "vosotras", "sí", "consigo",
#     "mi", "tu", "su", "nuestro", "nuestra", "vuestro", "vuestra", "mis", "tus", "sus", "nuestros", "nuestras", "vuestros", "vuestras",
#     "el", "la", "els", "les", "un", "una", "uns", "unes",
#     "ell", "ella", "ells", "elles", "un", "una", "uns", "unes",
#     "ho", "els", "les", "un", "una", "uns", "unes",
#     "jo", "tu", "ell", "ella", "nosaltres", "vosaltres", "ells", "elles",
#     "em", "et", "es", "ens", "us",
#     "mí", "mi", "tu", "tu", "ell", "ell", "ella", "ella", "nosaltres", "nosaltres", "vosaltres", "vosaltres", "ells", "ells", "elles", "elles",
#     "meu", "teu", "seu", "nostre", "nostra", "vostre", "vostra", "meus", "teus", "seus", "nostres", "nostres", "vostres", "vostres",
#     "m'", "te", "se", "ens", "us", "em", "et", "es", "li", "ho", "ne", "hi", 
#     "els", "les", "en", "ens", "us", "me", "te", "se", "ens", "vos", "vosaltres", 
#     "s'", "la", "el", "els", "les", "en", "ens", "us", "ne", "ens", "vos", "vosaltres", 
#     "amb", "l'", "de", "del","dels", "por", "hi", "ha", "d'", "o", "a", "ante","bajo", "con", "contra", "de", "desde", "en", "entre" ,
#     "hacia", "hasta", "para", "por", "segun", "sin", "sobre", "tras", "durante", "mediante"
# ])

# Funció per preprocessar els títols i retornar només els tokens que no estiguin al diccionari de stopwords.
def preprocess_title(title):
    tokens = preprocess_string(title)
    return [token for token in tokens if token not in stopword_custom]

df_proposals['title/ca_preprocessed'] = df_proposals['title/ca'].apply(preprocess_title)

# Diccionari de paraules pels títols
dictionary = Dictionary(df_proposals['title/ca_preprocessed'])

topics_by_title = []
for i, (title, preprocessed_title) in enumerate(zip(df_proposals['title/ca'], df_proposals['title/ca_preprocessed'])):

    # Crear corpus pel títol actual
    corpus = [dictionary.doc2bow(preprocessed_title)]
    
    lda_model = LdaModel(corpus, id2word=dictionary, num_topics=7, passes=10)
    
    # Obtenir paraules principals dels tòpics principals, limitat a 5 paraules
    title_topics = set(word for topic_id, topic in lda_model.print_topics(num_words=5) for word, _ in lda_model.show_topic(topic_id, topn=3))

    # Append a la columna de topics_by_title
    topics_by_title.append(list(title_topics))

df_proposals['topic_title'] = topics_by_title
# df_proposals = df_proposals.drop(columns=['body/preprocessed', 'topic_body'])

In [41]:
df_proposals.head(25)

Unnamed: 0,id,title/ca,body/ca,endorsements/total_count,comments,attachments,followers,published_at,is_amend,published_at_dies,title/ca_preprocessed,topic_title
0,87446,Participar Assemblea Ciutadana pel Clima de Ca...,"Buenos días, me gustaría participar en la asam...",4,0,0,2,2023-10-09 11:27:07+00:00,False,123.93735,"[participar, assemblea, ciutadana, pel, clima,...","[participar, catalunya, ciutadana, assemblea, ..."
1,87447,Assemblea ciutadana pel clima de Catalunya,M'agradaria participar en aquesta assemblea pe...,1,1,0,3,2023-10-10 08:38:57+00:00,False,123.054132,"[assemblea, ciutadana, pel, clima, catalunya]","[clima, catalunya, ciutadana, assemblea, pel]"
2,87452,"Transició a producció, comerç i consum ecològi...","La producció, comerç i consum d'aliments no ec...",1,0,0,2,2023-10-13 10:32:35+00:00,False,119.97522,"[transició, producció, comerç, consum, ecològ,...","[consum, local, producció, ecològ, comerç, tra..."
3,87453,"L'aigua, el principal aliment.",La major part dels pous catalans ni són legals...,3,0,0,3,2023-10-13 10:40:31+00:00,False,119.969711,"[aigua, princip, aliment]","[princip, aigua, aliment]"
4,87454,Prohibir construccions no bioclimàtiques,La normativa no hauria de permetre cap constru...,3,0,0,3,2023-10-13 10:50:31+00:00,False,119.962766,"[prohibir, construccion, bioclimàtiqu]","[bioclimàtiqu, construccion, prohibir]"
5,87458,Projectes del futur,"Hola, sóc un jove graduat en enginyeria; M'agr...",1,0,0,1,2023-10-14 14:38:54+00:00,False,118.804167,"[project, del, futur]","[del, futur, project]"
6,87460,Ordenació medioambiental. Tractament dels resi...,1.- Residus: Es podria instal.lar màquines de ...,5,0,0,2,2023-10-14 14:58:27+00:00,False,118.79059,"[ordenació, medioambient, tractament, del, res...","[residu, medioambient, aprofita, ordenació, pr..."
7,87461,Procés de selecció,"Referent al procés de selecció, no entenc perq...",3,3,0,1,2023-10-15 16:24:45+00:00,False,117.73066,"[procé, selecció]","[selecció, procé]"
8,87462,"Decreixement, Aigua i Sobirania alimentària ag...",Estem entrant en un caos climàtic terrible. Ai...,5,0,0,4,2023-10-15 19:02:35+00:00,False,117.621053,"[decreix, aigua, sobirania, alimentària, agroe...","[sobirania, agroecològica, decreix, aigua, ali..."
9,87464,Apropament entre generació i consum d'energia ...,Crec que caldria potenciar l'apropament entre ...,1,1,0,2,2023-10-15 20:58:48+00:00,False,117.540347,"[apropa, entr, generació, consum, energia, pol...","[polígon, consum, industri, generació, energia..."


## Text de les propostes

- TODO: **Considerar fer servir un dataset de stopwords en català**
    * [Remove-stopwords](https://github.com/WorldBrain/remove-stopwords/blob/master/lib/stopwords_ca.js)
    * [Alir3z4 stop-words](https://github.com/Alir3z4/stop-words/blob/master/catalan.txt) --> També disponible per pip `stop-words`
    * [Catalan Stop Words W2V](https://www.kaggle.com/code/mpwolke/catalan-stop-words-w2v)

In [11]:
# Funció per preprocessar el body i retornar només els tokens que no estiguin al diccionari de stopwords.
def preprocess_body(title):
    tokens = preprocess_string(title)
    return [token for token in tokens if token not in stopword_custom]

df_proposals['body/preprocessed'] = df_proposals['body/ca'].apply(preprocess_title)

# Diccionari de paraules pels body
dictionary = Dictionary(df_proposals['body/preprocessed'])

topics_by_body = []
for i, (body, preprocessed_body) in enumerate(zip(df_proposals['body/ca'], df_proposals['body/preprocessed'])):

    # Crear corpus pel títol actual
    corpus = [dictionary.doc2bow(preprocessed_body)]
    
    lda_model = LdaModel(corpus, id2word=dictionary, num_topics=1, passes=10)
    
    # Obtenir paraules principals dels tòpics principals, limitat a 5 paraules
    body_topics = set(word for topic_id, topic in lda_model.print_topics(num_words=5) for word, _ in lda_model.show_topic(topic_id, topn=5))

    # Append a la columna de topics_by_title
    topics_by_body.append(list(body_topics))

df_proposals['topic_body'] = topics_by_body

In [12]:
df_proposals.head(5)

Unnamed: 0,id,title/ca,body/ca,endorsements/total_count,comments,attachments,followers,published_at,is_amend,published_at_dies,title/ca_preprocessed,topic_title,body/preprocessed,topic_body
0,87446,Participar Assemblea Ciutadana pel Clima de Ca...,"Buenos días, me gustaría participar en la asam...",4,0,0,2,2023-10-09 11:27:07+00:00,False,123.93735,"[participar, assemblea, ciutadana, pel, clima,...","[clima, catalunya, ciutadana, assemblea, pel, ...","[bueno, día, gustaría, participar, asamblea, c...","[mucho, poder, así, día, cataluña]"
1,87447,Assemblea ciutadana pel clima de Catalunya,M'agradaria participar en aquesta assemblea pe...,1,1,0,3,2023-10-10 08:38:57+00:00,False,123.054132,"[assemblea, ciutadana, pel, clima, catalunya]","[clima, catalunya, ciutadana, assemblea, pel]","[agradaria, participar, aquesta, assemblea, pe...","[moltíssim, carta, ca, que, participar]"
2,87452,"Transició a producció, comerç i consum ecològi...","La producció, comerç i consum d'aliments no ec...",1,0,0,2,2023-10-13 10:32:35+00:00,False,119.97522,"[transició, producció, comerç, consum, ecològ,...","[consum, local, producció, ecològ, comerç]","[producció, comerç, consum, aliment, ecològ, l...","[veur, local, territori, proper, possibl]"
3,87453,"L'aigua, el principal aliment.",La major part dels pous catalans ni són legals...,3,0,0,3,2023-10-13 10:40:31+00:00,False,119.969711,"[aigua, princip, aliment]","[princip, aigua, aliment]","[major, pou, catalan, són, legal, monitoratg, ...","[són, ser, sosten, le, haurien]"
4,87454,Prohibir construccions no bioclimàtiques,La normativa no hauria de permetre cap constru...,3,0,0,3,2023-10-13 10:50:31+00:00,False,119.962766,"[prohibir, construccion, bioclimàtiqu]","[bioclimàtiqu, construccion, prohibir]","[normativa, hauria, permetr, cap, construcció,...","[inspeccion, balanç, le, cap, positiu]"
