In [1]:
from collections import Counter
from getpass import getpass
from urllib.parse import quote

import dotenv
import nltk
import pandas as pd
import spacy
import sqlalchemy as alch
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob

from config.configuration import engine
import src.cleaning as cl
import src.get_data as gd
dotenv.load_dotenv()

True

In [2]:
# Ask for the MySQL password interactively so it is never stored in the notebook.
password = getpass("Introduce tu pass de sql: ")
dbName = "elections"
# Percent-encode the password before embedding it in the URL: characters such
# as '@', ':', '/' or '%' would otherwise be parsed as URL structure and yield
# a wrong host or password. safe="" makes sure '/' is encoded as well.
connectionData = f"mysql+pymysql://root:{quote(password, safe='')}@localhost/{dbName}"

Introduce tu pass de sql: ········


In [3]:
# Build a SQLAlchemy engine from the connection URL assembled in the previous cell.
# NOTE(review): this rebinds `engine`, shadowing the object imported from
# config.configuration in the import cell — confirm which one src.get_data uses.
engine = alch.create_engine(connectionData)
# NOTE(review): presumably no connection has been opened yet at this point, so
# this message may not prove that the credentials work — verify.
print("Conected")

Conected


### PROGRAMS

In [4]:
# Load the program phrases through the project's get_data helper. The result is
# used below as a DataFrame with (at least) a 'phrases' column.
all_programs = gd.get_phrases()

In [5]:
# Load the Verba phrases through the project's get_data helper.
# NOTE(review): `verba` is not referenced again in the visible part of the notebook.
verba = gd.verba_phrases()

### Tokenize phrases

In [5]:
# Fetch NLTK's 'punkt' data package (presumably required by cl.tokenize — confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/unai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
all_programs.dropna(axis = 0, how = 'any', inplace = True) 

In [7]:
all_programs['tokenized'] = all_programs['phrases'].apply(cl.tokenize)

### Apply stop_words

In [8]:
# Fetch NLTK's 'stopwords' corpus (presumably required by cl.stop_words — confirm).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/unai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Overwrite the 'tokenized' column with the result of cl.stop_words, so the
# column now holds the tokenized text after stop-word filtering.
all_programs['tokenized'] = all_programs['tokenized'].apply(cl.stop_words)

In [10]:
# Display the frame to compare 'phrases' with the processed 'tokenized' column.
all_programs

Unnamed: 0,id,party_id,phrases,tokenized
0,1,2,El próximo 4 de mayo las madrileñas y madrileñ...,El próximo 4 mayo madrileñas madrileños haber ...
1,2,2,Hemos sufrido el abandono de una presidenta qu...,Hemos sufrido abandono presidenta utilizado se...
2,3,2,Abordamos esta convocatoria electoral desde el...,Abordamos convocatoria electoral conocimiento ...
3,4,2,Los compromisos que adquirimos con el programa...,Los compromisos adquirimos programa electoral ...
4,5,2,Hemos revisado y actualizado las medidas para ...,Hemos revisado actualizado medidas soluciones ...
...,...,...,...,...
3717,3718,4,Colaborar con los ayuntamientos en los program...,Colaborar ayuntamientos programas agentes tuto...
3718,3719,4,Elaborar un Plan Director para el Cuerpo de Pr...,Elaborar Plan Director Cuerpo Prevención Extin...
3719,3720,4,"Mejorar la dotación de personal, aumentando la...",Mejorar dotación personal aumentando plantilla...
3720,3721,4,Mejorar la eficacia de la extinción de incend...,Mejorar eficacia extincio n incendios creando ...


In [11]:
#spanish_stemmer = SnowballStemmer('spanish')

In [12]:
#spanish_stemmer.stem("libertad, libre, libres")

In [13]:
#spanish_stemmer.stem("libertad, libre, libres, libertad, libre")

Between stemming and lemmatization, the latter works better. Unfortunately, there is no option to do it in Spanish.

Although there is the option of translating the text, in this case I consider it important to keep it in the original language. 

I tried the SnowballStemmer, which is available in Spanish, but, as can be seen above, the results are not very good. It was supposed to take the root of each word and categorize all of them as the same word. Instead, it treats them all as a single word and deletes part of the last one.

Therefore, I will perform the search of the most common words in the column where I applied the tokenization and the stop words.

### Count words using value_counts

In [14]:
# Build a dataframe with the word counts of all the programs together,
# computed from the 'tokenized' column.
df_word_count = cl.count_values(all_programs, 'tokenized')

In [15]:
# Words to remove from the ranking: filler words that would otherwise show up
# among the 30 most frequent ones.
to_delete = ["La", "Se"]

In [16]:
# Remove the words listed in `to_delete` from the count.
df_word_count = cl.delete_extra_words(df_word_count, to_delete)

In [17]:
# Run the project's for_streamlit helper on the counts (presumably reshapes
# them for the Streamlit chart — confirm in src/cleaning).
df_word_count = cl.for_streamlit(df_word_count)

### Using the Spacy library

In [18]:
# Load spaCy's small Spanish pipeline.
# NOTE(review): `nlp` is not referenced again in the visible cells; the cl.spacy_*
# helpers presumably load their own pipeline — confirm.
nlp = spacy.load('es_core_news_sm')

In [19]:
# Run the project's spaCy verb helper over the 'tokenized' column of all programs.
all_common_verbs = cl.spacy_verb(all_programs, 'tokenized')

In [20]:
# Run the project's spaCy noun helper over the 'tokenized' column of all programs.
all_common_nouns = cl.spacy_noun(all_programs, 'tokenized')

In [21]:
# Show the rows whose original phrase contains the substring "persona"
# (this also matches "personas", "personal", ...).
all_programs[all_programs['phrases'].str.contains("persona")]

Unnamed: 0,id,party_id,phrases,tokenized
21,22,2,Queremos una Comunidad de Madrid con una sanid...,Queremos Comunidad Madrid sanidad cuide capaci...
22,23,2,Una Comunidad de Madrid que comprenda que somo...,Una Comunidad Madrid comprenda personas interd...
29,30,2,Garantizaremos que todas las personas de la Co...,Garantizaremos todas personas Comunidad Madrid...
32,33,2,Se reembolsarán los copagos farmacéuticos y no...,Se reembolsarán copagos farmacéuticos farmacéu...
34,35,2,Se implementarán mecanismos para hacer efectiv...,Se implementarán mecanismos hacer efectivas me...
...,...,...,...,...
3686,3687,4,Otros de los bienes públicos que buscamos gara...,Otros bienes públicos buscamos garantizar segu...
3690,3691,4,Nos comprometeremos a la reducción de la tempo...,Nos comprometeremos reducción temporalidad emp...
3708,3709,4,Madrid es una de las comunidades con mayor lit...,Madrid comunidades mayor litigiosidad colapsa ...
3711,3712,4,Garantizar el acceso a todas las sedes judicia...,Garantizar acceso todas sedes judiciales Comun...


The results obtained with value_counts and with spaCy are different. I would say this happens due to errors in spaCy's part-of-speech tagging, which doesn't always assign the correct tag (noun, verb...) to each word.

I will use both methods (the one with value_counts() and the spaCy library), the latter also to learn how to use it.

### Analyze each party

#### PP

In [18]:
# Load the phrases for the party identified as "PP".
pp_phrases = gd.phrases("PP")

In [19]:
# Add a 'tokenized' column holding the output of cl.tokenize for each phrase.
pp_phrases['tokenized'] = pp_phrases['phrases'].apply(cl.tokenize)

In [20]:
# Overwrite 'tokenized' with the result of the stop-word filter.
pp_phrases['tokenized'] = pp_phrases['tokenized'].apply(cl.stop_words)

In [21]:
# Build a dataframe with the words (capitalized) and their frequency.
pp_phrases_count = cl.count_values(pp_phrases, 'tokenized')

In [22]:
# Filler words to remove from the PP word ranking. Each word is listed once
# ("Especialmente" used to appear twice).
to_delete = ["Años", "Ello", "En", "La", "Puedan", "Través", "Especialmente",
             "Nuevas", "Vamos", "Además", "Una", "Aquellos", "Cada"]

In [23]:
# Remove the words listed in `to_delete` from the PP word count.
pp_phrases_count = cl.delete_extra_words(pp_phrases_count, to_delete)

In [24]:
# Prepare the dataframe to make a graph in Streamlit.
pp_phrases_count = cl.for_streamlit(pp_phrases_count)

In [25]:
# Persist the PP word count as CSV.
# NOTE(review): this is the only export in the visible notebook that is not
# commented out — confirm that is intentional.
pp_phrases_count.to_csv("./data/programs/csv-s/pp_phrases_count.csv")

In [26]:
# Run the project's spaCy verb helper over the PP 'tokenized' column.
pp_verbs = cl.spacy_verb(pp_phrases, 'tokenized')

In [31]:
# CSV export of the verbs (currently disabled).
#pp_verbs.to_csv("./data/programs/csv-s/pp_verbs.csv")

In [32]:
# Run the project's spaCy noun helper over the PP 'tokenized' column.
pp_nouns = cl.spacy_noun(pp_phrases, 'tokenized')

In [33]:
# CSV export of the nouns (currently disabled).
#pp_nouns.to_csv("./data/programs/csv-s/pp_nouns.csv")

#### PSOE

In [34]:
# Load the phrases for the party identified as "PSOE".
psoe_phrases = gd.phrases("PSOE")

In [35]:
# Add a 'tokenized' column holding the output of cl.tokenize for each phrase.
psoe_phrases['tokenized'] = psoe_phrases['phrases'].apply(cl.tokenize)

In [36]:
# Overwrite 'tokenized' with the result of the stop-word filter.
psoe_phrases['tokenized'] = psoe_phrases['tokenized'].apply(cl.stop_words)

In [37]:
# Build the word-count dataframe from the 'tokenized' column.
psoe_phrases_count = cl.count_values(psoe_phrases, 'tokenized')

In [38]:
# Filler words/fragments to remove from the PSOE ranking.
to_delete = ["N", "La", "Así"]

In [39]:
# Remove the words listed in `to_delete` from the count.
psoe_phrases_count = cl.delete_extra_words(psoe_phrases_count, to_delete)

In [40]:
# Run the project's for_streamlit helper on the counts.
psoe_phrases_count = cl.for_streamlit(psoe_phrases_count)

In [41]:
# CSV export of the word count (currently disabled).
#psoe_phrases_count.to_csv("./data/programs/csv-s/psoe_phrases_count.csv")

In [42]:
# Run the project's spaCy verb helper over the PSOE 'tokenized' column.
psoe_verbs = cl.spacy_verb(psoe_phrases, 'tokenized')

In [43]:
# CSV export of the verbs (currently disabled).
#psoe_verbs.to_csv("./data/programs/csv-s/psoe_verbs.csv")

In [44]:
# Run the project's spaCy noun helper over the PSOE 'tokenized' column.
psoe_nouns = cl.spacy_noun(psoe_phrases, 'tokenized')

In [45]:
# CSV export of the nouns (currently disabled).
#psoe_nouns.to_csv("./data/programs/csv-s/psoe_nouns.csv")

#### Más Madrid

In [46]:
# Load the phrases for the party identified as "Más Madrid".
mas_madrid_phrases = gd.phrases("Más Madrid")

In [47]:
# Add a 'tokenized' column holding the output of cl.tokenize for each phrase.
mas_madrid_phrases['tokenized'] = mas_madrid_phrases['phrases'].apply(cl.tokenize)

In [48]:
# Overwrite 'tokenized' with the result of the stop-word filter.
mas_madrid_phrases['tokenized'] = mas_madrid_phrases['tokenized'].apply(cl.stop_words)

In [49]:
# Build the word-count dataframe from Más Madrid's own 'tokenized' column
# (the PSOE frame was being counted here by mistake).
mas_madrid_phrases_count = cl.count_values(mas_madrid_phrases, 'tokenized')

In [50]:
# Filler words/fragments to remove from the Más Madrid ranking.
to_delete = ["Ción", "N", "La", "Dad", "Así", "Dos", "Co"]

In [51]:
# Remove the words listed in `to_delete` from the count.
mas_madrid_phrases_count = cl.delete_extra_words(mas_madrid_phrases_count, to_delete)

In [52]:
# Run the project's for_streamlit helper on the counts.
mas_madrid_phrases_count = cl.for_streamlit(mas_madrid_phrases_count)

In [53]:
# CSV export of the word count (currently disabled).
#mas_madrid_phrases_count.to_csv("./data/programs/csv-s/mas_madrid_phrases_count.csv")

In [54]:
# Run the project's spaCy verb helper over the Más Madrid 'tokenized' column.
mas_madrid_verbs = cl.spacy_verb(mas_madrid_phrases, 'tokenized')

In [55]:
# CSV export of the verbs (currently disabled).
#mas_madrid_verbs.to_csv("./data/programs/csv-s/mas_madrid_verbs.csv")

In [56]:
# Run the project's spaCy noun helper over the Más Madrid 'tokenized' column.
mas_madrid_nouns = cl.spacy_noun(mas_madrid_phrases, 'tokenized')

In [57]:
# CSV export of the nouns (currently disabled).
#mas_madrid_nouns.to_csv("./data/programs/csv-s/mas_madrid_nouns.csv")

#### Unidas Podemos

In [58]:
# Load the phrases for the party identified as "Unidas Podemos".
unidas_podemos_phrases = gd.phrases("Unidas Podemos")

In [59]:
# Add a 'tokenized' column holding the output of cl.tokenize for each phrase.
unidas_podemos_phrases['tokenized'] = unidas_podemos_phrases['phrases'].apply(cl.tokenize)

In [60]:
# Overwrite 'tokenized' with the result of the stop-word filter.
unidas_podemos_phrases['tokenized'] = unidas_podemos_phrases['tokenized'].apply(cl.stop_words)

In [61]:
# Build the word-count dataframe from the 'tokenized' column.
unidas_podemos_phrases_count = cl.count_values(unidas_podemos_phrases, 'tokenized')

In [62]:
# Filler words and number fragments to remove from the Unidas Podemos ranking.
to_delete = ["Cada", "Ciento", "Manera", "Menos", "Ello", "La", "Se", "Así", "Con", "Para", "En", "Por", "Puedan", "Todas", "Fin", "Si", "Asimismo", "000"]

In [63]:
# Remove the words listed in `to_delete` from the count.
unidas_podemos_phrases_count = cl.delete_extra_words(unidas_podemos_phrases_count, to_delete)

In [64]:
# Run the project's for_streamlit helper on the counts.
unidas_podemos_phrases_count = cl.for_streamlit(unidas_podemos_phrases_count)

In [65]:
# CSV export of the word count (currently disabled).
#unidas_podemos_phrases_count.to_csv("./data/programs/csv-s/unidas_podemos_phrases_count.csv")

In [66]:
# Run the project's spaCy verb helper over the Unidas Podemos 'tokenized' column.
unidas_podemos_verbs = cl.spacy_verb(unidas_podemos_phrases, 'tokenized')

In [67]:
# CSV export of the verbs (currently disabled).
#unidas_podemos_verbs.to_csv("./data/programs/csv-s/unidas_podemos_verbs.csv")

In [68]:
# Run the project's spaCy noun helper over the Unidas Podemos 'tokenized' column.
unidas_podemos_nouns = cl.spacy_noun(unidas_podemos_phrases, 'tokenized')

In [69]:
# CSV export of the nouns (currently disabled).
#unidas_podemos_nouns.to_csv("./data/programs/csv-s/unidas_podemos_nouns.csv")

#### Ciudadanos

In [70]:
# Load the phrases for the party identified as "Ciudadanos".
ciudadanos_phrases = gd.phrases("Ciudadanos")

In [71]:
# Add a 'tokenized' column holding the output of cl.tokenize for each phrase.
ciudadanos_phrases['tokenized'] = ciudadanos_phrases['phrases'].apply(cl.tokenize)

In [72]:
# Overwrite 'tokenized' with the result of the stop-word filter.
ciudadanos_phrases['tokenized'] = ciudadanos_phrases['tokenized'].apply(cl.stop_words)

In [73]:
# Build the word-count dataframe from the 'tokenized' column.
ciudadanos_phrases_count = cl.count_values(ciudadanos_phrases, 'tokenized')

In [74]:
# Numbers and filler words to remove from the Ciudadanos ranking.
to_delete = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "Los", "Impulsaremos", "Garantizaremos", "Apostaremos",
            "Través", "Puedan", "Aseguraremos"]

In [75]:
# Remove the words listed in `to_delete` from the count.
ciudadanos_phrases_count = cl.delete_extra_words(ciudadanos_phrases_count, to_delete)

In [76]:
# Run the project's for_streamlit helper on the counts.
ciudadanos_phrases_count = cl.for_streamlit(ciudadanos_phrases_count)

In [77]:
# CSV export of the word count (currently disabled).
#ciudadanos_phrases_count.to_csv("./data/programs/csv-s/ciudadanos_phrases_count.csv")

In [78]:
# Run the project's spaCy verb helper over the Ciudadanos 'tokenized' column.
ciudadanos_verbs = cl.spacy_verb(ciudadanos_phrases, 'tokenized')

In [79]:
# CSV export of the verbs (currently disabled).
#ciudadanos_verbs.to_csv("./data/programs/csv-s/ciudadanos_verbs.csv")

In [80]:
# Run the project's spaCy noun helper over the Ciudadanos 'tokenized' column.
ciudadanos_nouns = cl.spacy_noun(ciudadanos_phrases, 'tokenized')

In [81]:
# CSV export of the nouns (currently disabled).
#ciudadanos_nouns.to_csv("./data/programs/csv-s/ciudadanos_nouns.csv")

#### Vox

In [82]:
# Load the phrases for the party identified as "Vox".
vox_phrases = gd.phrases("Vox")

In [83]:
# Add a 'tokenized' column holding the output of cl.tokenize for each phrase.
vox_phrases['tokenized'] = vox_phrases['phrases'].apply(cl.tokenize)

In [84]:
# Overwrite 'tokenized' with the result of the stop-word filter.
vox_phrases['tokenized'] = vox_phrases['tokenized'].apply(cl.stop_words)

In [85]:
# Build the word-count dataframe from the 'tokenized' column.
vox_phrases_count = cl.count_values(vox_phrases, 'tokenized')

In [86]:
# Numbers and filler words to remove from the Vox ranking.
to_delete = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "Deben", "Queda",
             "Frente", "Especialmente", "Cometan", "Su", "Pequeñas", "Adecuado"]

In [87]:
# Remove the words listed in `to_delete` from the count.
vox_phrases_count = cl.delete_extra_words(vox_phrases_count, to_delete)

In [88]:
# Run the project's for_streamlit helper on the counts.
vox_phrases_count = cl.for_streamlit(vox_phrases_count)

In [89]:
# CSV export of the word count (currently disabled).
#vox_phrases_count.to_csv("./data/programs/csv-s/vox_phrases_count.csv")

In [90]:
# Run the project's spaCy verb helper over the Vox 'tokenized' column.
vox_verbs = cl.spacy_verb(vox_phrases, 'tokenized')

In [91]:
# CSV export of the verbs (currently disabled).
#vox_verbs.to_csv("./data/programs/csv-s/vox_verbs.csv")

In [92]:
# Run the project's spaCy noun helper over the Vox 'tokenized' column.
vox_nouns = cl.spacy_noun(vox_phrases, 'tokenized')

In [93]:
# CSV export of the nouns (currently disabled).
#vox_nouns.to_csv("./data/programs/csv-s/vox_nouns.csv")

### VERBA RESULTS

### Ángel Gabilondo

In [94]:
# Load the Verba phrases for candidate id 1 (Ángel Gabilondo, per the section heading).
verba_gabilondo = gd.verba_candidate(1)

In [95]:
# Add a 'tokenized' column built from the 'phrase' column (singular here,
# unlike the 'phrases' column of the program frames).
verba_gabilondo['tokenized'] = verba_gabilondo['phrase'].apply(cl.tokenize)

In [96]:
# Overwrite 'tokenized' with the result of the stop-word filter.
verba_gabilondo['tokenized'] = verba_gabilondo['tokenized'].apply(cl.stop_words)

In [97]:
# Build the word-count dataframe under its own name, keeping the phrase frame intact.
verba_gabilondo_count = cl.count_values(verba_gabilondo, 'tokenized')

In [98]:
# Filler words to remove from this candidate's ranking.
to_delete = ["El", "Si", "Y", "La", "Dice", "Hoy", "Presentado", "En", "Va", "Hora"]

In [99]:
# Remove the words listed in `to_delete` from the count.
verba_gabilondo_count = cl.delete_extra_words(verba_gabilondo_count, to_delete)

In [100]:
# Run the project's for_streamlit helper on the counts.
verba_gabilondo_count = cl.for_streamlit(verba_gabilondo_count)

In [101]:
# CSV export of the word count (currently disabled).
#verba_gabilondo_count.to_csv("./data/verba_results/verba_candidates/gabilondo_verba.csv")

### Edmundo Bal 

In [102]:
verba_bal = gd.verba_candidate(2)

In [103]:
verba_bal['tokenized'] = verba_bal['phrase'].apply(cl.tokenize)

In [104]:
verba_bal['tokenized'] = verba_bal['tokenized'].apply(cl.stop_words)

In [105]:
verba_bal = cl.count_values(verba_bal, 'tokenized')

In [106]:
to_delete = ["El", "Y", "La", "Dice", "Tres", "Hoy", "En", "Si", "Tras", "Se", "Mañana"]

In [107]:
verba_bal_count = cl.delete_extra_words(verba_bal, to_delete)

In [108]:
verba_bal_count = cl.for_streamlit(verba_bal_count)

In [109]:
#verba_bal_count.to_csv("./data/verba_results/verba_candidates/bal_verba.csv")

### Isabel Díaz Ayuso

In [110]:
verba_ayuso = gd.verba_candidate(3)

In [111]:
verba_ayuso['tokenized'] = verba_ayuso['phrase'].apply(cl.tokenize)

In [112]:
verba_ayuso['tokenized'] = verba_ayuso['tokenized'].apply(cl.stop_words)

In [113]:
verba_ayuso = cl.count_values(verba_ayuso, 'tokenized')

In [114]:
to_delete = ["El", "Y", "La", "Dice", "Dos", "Hoy", "Si", "En", "No", "Sido", "Los"]

In [115]:
verba_ayuso_count = cl.delete_extra_words(verba_ayuso, to_delete)

In [116]:
verba_ayuso_count = cl.for_streamlit(verba_ayuso_count)

In [117]:
#verba_ayuso_count.to_csv("./data/verba_results/verba_candidates/ayuso_verba.csv")

### Mónica García 

In [118]:
verba_garcia = gd.verba_candidate(4)

In [119]:
verba_garcia['tokenized'] = verba_garcia['phrase'].apply(cl.tokenize)

In [120]:
verba_garcia['tokenized'] = verba_garcia['tokenized'].apply(cl.stop_words)

In [121]:
verba_garcia = cl.count_values(verba_garcia, 'tokenized')

In [122]:
to_delete = ["La", "En", "Días", "El", "Y", "4", "Hoy", "Es", "Años", "Semana", "Ciento", "Real", "Por", "Si", "Solo", "A", "Dos", "Va", "Dice", "Ahora"]

In [123]:
verba_garcia_count = cl.delete_extra_words(verba_garcia, to_delete)

In [124]:
verba_garcia_count = cl.for_streamlit(verba_garcia_count)

In [125]:
#verba_garcia_count.to_csv("./data/verba_results/verba_candidates/garcia_verba.csv")

### Pablo Iglesias

In [126]:
verba_iglesias = gd.verba_candidate(5)

In [127]:
verba_iglesias['tokenized'] = verba_iglesias['phrase'].apply(cl.tokenize)

In [128]:
verba_iglesias['tokenized'] = verba_iglesias['phrase'].apply(cl.tokenize)

In [129]:
verba_iglesias['tokenized'] = verba_iglesias['tokenized'].apply(cl.stop_words)

In [130]:
verba_iglesias = cl.count_values(verba_iglesias, 'tokenized')

In [131]:
to_delete = ["El", "Y", "Más", "Hoy", "Después", "La", "En", "Va", "Dice", "Aunque", "Si", "Dos"]

In [132]:
verba_iglesias_count = cl.delete_extra_words(verba_iglesias, to_delete)

In [133]:
verba_iglesias_count = cl.for_streamlit(verba_iglesias_count)

In [134]:
#verba_iglesias_count.to_csv("./data/verba_results/verba_candidates/iglesias_verba.csv")

### Rocío Monasterio

In [135]:
verba_monasterio = gd.verba_candidate(6)

In [136]:
verba_monasterio['tokenized'] = verba_monasterio['phrase'].apply(cl.tokenize)

In [137]:
verba_monasterio['tokenized'] = verba_monasterio['tokenized'].apply(cl.stop_words)

In [138]:
verba_monasterio = cl.count_values(verba_monasterio, 'tokenized')

In [139]:
to_delete = ["El", "La", "Y", "Tres", "Más", "Dice", "Si", "Ser", "Quiere", "En", "Hoy", "Ciento", "Va", "A"]

In [140]:
verba_monasterio_count = cl.delete_extra_words(verba_monasterio, to_delete)

In [141]:
verba_monasterio_count = cl.for_streamlit(verba_monasterio_count)

In [142]:
#verba_monasterio_count.to_csv("./data/verba_results/verba_candidates/monasterio_verba.csv")