<a href="https://colab.research.google.com/github/Vincenzo-Miracula/TallerPratico/blob/main/EscuelaGobierno.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Librerías

In [None]:
!pip install transformers

In [15]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
tqdm.pandas()
import re
from transformers import BertForSequenceClassification, BertTokenizer

# Dataframe y análisis de datos

In [16]:
# Excel workbook hosted in the workshop's GitHub repository.
workbook_url = 'https://github.com/Vincenzo-Miracula/TallerPratico/raw/main/MadridEdG.xlsx'
df = pd.read_excel(workbook_url)

In [None]:
# Map the long question headers onto short column names.
# rename() ignores keys that match no column, so a mistyped header is skipped silently.
# NOTE(review): the third key ends in "significativos1" -- confirm it matches the workbook header.
short_names = {
    '¿Considera que el proyecto ha contribuido a generar un cambio positivo en el contexto de referencia desde la conclusión de las actividades hasta hoy?': 'texto1',
    '¿Cómo sigue colaborando el sujeto responsable con las organizaciones asociadas al partenariado del proyecto?': 'texto2',
    '¿Cuáles han sido, si los hay, los cambios significativos1': 'texto3',
}
df.rename(columns=short_names, inplace=True)

In [None]:
# Display the DataFrame to inspect its contents.
df

# sentiment-emotion

In [None]:
# Emoji, pictographs and the joiner / variation-selector characters that accompany them.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"
                            "\U0001F680-\U0001F6FF"
                            "\U0001F1E0-\U0001F1FF"
                            "\U00002500-\U00002BEF"
                            "\U00002702-\U000027B0"
                            "\U000024C2-\U0001F251"
                            "\U0001f926-\U0001f937"
                            "\U00010000-\U0010ffff"
                            "\u2640-\u2642"
                            "\u2600-\u2B55"
                            "\u200d"
                            "\u23cf"
                            "\u23e9"
                            "\u231a"
                            "\ufe0f"
                            "\u3030"
                            "]+", flags=re.UNICODE)


def clean(text):
    """Normalise a free-text answer into lowercase, letters-only text.

    Steps, in order: strip HTML tags, emoji, URLs and @mentions; replace every
    character that is not a Latin letter, whitespace or apostrophe with a
    space; drop the standalone retweet marker "RT"; remove the remaining
    non-word characters (apostrophes included); collapse whitespace; lowercase.

    Parameters
    ----------
    text : str or any
        Raw text. Non-string values (for example NaN or None coming from empty
        spreadsheet cells) are treated as missing.

    Returns
    -------
    str
        The cleaned text, or '' when the input is not a string.
    """
    # Empty spreadsheet cells arrive as float NaN / None; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'<[^>]+>', '', text)
    text = _EMOJI_PATTERN.sub('', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)
    text = re.sub(r'@\w+\s*', '', text)
    # Everything that is not a Latin letter, whitespace or apostrophe becomes a space.
    # This already strips '#', '&', ';' and other punctuation, which is why no
    # later hashtag / HTML-entity / punctuation pass is needed (hashtag words are kept).
    text = re.sub(r"[^A-Za-zÀ-ÿ\s']+", ' ', text)
    # Only drop "RT" as a standalone token; a bare 'RT' pattern also ate the
    # letters inside uppercase words such as "IMPORTANTE".
    text = re.sub(r'\bRT\b', '', text)
    text = re.sub(r'[^\w\s]+', '', text)
    # Newlines are folded into single spaces here, so words on adjacent lines stay separate.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

In [None]:
# Clean every answer in 'texto1' (with a progress bar) and store the result in a new column.
raw_answers = df['texto1']
df['texto1Limpio'] = raw_answers.progress_apply(clean)

In [None]:
# Display the cleaned text column.
df['texto1Limpio']

## primer modelo

In [None]:
# Labels the zero-shot model ranks for each text.
candidate_labels = ["Positive", "Neutral", "Negative"]
# Zero-shot classification pipeline built from the named checkpoint.
zero_shot_checkpoint = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
classifier = pipeline("zero-shot-classification", model=zero_shot_checkpoint)

In [None]:
def sentiment_model1(text, labels=None, zero_shot=None):
    """Return the top-ranked zero-shot sentiment label for ``text``.

    Parameters
    ----------
    text : str
        Text to classify.
    labels : list of str, optional
        Candidate labels; defaults to the notebook-level ``candidate_labels``.
    zero_shot : callable, optional
        Zero-shot pipeline to call; defaults to the notebook-level ``classifier``.

    Returns
    -------
    str or None
        The first entry of the pipeline's ``labels`` list (the pipeline returns
        labels ordered by likelihood), or None when ``text`` is blank or not a
        string, since there is nothing meaningful to score.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    labels = candidate_labels if labels is None else labels
    zero_shot = classifier if zero_shot is None else zero_shot
    # The labels are mutually exclusive, so scores are normalised across them
    # (multi_label=False). The option has to be passed by keyword.
    result = zero_shot(text, labels, multi_label=False)
    return result['labels'][0]

In [None]:
# Smoke-test the classifier on a short sample sentence; an empty string gives
# the model nothing to score.
sentiment_model1('el proyecto ha mejorado la vida de la comunidad')

In [None]:
# Label every cleaned answer with the zero-shot model.
# NOTE: the second model further down writes to this same 'SentimentText1'
# column and replaces these labels.
df['SentimentText1'] = df['texto1Limpio'].progress_apply(sentiment_model1)

## segundo modelo

In [None]:
# Load the tokenizer and the classification model from the same checkpoint,
# then wrap both in a sentiment-analysis pipeline.
sabert_checkpoint = "VerificadoProfesional/SaBERT-Spanish-Sentiment-Analysis"
tokenizer = BertTokenizer.from_pretrained(sabert_checkpoint)
model = BertForSequenceClassification.from_pretrained(sabert_checkpoint)
sentiment_task = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
def sentiment_model2(text, max_length=512, task=None):
    """Return the sentiment label predicted by the text-classification pipeline.

    Parameters
    ----------
    text : str
        Text to classify.
    max_length : int, optional
        Maximum number of tokens kept by the tokenizer (default 512).
    task : callable, optional
        Pipeline to call; defaults to the notebook-level ``sentiment_task``.

    Returns
    -------
    str or None
        The ``label`` of the first prediction, or None when ``text`` is blank
        or not a string.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    task = sentiment_task if task is None else task
    # Truncate by tokens inside the tokenizer rather than slicing characters:
    # text[:512] threw away text that would still fit and did not strictly
    # bound the token count.
    result = task(text, truncation=True, max_length=max_length)
    return result[0]['label']

In [None]:
# Label every cleaned answer with the second model and store the labels in 'SentimentText1'.
model2_labels = df['texto1Limpio'].progress_apply(sentiment_model2)
df['SentimentText1'] = model2_labels

In [None]:
# Count how many rows carry each sentiment label.
df['SentimentText1'].value_counts()

# scraping

In [None]:
#Esta línea importa la clase BeautifulSoup de la biblioteca bs4 y le asigna un alias 'bs'
from bs4 import BeautifulSoup as bs
#Esta línea importa la biblioteca requests, que se utiliza para realizar solicitudes HTTP a páginas web
import requests
# Importar la biblioteca 'time' para manejar el tiempo en el script.
import time

In [None]:
# Send a browser-like User-Agent header with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
# Fetch the news listing page. The timeout keeps the cell from hanging forever
# on a stalled connection.
response = requests.get('https://www.plenainclusion.org/noticias/', headers=headers, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
# Parse the HTML body of the response.
soup = bs(response.content, "html.parser")
# Collect the <div> elements for badge, text and excerpt by CSS class.
# NOTE(review): 'elementor-post__text' presumably wraps more than the title --
# confirm against the page markup if only the title is wanted.
badges = soup.find_all('div', {'class': 'elementor-post__badge'})
titulos = soup.find_all("div", {'class':"elementor-post__text"})
textos = soup.find_all("div", {'class':"elementor-post__excerpt"})

In [None]:
# One [badge, title, excerpt] row per post; zip stops at the shortest of the three lists.
data = [
    [badge.text.strip(), titulo.text.strip(), texto.text.strip()]
    for badge, titulo, texto in zip(badges, titulos, textos)
]

In [None]:
# Show the scraped rows; the commented-out call would wrap them in a DataFrame with named columns.
data #pd.DataFrame(data, columns=['Tag', 'Titulo', 'Texto'])

In [None]:
# Listing URL template; the placeholder receives the page number.
url = "https://www.plenainclusion.org/noticias/?sf_paged={}"
# Send a browser-like User-Agent header with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
# Inclusive range of listing pages to fetch.
start_page=1
end_page=15

# Accumulates one [badge, title, excerpt] row per post across all pages.
data = []

for page_num in tqdm(range(start_page, end_page + 1)):
  url_pagina = url.format(page_num)
  # Pass headers by keyword: the second positional parameter of requests.get is
  # `params`, so a positional dict ends up in the query string and the
  # User-Agent is never sent. The timeout avoids hanging on a stalled connection.
  response = requests.get(url_pagina, headers=headers, timeout=30)
  if not response.ok:
    # Keep the crawl best-effort: report the failing page and continue with the next one.
    tqdm.write("Skipping page {}: HTTP {}".format(page_num, response.status_code))
    continue
  soup = bs(response.content, "html.parser")

  # Collect the <div> elements for badge, text and excerpt by CSS class.
  badges = soup.find_all('div', {'class': 'elementor-post__badge'})
  titulos = soup.find_all("div", {'class':"elementor-post__text"})
  textos = soup.find_all("div", {'class':"elementor-post__excerpt"})
  # zip stops at the shortest list, so a post missing one element shifts the pairing.
  for badge, titulo, texto in zip(badges, titulos, textos):
    badge = badge.text.strip()
    titulo = titulo.text.strip()
    texto = texto.text.strip()
    data.append([badge, titulo, texto])

In [None]:
# Show the rows gathered from all pages; the commented-out call would wrap them in a DataFrame with named columns.
data #pd.DataFrame(data, columns=['Tag', 'Titulo', 'Texto'])