<a href="https://colab.research.google.com/github/stesstesste/SICSS_2024/blob/main/SICSS_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned classifier and the tokenizer of its base model.
model = AutoModelForSequenceClassification.from_pretrained("manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

sentence = "We will restore funding to the Global Environment Facility and the Intergovernmental Panel on Climate Change, to support critical climate science research around the world"

inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

# Inference only: disable autograd so the forward pass does not record a
# computation graph that is never used for a backward pass.
with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the class dimension, then keep the single row of this batch of one.
probabilities = torch.softmax(logits, dim=1).tolist()[0]
# Map class index -> label name, expressing each probability as a percentage with 2 decimals.
probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
# Most probable label first.
probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))
print(probabilities)
# {'501 - Environmental Protection: Positive': 67.28, '411 - Technology and Infrastructure': 15.19, '107 - Internationalism: Positive': 13.63, '416 - Anti-Growth Economy: Positive': 2.02...

# Index of the highest logit along the class dimension (same dim as the softmax above).
predicted_class = model.config.id2label[logits.argmax(dim=1).item()]
print(predicted_class)
# 501 - Environmental Protection: Positive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.59k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

{'501 - Environmental Protection: Positive': 67.28, '411 - Technology and Infrastructure': 15.19, '107 - Internationalism: Positive': 13.63, '416 - Anti-Growth Economy: Positive': 2.02, '506 - Education Expansion': 0.26, '703 - Agriculture and Farmers: Positive': 0.26, '108 - European Community/Union: Positive': 0.21, '101 - Foreign Special Relationships: Positive': 0.16, '410 - Economic Growth: Positive': 0.13, '104 - Military: Positive': 0.06, '402 - Incentives': 0.05, '502 - Culture: Positive': 0.05, '601 - National Way of Life: Positive': 0.05, '201 - Freedom and Human Rights': 0.04, '403 - Market Regulation': 0.04, '407 - Protectionism: Negative': 0.04, '504 - Welfare State Expansion': 0.04, '109 - Internationalism: Negative': 0.03, '301 - Federalism': 0.03, '303 - Governmental and Administrative Efficiency': 0.03, '406 - Protectionism: Positive': 0.03, '408 - Economic Goals': 0.03, '605 - Law and Order: Positive': 0.03, '105 - Military: Negative': 0.02, '106 - Peace': 0.02, '305 

In [None]:
import pandas as pd
# Tab-separated "-meta.tsv" file for the 2023-02-28 sitting, read straight from GitHub.
url = 'https://raw.githubusercontent.com/stesstesste/SICSS_2024/main/hungarian_speeches/2023/ParlaMint-HU-en_2023-02-28-meta.tsv'
df = pd.read_csv(url, sep='\t')


In [None]:
import requests

# URL of the raw text file on GitHub; each line is parsed as "<text_id><TAB><text>".
file_url = 'https://raw.githubusercontent.com/stesstesste/SICSS_2024/main/hungarian_speeches/2023/ParlaMint-HU-en_2023-02-28.txt'

# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(file_url, timeout=30)

# Fail loudly on an HTTP error. Silently skipping the parse would leave `data`
# undefined and only surface later as a NameError when the DataFrame is built.
response.raise_for_status()

# One {'text_id', 'text'} record per non-blank line.
data = []
for line in response.text.splitlines():
    if not line.strip():
        continue  # a blank line carries no record
    # Split on the first tab only, so a tab inside the text stays part of the text.
    # A line without any tab still raises ValueError, as before.
    text_id, text = line.split('\t', 1)
    data.append({'text_id': text_id, 'text': text.strip()})



In [None]:



# One row per parsed record. Naming the columns explicitly keeps `text_id` and
# `text` present even when `data` is empty, so a merge on `text_id` still works.
df_1 = pd.DataFrame(data, columns=['text_id', 'text'])

In [None]:

# Attach the text records to the metadata rows; only IDs found in both tables are kept (inner join).
new_data = df.merge(df_1, how='inner', left_on='Text_ID', right_on='text_id')

In [None]:
import dask.dataframe as dd
from dask import delayed

def compute_probabilities(sentence: str) -> dict:
  """Score one text against every label of the notebook-level classifier.

  Uses the global `tokenizer` and `model` objects.

  Args:
    sentence: Text to classify. The input is padded/truncated to exactly
      200 tokens, so anything past that limit is ignored.

  Returns:
    dict mapping each label in `model.config.id2label` to its softmax
    probability as a percentage (rounded to 2 decimals), ordered from the
    most to the least probable label.
  """
  inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

  # Inference only: disable autograd so no computation graph is recorded per call.
  with torch.no_grad():
    logits = model(**inputs).logits

  # Softmax over the class dimension; [0] picks the single row of this batch of one.
  probabilities = torch.softmax(logits, dim=1).tolist()[0]
  probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
  # Most probable label first.
  probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))

  return probabilities

# Trial run on the first 10 merged rows only.
# NOTE(review): 50 partitions for 10 rows looks accidental — TODO pick a value no larger than the row count.
dask_df = dd.from_pandas(new_data.head(10), npartitions=50)

# Pass the function itself. Wrapping each call in `delayed(...)` stored lazy
# Delayed objects in the column, and `compute()` below does not evaluate objects
# created inside a task, so the probabilities were never actually produced.
dask_df['probabilities'] = dask_df['text'].apply(compute_probabilities, meta=('probabilities', 'object'))

# Materialise as a pandas DataFrame; each 'probabilities' cell is the dict returned by compute_probabilities.
result = dask_df.compute()

In [None]:
# Pick the text of the row labelled 101 as the example to classify.
sentence = new_data.loc[101, 'text']

In [None]:
# Number of rows that survived the merge of the metadata and text tables.
len(new_data)

260

In [None]:
# Display the selected text in full to eyeball what will be classified.
sentence

'Köszönöm a szót, elnök úr. Tisztelt Országgyűlés! Tisztelt Képviselő Úr! Ön hivatkozott Lázár úr mondataira. Azt gondolom, hogy abban nagy igazság van; nyilvánvalóan fontos dolog a pártpolitika, a politikai berendezkedésünk, a működési alapunkban jelentős szerepet szánt az élet a pártpolitikának. Ugyanakkor azt hozzátehetem, hozzáteszem, hogy nem a mi kormányunk önkormányzatokért felelős minisztere tett olyan kijelentést, amelyben a pártpolitikát helyezte az önkormányzatok támogatásában meghatározónak. Az önkormányzatok finanszírozása egyébként valóban régóta komoly viták kereszttüzében állt. Ha megnézzük az önkormányzati finanszírozást az elmúlt 24 évben, azt láthatjuk, hogy a 2000-es években jutottak el az önkormányzatok a működőképességük határáig. A 2000-es években, 2004-2005-től 2009-ig alakult ki az a nagyméretű adósságállomány, amelyről a képviselő úr is beszélt. Konkrétan Pécs esetében is 2005, 2006, 2007 voltak azok az évek, amikor a város leginkább eladósodott, és ezért kell

In [None]:
# Classify the text of the row labelled 101.
sentence = new_data.loc[101, 'text']
inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True  # text beyond 200 tokens is cut off and not scored
                   )

# Inference only: disable autograd so the forward pass does not record an unused graph.
with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the class dimension; [0] picks the single row of this batch of one.
probabilities = torch.softmax(logits, dim=1).tolist()[0]
# Map class index -> label name, as a percentage with 2 decimals.
probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
# Most probable label first.
probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))
# Recorded run for this text: {'301 - Federalism': 73.12, '305 - Political Authority': 6.67, '303 - Governmental and Administrative Efficiency': 5.47, ...

# Index of the highest logit along the class dimension (same dim as the softmax above).
predicted_class = model.config.id2label[logits.argmax(dim=1).item()]


{'301 - Federalism': 73.12, '305 - Political Authority': 6.67, '303 - Governmental and Administrative Efficiency': 5.47, '414 - Economic Orthodoxy': 2.33, '304 - Political Corruption': 2.27, '302 - Centralisation': 1.98, '202 - Democracy': 1.73, '606 - Civic Mindedness: Positive': 1.47, '504 - Welfare State Expansion': 0.96, '204 - Constitutionalism: Negative': 0.81, '503 - Equality: Positive': 0.46, '505 - Welfare State Limitation': 0.32, '203 - Constitutionalism: Positive': 0.23, '408 - Economic Goals': 0.22, '201 - Freedom and Human Rights': 0.19, '404 - Economic Planning': 0.19, '401 - Free Market Economy': 0.1, '601 - National Way of Life: Positive': 0.1, '605 - Law and Order: Positive': 0.1, '502 - Culture: Positive': 0.09, '506 - Education Expansion': 0.08, '402 - Incentives': 0.07, '602 - National Way of Life: Negative': 0.07, '607 - Multiculturalism: Positive': 0.07, '507 - Education Limitation': 0.06, '103 - Anti-Imperialism': 0.05, '110 - European Community/Union: Negative':

In [None]:
import dask.dataframe as dd
from dask import delayed

def compute_probabilities(sentence: str) -> dict:
  """Score one text against every label of the notebook-level classifier.

  Uses the global `tokenizer` and `model` objects.

  Args:
    sentence: Text to classify. The input is padded/truncated to exactly
      200 tokens, so anything past that limit is ignored.

  Returns:
    dict mapping each label in `model.config.id2label` to its softmax
    probability as a percentage (rounded to 2 decimals), ordered from the
    most to the least probable label.
  """
  inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

  # Inference only: disable autograd so no computation graph is recorded per call.
  with torch.no_grad():
    logits = model(**inputs).logits

  # Softmax over the class dimension; [0] picks the single row of this batch of one.
  probabilities = torch.softmax(logits, dim=1).tolist()[0]
  probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
  # Most probable label first.
  probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))

  return probabilities

# Trial run on the first 10 merged rows, split into 3 partitions.
dask_df = dd.from_pandas(new_data.head(10), npartitions=3)

# Pass the function itself. Wrapping each call in `delayed(...)` stored lazy
# Delayed objects in the column, and `compute()` below does not evaluate objects
# created inside a task, so the probabilities were never actually produced.
dask_df['probabilities'] = dask_df['text'].apply(compute_probabilities, meta=('probabilities', 'object'))

# Materialise as a pandas DataFrame; each 'probabilities' cell is the dict returned by compute_probabilities.
result = dask_df.compute()