In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

sentence = "We will restore funding to the Global Environment Facility and the Intergovernmental Panel on Climate Change, to support critical climate science research around the world"

inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

logits = model(**inputs).logits

probabilities = torch.softmax(logits, dim=1).tolist()[0]
probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))
print(probabilities)
# {'501 - Environmental Protection: Positive': 67.28, '411 - Technology and Infrastructure': 15.19, '107 - Internationalism: Positive': 13.63, '416 - Anti-Growth Economy: Positive': 2.02...

predicted_class = model.config.id2label[logits.argmax().item()]
print(predicted_class)
# 501 - Environmental Protection: Positive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.59k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

{'501 - Environmental Protection: Positive': 67.28, '411 - Technology and Infrastructure': 15.19, '107 - Internationalism: Positive': 13.63, '416 - Anti-Growth Economy: Positive': 2.02, '506 - Education Expansion': 0.26, '703 - Agriculture and Farmers: Positive': 0.26, '108 - European Community/Union: Positive': 0.21, '101 - Foreign Special Relationships: Positive': 0.16, '410 - Economic Growth: Positive': 0.13, '104 - Military: Positive': 0.06, '402 - Incentives': 0.05, '502 - Culture: Positive': 0.05, '601 - National Way of Life: Positive': 0.05, '201 - Freedom and Human Rights': 0.04, '403 - Market Regulation': 0.04, '407 - Protectionism: Negative': 0.04, '504 - Welfare State Expansion': 0.04, '109 - Internationalism: Negative': 0.03, '301 - Federalism': 0.03, '303 - Governmental and Administrative Efficiency': 0.03, '406 - Protectionism: Positive': 0.03, '408 - Economic Goals': 0.03, '605 - Law and Order: Positive': 0.03, '105 - Military: Negative': 0.02, '106 - Peace': 0.02, '305 

In [3]:
import os

# List all files in the current directory
files = os.listdir('.')
print(files)

['.config', 'drive', 'sample_data']


In [4]:
import os
import pandas as pd

# Define the subfolder
subfolder = 'sample_data'

# Step 1: List all files in the subfolder
files = os.listdir(subfolder)
meta_files = [file for file in files if file.endswith('meta.tsv')]

# Step 2: Initialize an empty list to hold all DataFrames
dfs = []

# Step 3: Read each file into a DataFrame and append to 'dfs'
for file in meta_files:
    file_path = os.path.join(subfolder, file)
    df = pd.read_csv(file_path, sep='\t', index_col=False)  # Assuming files are tab-separated
    dfs.append(df)

# Step 4: Combine all DataFrames into one big DataFrame
combined_df = pd.concat(dfs, ignore_index=True)
df = combined_df

# Now 'df' contains the combined data from all 'meta.tsv' files in the 'sample_data' subfolder


In [None]:
df

Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,...,Speaker_MP,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth
0,ParlaMint-HU-en_2022-06-14,u2022-06-14-0,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,MP,notMinister,KNDP-frakció,Parliamentary group of the Christian Democrati...,Coalition,Right,LatorcaiJanos,"Latorcai, János",M,1944
1,ParlaMint-HU-en_2022-06-14,u2022-06-14-1,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,MP,notMinister,LMP-frakció,Parliamentary group of the LMP – Hungary's Gre...,Opposition,Centre to centre-left,KanaszNagyMate,"Kanász-Nagy, Máté",M,1986
2,ParlaMint-HU-en_2022-06-14,u2022-06-14-2,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,MP,notMinister,KNDP-frakció,Parliamentary group of the Christian Democrati...,Coalition,Right,LatorcaiJanos,"Latorcai, János",M,1944
3,ParlaMint-HU-en_2022-06-14,u2022-06-14-3,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,MP,notMinister,Fidesz-frakció,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,KonczZsofia,"Koncz, Zsófia",F,1990
4,ParlaMint-HU-en_2022-06-14,u2022-06-14-4,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,MP,notMinister,KNDP-frakció,Parliamentary group of the Christian Democrati...,Coalition,Right,LatorcaiJanos,"Latorcai, János",M,1944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7844,ParlaMint-HU-en_2022-11-10,u2022-11-10-236,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,MP,notMinister,DK-frakció,Parliamentary group of the Democratic Coalitio...,Opposition,Centre-left,OlahLajos,"Oláh, Lajos",M,1969
7845,ParlaMint-HU-en_2022-11-10,u2022-11-10-237,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,notMP,notMinister,Fidesz,Fidesz – Hungarian Civic Alliance,-,Right to far-right,VarghaTamas,"Vargha, Tamás",M,1959
7846,ParlaMint-HU-en_2022-11-10,u2022-11-10-238,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,MP,notMinister,DK-frakció,Parliamentary group of the Democratic Coalitio...,Opposition,Centre-left,OlahLajos,"Oláh, Lajos",M,1969
7847,ParlaMint-HU-en_2022-11-10,u2022-11-10-239,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,MP,notMinister,Mi Hazánk-frakció,Parliamentary group of the Our Homeland Moveme...,Opposition,Far-right,NovakElod,"Novák, Előd",M,1980


In [5]:
import os
import pandas as pd

# Define the subfolder
subfolder = 'sample_data'

# Initialize an empty list to store data
data = []

# Step 1: List all files in the subfolder
files = os.listdir(subfolder)
txt_files = [file for file in files if file.endswith('.txt')]

# Step 2: Iterate over each text file
for file in txt_files:
    file_path = os.path.join(subfolder, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Read lines from the text file
        lines = f.read().splitlines()

        # Iterate over each line in the file
        for line in lines:
            # Split each line by tab
            parts = line.split('\t')
            if len(parts) >= 2:
                text_id, text = parts[0], '\t'.join(parts[1:])
                text = text.strip()
                data.append({'text_id': text_id, 'text': text})

# Step 3: Create a DataFrame from the collected data
df_1 = pd.DataFrame(data)

# Now 'df_1' contains the combined data from all '.txt' files in the 'sample_data' subfolder


In [6]:


# Function to remove content within double square brackets
import re
def remove_bracketed_content(text):
  return re.sub(r'\[\[.*?\]\]', '', text)

# Apply the function to the 'text' column
df_1['text'] = df_1['text'].apply(remove_bracketed_content)

new_data=df.merge(df_1, left_on='ID', right_on='text_id')

new_data = new_data[new_data['Speaker_role'] != 'Chairperson']


In [None]:
new_data

Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,...,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,text_id,text
1,ParlaMint-HU-en_2022-06-14,u2022-06-14-1,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,LMP-frakció,Parliamentary group of the LMP – Hungary's Gre...,Opposition,Centre to centre-left,KanaszNagyMate,"Kanász-Nagy, Máté",M,1986,u2022-06-14-1,"Thank you for the floor, Mr President. Dear Ho..."
3,ParlaMint-HU-en_2022-06-14,u2022-06-14-3,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Fidesz-frakció,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,KonczZsofia,"Koncz, Zsófia",F,1990,u2022-06-14-3,Thank you very much for the floor. Mr Presiden...
5,ParlaMint-HU-en_2022-06-14,u2022-06-14-5,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Párbeszéd-frakció,Parliamentary group of the Dialogue for Hungar...,Opposition,Centre-left to left,SzaboRebeka,"Szabó, Rebeka",F,1977,u2022-06-14-5,"Thank you very much. Mr President, Ladies and ..."
7,ParlaMint-HU-en_2022-06-14,u2022-06-14-7,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Fidesz-frakció,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,FarkasSandor,"Farkas, Sándor",M,1953,u2022-06-14-7,"Thank you very much. Mr President, Ladies and ..."
9,ParlaMint-HU-en_2022-06-14,u2022-06-14-9,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Mi Hazánk-frakció,Parliamentary group of the Our Homeland Moveme...,Opposition,Far-right,NovakElod,"Novák, Előd",M,1980,u2022-06-14-9,"Thank you for the floor. Dear Parliament, Ther..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7839,ParlaMint-HU-en_2022-11-10,u2022-11-10-231,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,Mi Hazánk-frakció,Parliamentary group of the Our Homeland Moveme...,Opposition,Far-right,NovakElod,"Novák, Előd",M,1980,u2022-11-10-231,"Thank you for the floor, Mr President. Mr. Vic..."
7841,ParlaMint-HU-en_2022-11-10,u2022-11-10-233,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,DK-frakció,Parliamentary group of the Democratic Coalitio...,Opposition,Centre-left,VadaiAgnes,"Vadai, Ágnes",F,1974,u2022-11-10-233,"Thank you for the floor, Mr President. Dear Ho..."
7843,ParlaMint-HU-en_2022-11-10,u2022-11-10-235,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,JOBBIK-frakció,Parliamentary group of the Movement for a Bett...,Opposition,Centre-right,SasZoltan,"Sas, Zoltán",M,1972,u2022-11-10-235,"Thank you very much for the floor, Mr Presiden..."
7845,ParlaMint-HU-en_2022-11-10,u2022-11-10-237,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,Fidesz,Fidesz – Hungarian Civic Alliance,-,Right to far-right,VarghaTamas,"Vargha, Tamás",M,1959,u2022-11-10-237,"Thank you very much, Mr. President. Dear Parli..."


Unnamed: 0,text_id,text
0,u2023-04-12-0,[[The people in this room stand up and greet t...
1,u2023-04-12-1,"Thank you for the floor. Dear Parliament, Ladi..."
2,u2023-04-12-2,"Thank you very much, Congresswoman Bakos Berna..."
3,u2023-04-12-3,Thank you very much for the floor. Mr Presiden...
4,u2023-04-12-4,"Thank you very much, Mrs Zsófia Koncz. Rebeka ..."
...,...,...
5966,u2023-03-27-288,"Thank you very much, Congressman. Judit Ráczné..."
5967,u2023-03-27-289,"Thank you for the floor, Mr President. Dear Ho..."
5968,u2023-03-27-290,"Thank you, Congresswoman. Csaba Nagy, from the..."
5969,u2023-03-27-291,"Thank you for the floor, Mr President. Ladies ..."


In [None]:
import pandas as pd
url = 'https://raw.githubusercontent.com/stesstesste/SICSS_2024/main/hungarian_speeches/2023/ParlaMint-HU-en_2023-02-28-meta.tsv'
df = pd.read_csv(url, delimiter='\t', index_col=False)


import requests

# URL of the raw TSV file on GitHub
file_url = 'https://raw.githubusercontent.com/stesstesste/SICSS_2024/main/hungarian_speeches/2023/ParlaMint-HU-en_2023-02-28.txt'

# Send a GET request to the URL
response = requests.get(file_url)

# Ensure the request was successful
if response.status_code == 200:
    # Split the content by lines
    lines = response.text.splitlines()

    # Initialize an empty list to store data
    data = []

    # Iterate over the lines
    for line in lines:
        # Split each line by tab
        text_id, text = line.split('\t')
        text = text.strip()
        data.append({'text_id': text_id, 'text': text})

    # Print the first few entries to check




df_1 = pd.DataFrame(data)


In [None]:


# Function to remove content within double square brackets
import re
def remove_bracketed_content(text):
  return re.sub(r'\[\[.*?\]\]', '', text)

# Apply the function to the 'text' column
df_1['text'] = df_1['text'].apply(remove_bracketed_content)

new_data=df.merge(df_1, left_on='ID', right_on='text_id')

new_data = new_data[new_data['Speaker_role'] != 'Chairperson']


In [None]:
new_data

Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,...,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,text_id,text,probabilities
1,ParlaMint-HU-en_2023-02-28,u2023-02-28-1,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the LMP – Hungary's Gre...,Opposition,Centre to centre-left,BakosBernadett,"Bakos, Bernadett",F,1992,u2023-02-28-1,"Thank you for the floor. Dear House, Ladies an...","{'305 - Political Authority': 25.84, '501 - En..."
3,ParlaMint-HU-en_2023-02-28,u2023-02-28-3,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,KonczZsofia,"Koncz, Zsófia",F,1990,u2023-02-28-3,Thank you very much for the floor. Dear Parlia...,"{'407 - Protectionism: Negative': 26.24, '410 ..."
5,ParlaMint-HU-en_2023-02-28,u2023-02-28-5,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Dialogue for Hungar...,Opposition,Centre-left to left,MellarTamas,"Mellar, Tamás",M,1954,u2023-02-28-5,"Thank you for the floor, Mr President. Dear Ho...","{'304 - Political Corruption': 60.8, '202 - De..."
7,ParlaMint-HU-en_2023-02-28,u2023-02-28-7,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,DomotorCsaba,"Dömötör, Csaba",M,1982,u2023-02-28-7,"Thank you for the floor, Mr President. Honoura...",{'303 - Governmental and Administrative Effici...
9,ParlaMint-HU-en_2023-02-28,u2023-02-28-9,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Our Homeland Moveme...,Opposition,Far-right,ToroczkaiLaszlo,"Toroczkai, László",M,1978,u2023-02-28-9,"Thank you for the floor, Mr President. I will ...","{'202 - Democracy': 52.78, '305 - Political Au..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,ParlaMint-HU-en_2023-02-28,u2023-02-28-175,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Hungarian Socialist...,Opposition,Centre-left,GurmaiZita,"Gurmai, Zita",F,1965,u2023-02-28-175,Thank you very much. If we can continue with t...,"{'107 - Internationalism: Positive': 53.99, '3..."
177,ParlaMint-HU-en_2023-02-28,u2023-02-28-177,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Christian Democrati...,Coalition,Right,JuhaszHajnalka,"Juhász, Hajnalka",F,1980,u2023-02-28-177,"Thank you very much for the word, no one with ...","{'107 - Internationalism: Positive': 50.86, '1..."
179,ParlaMint-HU-en_2023-02-28,u2023-02-28-179,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Movement for a Bett...,Opposition,Centre-right,BalassaPeter,"Balassa, Péter",M,1975,u2023-02-28-179,"Thank you, Mr. President. Dear Secretary of St...",{'101 - Foreign Special Relationships: Positiv...
181,ParlaMint-HU-en_2023-02-28,u2023-02-28-181,"Hungarian parliamentary corpus ParlaMint-HU, l...",2023-02-28,Unicameralism,9. ciklus,-,20. ülés,2. ülésnap,-,...,Parliamentary group of the Movement for a Bett...,Opposition,Centre-right,SasZoltan,"Sas, Zoltán",M,1972,u2023-02-28-181,"Thank you very much for the floor, Mr Presiden...","{'104 - Military: Positive': 54.96, '605 - Law..."


In [7]:
import dask.dataframe as dd
from dask import delayed

def compute_probabilities(sentence):
  inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )


  logits = model(**inputs).logits

  probabilities = torch.softmax(logits, dim=1).tolist()[0]
  probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
  probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))

  return probabilities

#ask_df = dd.from_pandas(new_data.head(10), npartitions=50)

#dask_df['probabilities'] = dask_df['text'].apply(lambda x: delayed(compute_probabilities)(x), meta=('x', 'object'))

#result = dask_df.compute()

In [8]:
import multiprocessing

def apply_function_multiprocessing(df, func, num_processes=None):
    if num_processes is None:
        num_processes = multiprocessing.cpu_count() - 1  # Use all available cores except one

    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.map(func, df['text'])

    return results

# Apply the function using multiprocessing
results = apply_function_multiprocessing(new_data, compute_probabilities)


91

In [9]:
new_data['probabilities'] = results

In [None]:
new_data

Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,...,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,text_id,text,probabilities
1,ParlaMint-HU-en_2022-06-14,u2022-06-14-1,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Parliamentary group of the LMP – Hungary's Gre...,Opposition,Centre to centre-left,KanaszNagyMate,"Kanász-Nagy, Máté",M,1986,u2022-06-14-1,"Thank you for the floor, Mr President. Dear Ho...","{'305 - Political Authority': 61.36, '503 - Eq..."
3,ParlaMint-HU-en_2022-06-14,u2022-06-14-3,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,KonczZsofia,"Koncz, Zsófia",F,1990,u2022-06-14-3,Thank you very much for the floor. Mr Presiden...,{'501 - Environmental Protection: Positive': 7...
5,ParlaMint-HU-en_2022-06-14,u2022-06-14-5,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Parliamentary group of the Dialogue for Hungar...,Opposition,Centre-left to left,SzaboRebeka,"Szabó, Rebeka",F,1977,u2022-06-14-5,"Thank you very much. Mr President, Ladies and ...",{'501 - Environmental Protection: Positive': 7...
7,ParlaMint-HU-en_2022-06-14,u2022-06-14-7,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Parliamentary group of the Fidesz – Hungarian ...,Coalition,Right to far-right,FarkasSandor,"Farkas, Sándor",M,1953,u2022-06-14-7,"Thank you very much. Mr President, Ladies and ...",{'501 - Environmental Protection: Positive': 6...
9,ParlaMint-HU-en_2022-06-14,u2022-06-14-9,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-06-14,Unicameralism,9. ciklus,-,6. ülés,2. ülésnap,-,...,Parliamentary group of the Our Homeland Moveme...,Opposition,Far-right,NovakElod,"Novák, Előd",M,1980,u2022-06-14-9,"Thank you for the floor. Dear Parliament, Ther...","{'504 - Welfare State Expansion': 61.96, '603 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7839,ParlaMint-HU-en_2022-11-10,u2022-11-10-231,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,Parliamentary group of the Our Homeland Moveme...,Opposition,Far-right,NovakElod,"Novák, Előd",M,1980,u2022-11-10-231,"Thank you for the floor, Mr President. Mr. Vic...","{'105 - Military: Negative': 60.46, '305 - Pol..."
7841,ParlaMint-HU-en_2022-11-10,u2022-11-10-233,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,Parliamentary group of the Democratic Coalitio...,Opposition,Centre-left,VadaiAgnes,"Vadai, Ágnes",F,1974,u2022-11-10-233,"Thank you for the floor, Mr President. Dear Ho...","{'104 - Military: Positive': 51.23, '107 - Int..."
7843,ParlaMint-HU-en_2022-11-10,u2022-11-10-235,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,Parliamentary group of the Movement for a Bett...,Opposition,Centre-right,SasZoltan,"Sas, Zoltán",M,1972,u2022-11-10-235,"Thank you very much for the floor, Mr Presiden...","{'104 - Military: Positive': 81.63, '107 - Int..."
7845,ParlaMint-HU-en_2022-11-10,u2022-11-10-237,"Hungarian parliamentary corpus ParlaMint-HU, l...",2022-11-10,Unicameralism,9. ciklus,-,16. ülés,3. ülésnap,-,...,Fidesz – Hungarian Civic Alliance,-,Right to far-right,VarghaTamas,"Vargha, Tamás",M,1959,u2022-11-10-237,"Thank you very much, Mr. President. Dear Parli...","{'104 - Military: Positive': 59.15, '305 - Pol..."


In [11]:
os.listdir()

['.config', 'drive', '2022_data_probabilities.csv', 'sample_data']

In [10]:

#from google.colab import drive
#drive.mount('/content/drive')

# Save to Google Drive
file_path = '2022_data_probabilities.csv'
 # Replace with your desired file path
new_data.to_csv(file_path)
print('exported 2022')

exported 2022


In [None]:
import dask.dataframe as dd
from dask import delayed, compute

import time
new_data_dd = dd.from_pandas(new_data.head(3), npartitions=8)  # Adjust npartitions based on your machine's cores

# Define a delayed function to compute probabilities
@delayed
def compute_probabilities_delayed(text):
    return compute_probabilities(text)

# Apply the function to the Dask DataFrame
prob_delayed = new_data_dd['text'].apply(compute_probabilities_delayed, meta=('text', 'f8'))

start_time = time.time()
# Compute the results
prob = compute(prob_persisted)
end_time = time.time()




# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"Time taken to run the computation: {elapsed_time:.2f} seconds")

# Flatten the list of results

NameError: name 'prob_persisted' is not defined

In [None]:
new_data_dd = dd.from_pandas(new_data[:5], npartitions=8)  # Adjust npartitions based on your machine's cores

# Define a delayed function to compute probabilities
@delayed
def compute_probabilities_delayed(text):
    return compute_probabilities(text)
# Use map_partitions for better performance with large DataFrames
prob_delayed = new_data_dd.map_partitions(lambda df: df['text'].apply(compute_probabilities_delayed), meta=('text', 'f8'))

# Persist the intermediate results in memory
prob_persisted = prob_delayed.persist()

start_time = time.time()
# Compute the results
prob = compute(prob_persisted)
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"Time taken to run the computation: {elapsed_time:.2f} seconds")

TypeError: Truth of Delayed objects is not supported

In [None]:
import dask.dataframe as dd
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import time
new_data_dd = dd.from_pandas(new_data, npartitions=8)



# Define a delayed function to compute probabilities
@dd.delayed
def compute_probabilities_delayed(text):
    return compute_probabilities(text)

# Apply the function to the Dask DataFrame
prob_delayed = new_data_dd['text'].apply(compute_probabilities_delayed, meta=('text', 'object'))

# Persist the intermediate results in memory
prob_persisted = prob_delayed.persist()

# Compute the results
start_time = time.time()
prob = prob_persisted.compute()
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

print(f"Time taken to run the computation: {elapsed_time:.2f} seconds")

# Print the first few results
print(prob)

AttributeError: module 'dask.dataframe' has no attribute 'delayed'

260

In [None]:
sentence=new_data['text'][101]
inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )

logits = model(**inputs).logits

probabilities = torch.softmax(logits, dim=1).tolist()[0]
probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))
#print(probabilities)
# {'501 - Environmental Protection: Positive': 67.28, '411 - Technology and Infrastructure': 15.19, '107 - Internationalism: Positive': 13.63, '416 - Anti-Growth Economy: Positive': 2.02...

predicted_class = model.config.id2label[logits.argmax().item()]
#print(predicted_class)
# 501 - Environmental Protection: Positive


{'301 - Federalism': 73.12, '305 - Political Authority': 6.67, '303 - Governmental and Administrative Efficiency': 5.47, '414 - Economic Orthodoxy': 2.33, '304 - Political Corruption': 2.27, '302 - Centralisation': 1.98, '202 - Democracy': 1.73, '606 - Civic Mindedness: Positive': 1.47, '504 - Welfare State Expansion': 0.96, '204 - Constitutionalism: Negative': 0.81, '503 - Equality: Positive': 0.46, '505 - Welfare State Limitation': 0.32, '203 - Constitutionalism: Positive': 0.23, '408 - Economic Goals': 0.22, '201 - Freedom and Human Rights': 0.19, '404 - Economic Planning': 0.19, '401 - Free Market Economy': 0.1, '601 - National Way of Life: Positive': 0.1, '605 - Law and Order: Positive': 0.1, '502 - Culture: Positive': 0.09, '506 - Education Expansion': 0.08, '402 - Incentives': 0.07, '602 - National Way of Life: Negative': 0.07, '607 - Multiculturalism: Positive': 0.07, '507 - Education Limitation': 0.06, '103 - Anti-Imperialism': 0.05, '110 - European Community/Union: Negative':

In [None]:
import dask.dataframe as dd
from dask import delayed

def compute_probabilities(sentence):
  inputs = tokenizer(sentence,
                   return_tensors="pt",
                   max_length=200,  #we limited the input to 200 tokens during finetuning
                   padding="max_length",
                   truncation=True
                   )


  logits = model(**inputs).logits

  probabilities = torch.softmax(logits, dim=1).tolist()[0]
  probabilities = {model.config.id2label[index]: round(probability * 100, 2) for index, probability in enumerate(probabilities)}
  probabilities = dict(sorted(probabilities.items(), key=lambda item: item[1], reverse=True))

  return probabilities

dask_df = dd.from_pandas(new_data.head(10), npartitions=3)

dask_df['probabilities'] = dask_df['text'].apply(lambda x: delayed(compute_probabilities)(x), meta=('x', 'object'))

result = dask_df.compute()