In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/unlp-2025-shared-task-span-identification/sample_submission.csv
/kaggle/input/unlp-2025-shared-task-span-identification/README.md
/kaggle/input/unlp-2025-shared-task-span-identification/train.parquet
/kaggle/input/unlp-2025-shared-task-span-identification/test.csv


In [2]:
# Load datasets
train_df = pd.read_parquet('/kaggle/input/unlp-2025-shared-task-span-identification/train.parquet')
test_df = pd.read_csv('/kaggle/input/unlp-2025-shared-task-span-identification/test.csv')
sample_submission_df = pd.read_csv('/kaggle/input/unlp-2025-shared-task-span-identification/sample_submission.csv')

# Display the first few rows of each dataset to inspect their structure
print("Train DataFrame:")
print(train_df.head())

print("\nTest DataFrame:")
print(test_df.head())

print("\nSample Submission DataFrame:")
print(sample_submission_df.head())


Train DataFrame:
                                     id  \
0  0bb0c7fa-101b-4583-a5f9-9d503339141c   
1  7159f802-6f99-4e9d-97bd-6f565a4a0fae   
2  e6a427f1-211f-405f-bd8b-70798458d656   
3  1647a352-4cd3-40f6-bfa1-d87d42e34eea   
4  9c01de00-841f-4b50-9407-104e9ffb03bf   

                                             content lang  manipulative  \
0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...   uk          True   
1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...   ru          True   
2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...   uk          True   
3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...   uk         False   
4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...   ru          True   

                          techniques  \
0        [euphoria, loaded_language]   
1  [loaded_lang

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("hfsb")

In [4]:
# Step 1: Handle missing values in 'trigger_words' and 'techniques'
train_df['trigger_words'] = train_df['trigger_words'].apply(lambda x: eval(x) if isinstance(x, str) else [])
train_df['techniques'] = train_df['techniques'].apply(lambda x: eval(x) if isinstance(x, str) else [])

# Step 2: Tokenization (using HuggingFace's tokenizer)
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Replace '<your_token>' with the retrieved token from Kaggle Secrets
tokenizer = AutoTokenizer.from_pretrained('SpanBERT/spanbert-large-cased', use_auth_token=secret_value_0)
model = AutoModelForTokenClassification.from_pretrained('SpanBERT/spanbert-large-cased', use_auth_token=secret_value_0)


# Function to tokenize and handle truncation
def tokenize_with_truncation(text, max_length=512):
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_length)
    return tokens

# Tokenize the 'content' column with truncation
train_df['tokens'] = train_df['content'].apply(lambda x: tokenize_with_truncation(x))

# Step 3: Convert 'manipulative' target to numeric values
train_df['manipulative'] = train_df['manipulative'].astype(int)

# Step 4: Ensure the 'trigger_words' spans are correctly formatted for token-based indices
def get_tokenized_trigger_words(text, trigger_words, tokenizer, max_length=512):
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_length)
    trigger_word_indices = []
    for span in trigger_words:
        start, end = span
        # Adjust start and end positions based on truncation
        start_token_index = len(tokenizer.encode(text[:start], add_special_tokens=False, truncation=True, max_length=max_length))
        end_token_index = len(tokenizer.encode(text[:end], add_special_tokens=False, truncation=True, max_length=max_length))
        trigger_word_indices.append([start_token_index, end_token_index])
    return trigger_word_indices

train_df['tokenized_trigger_words'] = train_df.apply(
    lambda row: get_tokenized_trigger_words(row['content'], row['trigger_words'], tokenizer),
    axis=1
)

# Check the processed data
print(train_df[['content', 'tokens', 'tokenized_trigger_words', 'manipulative']].head())




config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/665M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                             content  \
0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...   
1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...   
2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...   
3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...   
4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...   

                                              tokens tokenized_trigger_words  \
0  [101, 488, 24625, 17424, 17424, 489, 28395, 28...                      []   
1  [101, 488, 19692, 28396, 10286, 28394, 17127, ...                      []   
2  [101, 100, 493, 17424, 28401, 498, 10286, 2840...                      []   
3  [101, 477, 494, 28399, 20442, 10286, 28418, 17...                      []   
4  [101, 491, 10286, 28403, 28409, 19692, 28404, ...                      []   

   m

In [5]:
# Check if there are any empty trigger words
empty_trigger_count = train_df[train_df['tokenized_trigger_words'].apply(lambda x: len(x) == 0)].shape[0]
print(f"Number of rows with empty trigger words: {empty_trigger_count}")

# Handle rows with no trigger words (optionally, we could mark them as 'None' or fill them)
train_df['tokenized_trigger_words'] = train_df['tokenized_trigger_words'].apply(lambda x: x if len(x) > 0 else None)

# Check the result again
print(train_df[['content', 'tokenized_trigger_words']].head())


Number of rows with empty trigger words: 3822
                                             content tokenized_trigger_words
0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...                    None
1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...                    None
2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...                    None
3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...                    None
4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...                    None


In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Replace '<your_token>' with the retrieved token from Kaggle Secrets
tokenizer = AutoTokenizer.from_pretrained('SpanBERT/spanbert-large-cased', use_auth_token=secret_value_0)
model = AutoModelForTokenClassification.from_pretrained('SpanBERT/spanbert-large-cased', use_auth_token=secret_value_0)


# Tokenize the 'content' column
train_df['tokenized_content'] = train_df['content'].apply(lambda x: tokenizer.encode(x, truncation=True, padding='max_length', max_length=512))

# Check the result
print(train_df[['content', 'tokenized_content']].head())

Some weights of BertForTokenClassification were not initialized from the model checkpoint at SpanBERT/spanbert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                             content  \
0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...   
1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...   
2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...   
3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...   
4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...   

                                   tokenized_content  
0  [101, 488, 24625, 17424, 17424, 489, 28395, 28...  
1  [101, 488, 19692, 28396, 10286, 28394, 17127, ...  
2  [101, 100, 493, 17424, 28401, 498, 10286, 2840...  
3  [101, 477, 494, 28399, 20442, 10286, 28418, 17...  
4  [101, 491, 10286, 28403, 28409, 19692, 28404, ...  


In [7]:
import ast

# Step 1: Map 'manipulative' column to binary labels (True -> 1, False -> 0)
train_df['manipulative_label'] = train_df['manipulative'].map({True: 1, False: 0})

# Save the dataframe with 'manipulative_label' as a CSV file
train_df[['id', 'content', 'manipulative', 'manipulative_label']].to_csv('/kaggle/working/train_with_labels.csv', index=False)

# Step 2: Convert 'trigger_words' from string to actual list of tuples (start, end)
# Apply the conversion only when the value is not None or 'no_trigger'
train_df['trigger_words'] = train_df['tokenized_trigger_words'].apply(
    lambda x: ast.literal_eval(x) if x not in [None, 'no_trigger'] else []
)

# Save the dataframe with formatted 'trigger_words' as a CSV file
train_df[['id', 'content', 'tokenized_trigger_words', 'trigger_words']].to_csv('/kaggle/working/train_with_trigger_words.csv', index=False)

# Output the head of the dataframe to confirm the changes
print(train_df[['id', 'content', 'manipulative', 'manipulative_label', 'trigger_words']].head())


                                     id  \
0  0bb0c7fa-101b-4583-a5f9-9d503339141c   
1  7159f802-6f99-4e9d-97bd-6f565a4a0fae   
2  e6a427f1-211f-405f-bd8b-70798458d656   
3  1647a352-4cd3-40f6-bfa1-d87d42e34eea   
4  9c01de00-841f-4b50-9407-104e9ffb03bf   

                                             content  manipulative  \
0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...             1   
1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...             1   
2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...             1   
3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...             0   
4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...             1   

   manipulative_label trigger_words  
0                 NaN            []  
1                 NaN            []  
2                 NaN       

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [8]:
import pandas as pd

# Load the datasets
train_with_labels_df = pd.read_csv('/kaggle/working/train_with_labels.csv')
train_with_trigger_words_df = pd.read_csv('/kaggle/working/train_with_trigger_words.csv')

# Display the first few rows of the datasets
train_with_labels_df.head(), train_with_trigger_words_df.head()


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


(                                     id  \
 0  0bb0c7fa-101b-4583-a5f9-9d503339141c   
 1  7159f802-6f99-4e9d-97bd-6f565a4a0fae   
 2  e6a427f1-211f-405f-bd8b-70798458d656   
 3  1647a352-4cd3-40f6-bfa1-d87d42e34eea   
 4  9c01de00-841f-4b50-9407-104e9ffb03bf   
 
                                              content  manipulative  \
 0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...             1   
 1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...             1   
 2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...             1   
 3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...             0   
 4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...             1   
 
    manipulative_label  
 0                 NaN  
 1                 NaN  
 2                 NaN  
 3                 NaN  
 4 

In [9]:
import ast
from transformers import AutoTokenizer, AutoModelForTokenClassification



# Step 1: Populate manipulative_label
train_with_labels_df['manipulative_label'] = train_with_labels_df['manipulative']

# Step 2: Handle missing trigger words
train_with_trigger_words_df['trigger_words'] = train_with_trigger_words_df['trigger_words'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x != 'no_trigger' else [])

# Tokenize the content
# Replace '<your_token>' with the retrieved token from Kaggle Secrets
tokenizer = AutoTokenizer.from_pretrained('SpanBERT/spanbert-large-cased', use_auth_token=secret_value_0)


# Function to tokenize and pad sequences
def tokenize_and_pad(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Apply tokenization to the content column
train_with_trigger_words_df['tokenized_content'] = train_with_trigger_words_df['content'].apply(lambda x: tokenize_and_pad(x))

# Save the updated DataFrame with tokenized content
train_with_trigger_words_df.to_csv('/kaggle/working/train_with_tokenized_content.csv', index=False)

# Check the updated DataFrame
train_with_trigger_words_df[['id', 'content', 'tokenized_content']].head()




Unnamed: 0,id,content,tokenized_content
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,–ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...,"[input_ids, token_type_ids, attention_mask]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,–ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...,"[input_ids, token_type_ids, attention_mask]"
2,e6a427f1-211f-405f-bd8b-70798458d656,ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...,"[input_ids, token_type_ids, attention_mask]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,–í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...,"[input_ids, token_type_ids, attention_mask]"
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"–†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 ""–ì–≤–æ–∑–¥–∏–∫–∞"" 132-–π –±—Ä–∏–≥–∞–¥...","[input_ids, token_type_ids, attention_mask]"


In [10]:
import pandas as pd
import ast
from transformers import AutoTokenizer, AutoModelForTokenClassification



# Assuming you have already loaded the dataset in train_with_trigger_words_df and train_with_labels_df
train_with_trigger_words_df = pd.read_csv('/kaggle/working/train_with_trigger_words.csv')
train_with_labels_df = pd.read_csv('/kaggle/working/train_with_labels.csv')

# Step 1: Check if the 'manipulative' column exists
if 'manipulative' not in train_with_trigger_words_df.columns:
    print("Error: 'manipulative' column is missing in the dataset.")
    # If missing, manually add a dummy or placeholder 'manipulative' column for further processing:
    # You may need to replace this with your actual logic if you have another source for this column.
    # For now, assuming the 'manipulative' column should be derived from some other logic.
    
    # Example of creating a dummy column based on index
    train_with_trigger_words_df['manipulative'] = [True if i % 2 == 0 else False for i in range(len(train_with_trigger_words_df))]
    print("Added dummy 'manipulative' column for testing.")

# Step 2: Map 'manipulative' column to binary labels (True -> 1, False -> 0)
train_with_trigger_words_df['manipulative_label'] = train_with_trigger_words_df['manipulative'].map({True: 1, False: 0})

# Step 3: Handle missing 'trigger_words'
train_with_trigger_words_df['trigger_words'] = train_with_trigger_words_df['trigger_words'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x != 'no_trigger' else [])

# Step 4: Tokenization using HuggingFace's BERT tokenizer
# Replace '<your_token>' with the retrieved token from Kaggle Secrets
tokenizer = AutoTokenizer.from_pretrained('SpanBERT/spanbert-large-cased', use_auth_token=secret_value_0)


def tokenize_and_pad(text):
    encoding = tokenizer.encode_plus(
        text, 
        padding='max_length', 
        truncation=True, 
        max_length=512, 
        return_tensors='pt'
    )
    # Extract token_ids from the encoding
    return encoding['input_ids'].squeeze().tolist()  # Convert tensor to list

# Apply tokenization
train_with_trigger_words_df['tokenized_content'] = train_with_trigger_words_df['content'].apply(lambda x: tokenize_and_pad(x))

# Save the updated DataFrame with tokenized content
train_with_trigger_words_df.to_csv('/kaggle/working/train_with_tokenized_content.csv', index=False)

# Check the updated DataFrame
print(train_with_trigger_words_df[['id', 'content', 'tokenized_content']].head())


Error: 'manipulative' column is missing in the dataset.
Added dummy 'manipulative' column for testing.




                                     id  \
0  0bb0c7fa-101b-4583-a5f9-9d503339141c   
1  7159f802-6f99-4e9d-97bd-6f565a4a0fae   
2  e6a427f1-211f-405f-bd8b-70798458d656   
3  1647a352-4cd3-40f6-bfa1-d87d42e34eea   
4  9c01de00-841f-4b50-9407-104e9ffb03bf   

                                             content  \
0  –ù–æ–≤–∏–π –æ–≥–ª—è–¥ –º–∞–ø–∏ DeepState –≤—ñ–¥ —Ä–æ—Å—ñ–π—Å—å–∫–æ–≥–æ –≤—ñ–π...   
1  –ù–µ–¥–∞–≤–Ω–æ 95 –∫–≤–∞—Ä—Ç–∞–ª –∂—ë—Å—Ç–∫–æ –ø–æ–≥–ª—É–º–∏–ª—Å—è –Ω–∞–¥ —Ä—É—Å—Å–∫...   
2  ü§©\n–¢–∏–º —á–∞—Å–æ–º –π–¥–µ –µ–≤–∞–∫—É–∞—Ü—ñ—è –ë—î–ª–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ –∞–≤—Ç–æ...   
3  –í –£–∫—Ä–∞—ó–Ω—ñ –Ω–∞–π–±–ª–∏–∂—á–∏–º —á–∞—Å–æ–º –º–∞—é—Ç—å –Ω–∞–º—ñ—Ä –ø–æ—Å–∏–ª–∏—Ç...   
4  –†–∞—Å—á—ë—Ç—ã 122-–º–º –°–ê–£ 2–°1 "–ì–≤–æ–∑–¥–∏–∫–∞" 132-–π –±—Ä–∏–≥–∞–¥...   

                                   tokenized_content  
0  [101, 488, 24625, 17424, 17424, 489, 28395, 28...  
1  [101, 488, 19692, 28396, 10286, 28394, 17127, ...  
2  [101, 100, 493, 17424, 28401, 498, 10286, 2840...  
3  [10

In [24]:
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# import torch

# # Define the device (GPU if available, otherwise CPU)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# hf_token = "hfsb"  # Replace with your actual token

# # Load the tokenizer and model with token authentication
# tokenizer = AutoTokenizer.from_pretrained('microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank')

# # Use ignore_mismatched_sizes=True to avoid errors for mismatched layer sizes
# model = AutoModelForTokenClassification.from_pretrained(
#     'microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank',
#     num_labels=2,  # Set the number of labels according to your task
#     use_auth_token=hf_token,
#     ignore_mismatched_sizes=True  # This allows for size mismatch in the classifier layer
# )


In [27]:
import torch
import time
from transformers import AutoTokenizer, AutoModelForTokenClassification
from huggingface_hub import login
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score
from torch.optim import Adam
import pandas as pd

# Step 1: Authenticate with Hugging Face
# Replace 'your_huggingface_token' with your actual Hugging Face API token
login(token="roberta")

# Step 2: Prepare the model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "Nic1/roberta-finetuned-propaganda-span-identification"

# Load the tokenizer and model from Hugging Face Model Hub using authentication
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForTokenClassification.from_pretrained(model_name, use_auth_token=True)
model.to(device)

# Step 3: Prepare the Dataset
class TokenClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.tokenizer = tokenizer
        self.data = df
        self.max_length = 512  # Max length for XLM-RoBERTa

    def __getitem__(self, idx):
        content = self.data.iloc[idx]['content']
        label = self.data.iloc[idx]['manipulative_label']
        
        # Tokenize the content with padding and truncation
        encoding = self.tokenizer(content, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        # Get input_ids and attention_mask from encoding
        input_ids = encoding['input_ids'].squeeze(0).to(device)
        attention_mask = encoding['attention_mask'].squeeze(0).to(device)

        # Create labels tensor with padding (should be the same length as input_ids)
        labels_tensor = torch.full_like(input_ids, -100).to(device)  # Use -100 to ignore padding tokens during loss computation
        
        # Set the label for the entire sequence (all tokens)
        labels_tensor[:] = label  # Assign the same label to all tokens in the sequence

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels_tensor,
        }

    def __len__(self):
        return len(self.data)

# Load your dataset (replace this with your actual dataframe loading method)
train_with_tokenized_content_df = pd.read_csv("/kaggle/working/train_with_tokenized_content.csv")

# Create dataset for training
train_dataset = TokenClassificationDataset(train_with_tokenized_content_df)

# Step 4: Training Arguments
training_args = {
    'output_dir': '/kaggle/working',          # Directory to save model
    'num_train_epochs': 5,                   # Number of epochs
    'per_device_train_batch_size': 12,        # Batch size per device
    'per_device_eval_batch_size': 12,         # Batch size for evaluation
    'warmup_steps': 500,                     # Warm-up steps
    'weight_decay': 0.01,                    # Strength of weight decay
    'logging_dir': '/kaggle/working/logs',    # Directory for logs
    'logging_steps': 10,
    'evaluation_strategy': "epoch",          # Evaluate after each epoch
    'save_strategy': "epoch",                # Save the model after each epoch
    'load_best_model_at_end': True,          # Load the best model at the end
    'report_to': "none",                     # Disable reporting to Huggingface Hub
    'disable_tqdm': False,                   # Enable progress bars
    'dataloader_num_workers': 2,             # Number of workers for the data loader
}

# Step 5: Training loop and optimizer setup
optimizer = AdamW(model.parameters(), lr=5e-5)

def train_model(model, train_dataset, epochs=5):
    for epoch in range(epochs):
        start_time = time.time()
        model.train()
        
        # Create data loader for batching
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

        total_loss = 0
        correct_predictions = 0
        total_predictions = 0
        
        # Loop through the batches and perform training
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch'):
            optimizer.zero_grad()  # Reset gradients
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.numel()

        # Calculate average loss and accuracy for the epoch
        avg_loss = total_loss / len(train_loader)
        accuracy = correct_predictions / total_predictions

        # Compute F1 score (macro)
        f1 = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')

        # Log the metrics
        epoch_duration = time.time() - start_time
        print(f'Epoch {epoch + 1}/{epochs} | Loss: {avg_loss:.4f} | Accuracy: {accuracy:.4f} | F1 Score: {f1:.4f} | Time: {epoch_duration:.2f}s')

# Step 6: Train the model
train_model(model, train_dataset, epochs=5)

# Save the model's state_dict
output_dir = "/kaggle/working/xlm-roberta-model"
os.makedirs(output_dir, exist_ok=True)
torch.save(model.state_dict(), f"{output_dir}/model_state_dict.pth")

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

print("Training completed and model saved!") 

Epoch 1/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 478/478 [07:20<00:00,  1.08batch/s]


Epoch 1/5 | Loss: 0.7125 | Accuracy: 0.5003 | F1 Score: 0.0000 | Time: 440.97s


Epoch 2/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 478/478 [07:19<00:00,  1.09batch/s]


Epoch 2/5 | Loss: 0.6949 | Accuracy: 0.5095 | F1 Score: 0.4855 | Time: 439.73s


Epoch 3/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 478/478 [07:19<00:00,  1.09batch/s]


Epoch 3/5 | Loss: 0.6954 | Accuracy: 0.4947 | F1 Score: 0.8177 | Time: 439.86s


Epoch 4/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 478/478 [07:19<00:00,  1.09batch/s]


Epoch 4/5 | Loss: 0.6948 | Accuracy: 0.5057 | F1 Score: 0.7990 | Time: 439.64s


Epoch 5/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 478/478 [07:19<00:00,  1.09batch/s]


Epoch 5/5 | Loss: 0.6948 | Accuracy: 0.4951 | F1 Score: 0.5451 | Time: 439.57s
Training completed and model saved!


In [28]:
import torch   
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd

# Step 6: Model Evaluation on Test Set

# Load test data
test_df = pd.read_csv('/kaggle/input/unlp-2025-shared-task-span-identification/test.csv')

# Tokenize the test data
test_encodings = tokenizer(list(test_df['content']), truncation=True, padding=True, max_length=512, return_tensors='pt')

# Prepare the labels (if ground truth is available)
# For inference purposes, we'll predict without labels
# Placeholder labels (for token classification, each token has a label, not just a sequence)
labels = [[0] * len(test_encodings['input_ids'][i]) for i in range(len(test_df))]  # Placeholder, no ground truth

# Create a custom dataset for the test set (same structure as the training dataset)
class ManipulationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(device)  # Label per token
        return item

    def __len__(self):
        return len(self.labels)

# Create the test dataset
test_dataset = ManipulationDataset(test_encodings, labels)

# Create DataLoader for batching during evaluation
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Put model in evaluation mode
model.eval()

all_preds = []

# Disable gradient calculation for inference
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating on Test Set"):
        # Move batch to GPU
        batch = {key: val.to(device) for key, val in batch.items()}

        # Forward pass
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])  # Correctly handle labels for token classification

        logits = outputs.logits
        
        # Get predictions
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        all_preds.extend(preds)

# Prepare the final submission format with 'trigger_words' column
def get_trigger_words(predictions):
    trigger_words = []
    for pred in predictions:
        spans = []
        for idx, token in enumerate(pred):
            if token == 1:  # If the token is predicted as a trigger (1)
                start = idx
                end = idx + 1
                spans.append((start, end))
        trigger_words.append(spans if spans else "[]")
    return trigger_words

# Generate 'trigger_words' for each test sample
trigger_words = get_trigger_words(all_preds)

# Add the 'trigger_words' to the test dataframe
test_df['trigger_words'] = trigger_words

# Save the result in the required format
submission_df = test_df[['id', 'trigger_words']]

# Save the predictions to a CSV file in the submission format
submission_df.to_csv('/kaggle/working/final_submission.csv', index=False)

print("Predictions saved to /kaggle/working/final_submission.csv")


Evaluating on Test Set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 717/717 [02:57<00:00,  4.04it/s]


Predictions saved to /kaggle/working/final_submission.csv
