0. Data and Libraries

0.1 Preprocessed Texts

0.2 Libraries (Especially Unsloth Download)

In [None]:
#capture
# NOTE(review): the line above is an ordinary comment, so pip output is NOT
# suppressed. It looks like a mangled `%%capture` cell magic - confirm.
#
# Install Unsloth and its dependencies. The branch is chosen by checking
# whether any environment variable NAME contains the substring "COLAB_".
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # No "COLAB_" variable found: plain install, pip resolves dependencies.
    !pip install unsloth
else:
    # Do this only in Colab and Kaggle notebooks! Otherwise use pip install unsloth
    # NOTE(review): the check above only looks for "COLAB_"; whether it also
    # matches Kaggle depends on Kaggle's environment variables - verify.
    # `--no-deps` installs the named packages without their dependencies;
    # presumably to keep the preinstalled runtime packages untouched.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# NOTE(review): this install is unpinned and runs WITH dependency resolution.
# In the Colab branch of the previous cell, xformers was pinned to 0.0.29 and
# installed with --no-deps; this line may replace that version and pull in
# additional packages. Confirm it is still needed, or pin the version.
!pip install xformers

1. Initialize Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# --- Loading configuration -------------------------------------------------
# Maximum sequence length; any value works (RoPE scaling is handled internally).
max_seq_length = 2000
# Use None for auto detection. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = torch.float16
# Set to True to load with 4-bit quantization and reduce memory usage.
load_in_4bit = False

# --- Load the model and tokenizer -------------------------------------------
# "lora_model_Llama3.2_3B" is resolved by `from_pretrained`
# (presumably a locally saved LoRA checkpoint directory - confirm).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model_Llama3.2_3B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Enable native 2x faster inference.
FastLanguageModel.for_inference(model)

2. Test Whether it Works

In [None]:
from transformers import TextStreamer

# Sentence used for the smoke test; defined once so that the system prompt and
# the user message cannot drift apart.
test_sentence = "We as a Community of European States must hold together in times of crisis"

# Adjacent string literals are concatenated verbatim, so every fragment must
# end with an explicit separator. Previously the fragments ran together
# ("...crisisPOSSIBLE LABELS:1. National Solidarity2. European-Level ...").
# NOTE(review): if the adapter was fine-tuned on the old run-together prompt,
# confirm that this layout matches the training format.
instruction = (
    "You are an expert researcher in political science. "
    "Analyze the following text and determine which of the following solidarity categories best describe it:\n"
    f"Sentence to be analyzed: {test_sentence}\n"
    "POSSIBLE LABELS:\n"
    "1. National Solidarity\n"
    "2. European-Level Solidarity\n"
    "3. Outside-Europe Solidarity\n"
    "Decide which label is most suitable and provide the output in a JSON format like this:\n"
    "{\"label\": \"<Insert one label: National Solidarity, European-Level Solidarity, or Outside-Europe Solidarity>\"}\n"
    "ONLY WRITE IN THE JSON FORMAT. GENERATE NOTHING ELSE."
)
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": test_sentence},
]

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Render the chat with the tokenizer's template and move the token ids to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Stream the generated tokens to the cell output as they are produced;
# skip_prompt=True hides the echoed prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 300,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

3. Prompts

3.1 Extract Solidarity Sentences

In [None]:
def code_solidarity_label(input_text, model, tokenizer, max_new_tokens=400):
    """
    Ask the model whether a political statement expresses solidarity towards another actor.

    The prompt casts the model as an expert political scientist, shows two
    worked examples, and requests a JSON array with a single object containing:
      - "Solidarity": "Solidarity" if the statement shows support, "No Solidarity" otherwise.
      - "Justification": a brief explanation (one or two sentences) citing specific phrases.

    Parameters:
        input_text (str): The text to analyze. It is appended verbatim to the prompt.
        model: The language model handed to the text-generation pipeline.
        tokenizer: The tokenizer for the model.
        max_new_tokens (int): Upper bound on the number of newly generated tokens
            (the prompt is not counted). Defaults to 400.

    Returns:
        str: The model's generated continuation only (the prompt is not echoed),
        stripped of surrounding whitespace. The text is not parsed or validated
        as JSON here.
    """
    # Imported locally so the function also works on a fresh kernel; no other
    # visible cell imports `pipeline`.
    from transformers import pipeline

    instruction = (
        "You are an expert political scientist specializing in discourse analysis. "
        "Your task is to analyze the following statement and determine whether it expresses any indication of solidarity towards another actor. "
        "For your response, return a JSON array with a single object containing two keys: 'Solidarity' and 'Justification'. "
        "The 'Solidarity' key should have the value 'Solidarity' if the statement shows support, or 'No Solidarity' if it does not. "
        "The 'Justification' key should provide a brief explanation (one or two sentences) citing specific phrases or elements that justify your label. \n\n"
        "For example, if the input is:\n"
        "\"I stand with you in these trying times, offering all my support,\" \n"
        "then the output should be:\n"
        "[\n"
        "  {\n"
        "    \"Solidarity\": \"Solidarity\",\n"
        "    \"Justification\": \"The statement explicitly offers support and solidarity through phrases like 'I stand with you' and 'offering all my support'.\"\n"
        "  }\n"
        "]\n\n"
        "If the input is:\n"
        "\"I am indifferent to your situation,\" \n"
        "then the output should be:\n"
        "[\n"
        "  {\n"
        "    \"Solidarity\": \"No Solidarity\",\n"
        "    \"Justification\": \"The statement does not express any form of support or solidarity, as indicated by the word 'indifferent'.\"\n"
        "  }\n"
        "]\n\n"
        "Now, analyze the following text:\n"
        f"{input_text}"
    )

    # Create the text-generation pipeline with a controlled temperature setting.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, temperature=0.9)

    # `max_new_tokens` budgets only the answer. The previous `max_length=400`
    # also counted the long few-shot prompt, leaving little or no room for
    # generation. `return_full_text=False` drops the echoed prompt so that only
    # the model's answer is returned, as documented.
    output = pipe(instruction, max_new_tokens=max_new_tokens, return_full_text=False)
    return output[0]['generated_text'].strip()

3.2 Extract Solidarity Speech Acts 

In [None]:
def code_solidarity_speech_acts(input_text, model, tokenizer, max_new_tokens=512):
    """
    Ask the model to extract solidarity-related speech acts from a political text.

    The prompt defines a valid solidarity speech act (transitive structure,
    positive orientation, and one of the illocutionary types Expressive /
    Directive / Commissive) and requests a JSON array whose elements carry the
    keys 'SpeechActType', 'Provider', 'Recipient' and 'PropositionalContent'.

    Parameters:
        input_text (str): The political text to be analyzed. It is appended
            verbatim to the prompt.
        model: The language model handed to the text-generation pipeline.
        tokenizer: The tokenizer for the model.
        max_new_tokens (int): Upper bound on the number of newly generated
            tokens. Defaults to 512.

    Returns:
        str: The assistant's reply, stripped of surrounding whitespace. The
        text is not parsed or validated as JSON here.
    """
    # Imported locally so the function also works on a fresh kernel; no other
    # visible cell imports `pipeline`.
    from transformers import pipeline

    # Optimized instruction for extracting solidarity-related speech acts with structured output
    instruction = (
        "You are an expert in political science and computational discourse analysis. "
        "Your task is to analyze the following political text to identify and code all instances of solidarity-related speech acts. "
        "A valid solidarity-related speech act must meet these criteria: "
        "1. Transitive Structure: The utterance must contain a transitive verb structure linking a subject (the provider, typically the speaker) to a direct object (the recipient). "
        "2. Positive Orientation: The subject expresses a positive orientation (they offer some form of assistance or help, even if verbal) toward the recipient, who is understood to be in a comparatively difficult situation. "
        "3. Illocutionary Type: The speech act must clearly fall into one of the following categories: "
        "   - Solidarity Expressives: Utterances that communicate emotional support or compassion without calling for specific action. "
        "   - Solidarity Directives: Utterances that demand, request, or call on others to act in support of the recipient. "
        "   - Solidarity Commissives: Utterances in which the speaker commits themselves to take future actions to support the recipient. "
        "For each solidarity-related speech act identified in the text, extract and return the following information: "
        "   - SpeechActType: Label the act as 'Expressive,' 'Directive,' or 'Commissive.' "
        "   - Provider: Identify the actor (usually the speaker) performing the solidarity action. "
        "   - Recipient: Identify the actor (direct object) who is in need and toward whom the positive orientation is directed. "
        "   - PropositionalContent: Summarize the core message or demand embedded in the speech act. "
        "If a sentence does not contain a solidarity-related speech act or lacks the required transitive structure, do not return anything. "
        "Return your analysis as a structured JSON array where each element represents an identified speech act with the keys "
        "'SpeechActType,' 'Provider,' 'Recipient,' and 'PropositionalContent.' "
        "Do not include any additional commentary or text beyond this structured output. "
        "For example, your output should look like this:\n"
        "[\n"
        "  {\n"
        "    \"SpeechActType\": \"\",\n"
        "    \"Provider\": \"\",\n"
        "    \"Recipient\": \"\",\n"
        "    \"PropositionalContent\": \"\"\n"
        "  }\n"
        "]\n"
        "Text to analyze:\n"
        f"{input_text}"
    )

    # The whole instruction (including the text) is sent as a single user turn.
    messages = [
        {"role": "user", "content": instruction}
    ]

    # Text-generation pipeline; temperature=0.9 is passed through unchanged.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, temperature=0.9)
    # Set the generation budget explicitly instead of relying on library defaults.
    output = pipe(messages, max_new_tokens=max_new_tokens)

    generated = output[0]['generated_text']
    if isinstance(generated, str):
        # Plain-text result: already the reply itself.
        return generated.strip()
    # Chat-style input yields the conversation as a list of message dicts with
    # the assistant reply appended last; return only that reply's content so
    # the function returns a str as documented.
    return generated[-1]['content'].strip()
