# 1. Importar Librerias

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import json


import matplotlib.pyplot as plt
import seaborn as sns


import spacy
# from spacy import displacy

import re
from multiprocessing import process
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


# 2. Carga del Dataset

In [2]:
# Train
url = "data/train.csv" 
df = pd.read_csv(url)
df_original = df.copy()

url2 = "formated/train._exportado.csv" 
df_formated = pd.read_csv(url)


# Test

url3 = "data/test_nolabel.csv"
df_test_original = pd.read_csv(url3)

url4 = "formated/test_exportado.csv"
df_test_formated = pd.read_csv(url4)

# 3. LLM Features 

## 3.1 speaker_job

In [3]:
df_original['speaker_job'] = (
    df_original['speaker_job']
    .str.lower()
    .str.strip()
    .str.replace(r'\s+', '_', regex=True)
)


speaker_job_counts = df_original['speaker_job'].value_counts()
top_n = 13
frequent_speaker_jobs = set(speaker_job_counts.head(top_n).index)
print(f"Frequent speaker jobs: {frequent_speaker_jobs}")

Frequent speaker jobs: {'u.s._senator', 'state_senator', 'u.s._house_of_representatives', 'governor', 'state_representative', 'senator', 'president', 'attorney', 'president-elect', 'former_governor', 'presidential_candidate', 'u.s._representative', 'milwaukee_county_executive'}


In [4]:
mapping_speaker_job_en = {
    "us_senator": "federal_legislator_senate",
    "president": "president",
    "governor": "governor",
    "us_representative": "federal_legislator_house",
    "president-elect": "president",
    "presidential_candidate": "presidential_candidate",
    "state_senator": "state_legislator_senate",
    "state_representative": "state_legislator_house",
    "former_governor": "governor",
    "senator": "federal_legislator_senate",  # Assuming federal context in absence of "state"
    "milwaukee_county_executive": "local_executive",
    "attorney": "legal_professional",
    "us_house_of_representatives": "federal_legislator_house",
    "social_media_posting": "public_communicator",  # Could be refined to "non_traditional_communicator" if relevant
    "governor_of_new_jersey": "governor",
    "congressman": "federal_legislator_house",
    "co-host_on_cnn's_\"crossfire\"": "public_communicator",
    "us_congressman": "federal_legislator_house",
    "congresswoman": "federal_legislator_house",
    "speaker_of_the_house_of_representatives": "federal_legislative_leader",
    "businessman": "business_professional",
    "governor_of_ohio_as_of_jan_10,_2011": "governor",
    "author": "public_communicator",
    "lawyer": "legal_professional",
    "candidate_for_us_senate_and_physician": "federal_legislative_candidate",
    "us_senator_from_ohio": "federal_legislator_senate",
    "lieutenant_governor": "state_executive",
    "house_majority_leader": "federal_legislative_leader",
    "us_representative,_florida_district_23": "federal_legislator_house",
    "consultant": "political_advisor",
    "mayor_of_providence": "local_executive",
    "former_president": "president",
    "columnist": "public_communicator",
    "political_action_committee": "political_organization",
    "radio_host": "public_communicator",
    "madison_school_board_member": "local_education_official",
    "secretary_of_state": "state_executive",
    "attorney_general": "state_executive",
    "us_house_member": "federal_legislator_house",
    "msnbc_host": "public_communicator",
    "senate_democratic_leader": "federal_legislative_leader",
    "senate_minority_leader": "federal_legislative_leader",
    "ohio_treasurer": "state_executive",
    "state_assemblyman": "state_legislator_house",
    "chairman,_republican_national_committee": "party_leader",
    "us_representative,_florida_district_22": "federal_legislator_house",
    "house_minority_leader": "federal_legislative_leader",
    "mayor_of_milwaukee": "local_executive",
    "lieutenant_governor-elect": "state_executive",
    "philanthropist": "public_communicator" # Could be considered "influencer" or "social_actor"
}

In [5]:
# Extract all unique mappings from the dictionary
unique_mappings_speaker_job = list(set(mapping_speaker_job_en.values()))
print(unique_mappings_speaker_job)

['federal_legislative_candidate', 'party_leader', 'state_executive', 'local_education_official', 'legal_professional', 'governor', 'political_organization', 'political_advisor', 'federal_legislative_leader', 'federal_legislator_house', 'public_communicator', 'federal_legislator_senate', 'president', 'business_professional', 'state_legislator_senate', 'local_executive', 'presidential_candidate', 'state_legislator_house']


In [6]:
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')


def crear_frase_detallada_speaker_job(row):
    partes = [f"{row['speaker']}"]
    # if pd.notna(row['speaker_job']):
    #     partes.append(f"({row['speaker_job']})")
    if pd.notna(row['state_info']):
        partes.append(f"from {row['state_info']}")
    if pd.notna(row['party_affiliation']):
        partes.append(f"(affiliated with the {row['party_affiliation']} party)")
    partes.append(f"said: \"{row['statement']}\"")
    if pd.notna(row['subject']):
        partes.append(f"The statement concerns the topics of: {row['subject']}.")
    return " ".join(partes)

def cliassify_speaker_job(row):
    
    text = crear_frase_detallada_speaker_job(row)

    candidate_labels = unique_mappings_speaker_job

    result = classifier(text, candidate_labels)

    # Get the label with the highest score
    best_label = result['labels'][0]
    best_score = result['scores'][0]

    if best_score < 0.2:
        best_label = "other"
        best_score = 0.0

    return best_label, best_score

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [7]:
df_original.head(3)

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation
0,81f884c64a7,1,China is in the South China Sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,New York,republican
1,30c2723a188,0,With the resources it takes to execute just ov...,health-care,chris-dodd,u.s._senator,Connecticut,democrat
2,6936b216e5d,0,The (Wisconsin) governor has proposed tax give...,"corporations,pundits,taxes,abc-news-week",donna-brazile,political_commentator,"Washington, D.C.",democrat


In [8]:
n = len(df_original)
processes_rows = [] 

for i in range(n):
    if pd.isna(df_original.loc[i, 'speaker_job']) or df_original.loc[i, 'speaker_job'].strip() == "":
  
        classification = cliassify_speaker_job(
            df_original.loc[i]
        )[0]

        df_original.loc[i, 'speaker_job'] = classification

        processes_rows.append(i)

print(f"Rows processed: {processes_rows}")

Rows processed: [3, 4, 5, 6, 8, 15, 26, 27, 28, 31, 33, 34, 37, 44, 47, 50, 55, 57, 58, 59, 69, 74, 79, 80, 83, 85, 86, 89, 90, 95, 98, 100, 101, 106, 107, 109, 114, 116, 121, 122, 125, 129, 133, 139, 143, 145, 146, 147, 154, 155, 165, 167, 170, 174, 179, 183, 186, 191, 196, 207, 208, 214, 215, 216, 217, 224, 225, 230, 239, 240, 243, 245, 246, 249, 250, 251, 252, 253, 254, 257, 262, 265, 271, 273, 276, 277, 281, 282, 291, 294, 305, 310, 311, 313, 326, 330, 334, 341, 348, 351, 352, 353, 354, 364, 365, 367, 371, 375, 378, 380, 392, 395, 396, 398, 402, 403, 407, 415, 428, 432, 435, 436, 439, 442, 446, 456, 458, 468, 469, 478, 481, 487, 492, 493, 494, 496, 497, 499, 503, 507, 514, 520, 523, 530, 537, 539, 541, 543, 545, 554, 562, 564, 569, 577, 581, 587, 591, 593, 598, 605, 610, 620, 626, 631, 632, 638, 639, 642, 643, 654, 655, 662, 665, 669, 673, 682, 685, 688, 708, 716, 718, 724, 727, 730, 731, 733, 734, 738, 742, 747, 749, 750, 756, 761, 765, 767, 770, 773, 782, 787, 793, 794, 799, 800,

## 3.2 state_info

In [9]:
def clean_state_info(text):
    if isinstance(text, str):  
        text = re.sub(r'[^\w\s]', '', text)  
        return text.strip()  
    return text  


In [10]:
df_original['state_info'] = df_original['state_info'].str.lower()
df_original['state_info'] = df_original['state_info'].apply(clean_state_info)
state_info_counts = df_original['state_info'].value_counts()

top_n = 15
frequent_state_info = set(state_info_counts.head(top_n).index)
print(f"Frequent state info: {frequent_state_info}")

Frequent state info: {'new york', 'illinois', 'massachusetts', 'florida', 'virginia', 'oregon', 'arizona', 'california', 'washington dc', 'ohio', 'rhode island', 'texas', 'new jersey', 'wisconsin', 'georgia'}


In [11]:
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')

def crear_frase_detallada_state_info(row):
    partes = [f"{row['speaker']}"]
    if pd.notna(row['speaker_job']):
        partes.append(f"({row['speaker_job']})")
    # if pd.notna(row['state_info']):
    #     partes.append(f"from {row['state_info']}")
    if pd.notna(row['party_affiliation']):
        partes.append(f"(affiliated with the {row['party_affiliation']} party)")
    partes.append(f"said: \"{row['statement']}\"")
    if pd.notna(row['subject']):
        partes.append(f"The statement concerns the topics of: {row['subject']}.")
    return " ".join(partes)



def cliassify_state_info(row):
    
    text = crear_frase_detallada_state_info(row)
    print(text)
    candidate_labels = list(frequent_state_info)


    result = classifier(text, candidate_labels)

    best_label = result['labels'][0]
    best_score = result['scores'][0]

    if best_score < 0.2:
        best_label = "other"
        best_score = 0.0

    return best_label, best_score

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [12]:
n = len(df_original)

processed_state_info = df_original['state_info'].copy()

processes_rows = []

for i in range(n):
    if pd.isna(df_original.loc[i, 'state_info']) or df_original.loc[i, 'state_info'].strip() == "":
        print(f"Processing row {i} of {n}")
        
        # Clasificar el estado
        classification = cliassify_state_info(df_original.loc[i])[0]
        
        # Almacenar el resultado en la lista
        processed_state_info[i] = classification
        processes_rows.append(i)

df_original['state_info'] = processed_state_info

df_original['state_info'] = df_original['state_info'].apply(
    lambda x: x if x in frequent_state_info else 'other'
)

print(f"Rows processed: {processes_rows}")

Processing row 3 of 8950
rebecca-bradley (legal_professional) (affiliated with the none party) said: "Says her representation of an ex-boyfriend who was then my friend in a family law case is something that lawyers do on a regular basis." The statement concerns the topics of: candidates-biography,children,ethics,families,legal-issues.
Processing row 5 of 8950
we-love-usa-pac (political_organization) (affiliated with the none party) said: "Ron Klein sponsored an amendment that specifically allows price gouging." The statement concerns the topics of: candidates-biography.
Processing row 8 of 8950
aclu-georgia-foundation (political_organization) (affiliated with the none party) said: "In the last 15 years, weve witnessed a dramatic expansion in the jailing of immigrants, from about 70,000 people detained annually to about 400,000." The statement concerns the topics of: immigration.
Processing row 12 of 8950
sean-hannity (radio/tv_host) (affiliated with the none party) said: "Barack Obama 

## 3.3 party_affiliation

In [13]:
mapping = {
    # Major parties
    'republican': 'republican',
    'democrat': 'democrat',
    'democratic-farmer-labor': 'democrat',

    # Independents / None
    # 'none': 'independent_None',
    'independent': 'independent_None',

    # Media
    'newsmaker': 'media',
    'journalist': 'media',
    'columnist': 'media',
    'activist': 'media',
    'talk-show-host': 'media',
    
    # Third parties
    'libertarian': 'third_Party',
    'green': 'third_Party',
    'constitution-party': 'third_Party',
    'liberal-party-canada': 'third_Party',

    # Organizations
    'organization': 'organization',
    
    # Officials  
    'state-official': 'official',
    'business-leader': 'official',
    'labor-leader': 'official',
    'education-official': 'official',
    'government-body': 'official',

    # Tea Party
    'tea-party-member': 'republican',
    'ocean-state-tea-party-action': 'republican',
}

def group_party(x):
    return mapping.get(x, 'other')  # anything else → 'Other'



In [14]:
# Extract all unique mappings from the dictionary
unique_mappings_party = list(set(mapping.values()))
print(unique_mappings_party)

['media', 'organization', 'republican', 'third_Party', 'independent_None', 'official', 'democrat']


In [15]:
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')

def crear_frase_detallada_party(row):
    partes = [f"{row['speaker']}"]
    if pd.notna(row['speaker_job']):
        partes.append(f"({row['speaker_job']})")
    if pd.notna(row['state_info']):
        partes.append(f"from {row['state_info']}")
    # if pd.notna(row['party_affiliation']) or row['party_affiliation'] != "none":
    #     partes.append(f"(affiliated with the {row['party_affiliation']} party)")
    partes.append(f"said: \"{row['statement']}\"")
    if pd.notna(row['subject']):
        partes.append(f"The statement concerns the topics of: {row['subject']}.")
    return " ".join(partes)

def cliassify_party_affiliation(row):
    
    text = crear_frase_detallada_party(row)
    candidate_labels = unique_mappings_party

    result = classifier(text, candidate_labels)

    best_label = result['labels'][0]
    best_score = result['scores'][0]

    if best_score < 0.2:
        best_label = "other"
        best_score = 0.0

    return best_label, best_score

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [16]:
n = len(df_original)
processes_rows = [] 

for i in range(n):
    if pd.isna(df_original.loc[i, 'party_affiliation']) or df_original.loc[i, 'party_affiliation'].strip() == "" or df_original.loc[i, 'party_affiliation'] == "none":
        classification = cliassify_party_affiliation(
            df_original.loc[i]
        )[0]

        df_original.loc[i, 'party_affiliation'] = classification

        processes_rows.append(i)
    else:
        df_original.loc[i, 'party_affiliation'] = group_party(df_original.loc[i, 'party_affiliation'])

print(f"Rows processed: {processes_rows}")

Rows processed: [3, 5, 8, 12, 18, 26, 27, 28, 54, 57, 59, 75, 79, 83, 85, 90, 94, 95, 105, 107, 109, 116, 122, 125, 129, 130, 143, 147, 148, 154, 155, 166, 170, 186, 188, 198, 201, 203, 215, 217, 220, 225, 230, 232, 240, 246, 251, 255, 256, 257, 259, 260, 263, 270, 273, 281, 296, 332, 341, 343, 348, 349, 355, 364, 367, 369, 383, 392, 396, 405, 407, 410, 412, 430, 432, 436, 442, 446, 455, 456, 465, 468, 470, 472, 479, 492, 501, 509, 530, 531, 537, 539, 559, 563, 564, 577, 581, 589, 593, 595, 598, 620, 632, 643, 659, 660, 662, 665, 680, 685, 688, 707, 713, 724, 738, 743, 749, 756, 757, 767, 768, 770, 773, 780, 782, 783, 789, 799, 806, 811, 812, 813, 820, 831, 835, 846, 849, 851, 859, 872, 873, 877, 879, 881, 885, 886, 887, 890, 895, 899, 903, 908, 910, 912, 917, 926, 927, 933, 935, 943, 948, 955, 959, 964, 968, 971, 980, 989, 994, 995, 999, 1001, 1009, 1014, 1015, 1016, 1018, 1024, 1026, 1034, 1040, 1045, 1047, 1048, 1053, 1057, 1062, 1064, 1076, 1078, 1084, 1087, 1093, 1095, 1111, 1125,

# 4. Test

## 4.1 speaker_job

In [17]:
df_test_original['speaker_job'] = (
    df_test_original['speaker_job']
    .str.lower()
    .str.strip()
    .str.replace(r'\s+', '_', regex=True)
)

In [18]:
n = len(df_test_original)
processes_rows = [] 

for i in range(n):
    if pd.isna(df_test_original.loc[i, 'speaker_job']) or df_test_original.loc[i, 'speaker_job'].strip() == "":
  
        classification = cliassify_speaker_job(
            df_test_original.loc[i]
        )[0]

        df_test_original.loc[i, 'speaker_job'] = classification

        processes_rows.append(i)

print(f"Rows processed: {processes_rows}")

Rows processed: [0, 1, 5, 6, 7, 14, 16, 18, 20, 21, 35, 39, 44, 51, 52, 55, 57, 64, 67, 71, 78, 81, 82, 85, 94, 96, 101, 103, 107, 109, 110, 111, 115, 117, 119, 126, 127, 134, 136, 139, 143, 145, 146, 148, 158, 165, 170, 171, 175, 177, 179, 184, 191, 194, 196, 207, 210, 217, 226, 230, 232, 242, 244, 245, 249, 253, 261, 262, 263, 264, 267, 268, 270, 271, 273, 275, 283, 287, 288, 289, 296, 304, 307, 308, 309, 312, 313, 314, 316, 324, 327, 328, 329, 330, 332, 340, 341, 344, 351, 353, 355, 358, 362, 369, 371, 375, 378, 382, 385, 392, 408, 409, 420, 422, 427, 428, 429, 430, 433, 435, 439, 441, 442, 451, 454, 465, 466, 467, 468, 471, 480, 481, 484, 488, 499, 502, 503, 507, 514, 519, 520, 521, 522, 528, 529, 531, 533, 534, 535, 541, 550, 557, 558, 559, 562, 566, 568, 570, 572, 575, 578, 579, 581, 584, 588, 596, 597, 599, 600, 603, 608, 615, 620, 622, 626, 630, 631, 634, 636, 644, 646, 648, 649, 650, 651, 658, 663, 664, 666, 668, 669, 672, 673, 678, 680, 683, 684, 687, 690, 695, 696, 698, 699,

## 4.2. state_info

In [19]:
df_test_original['state_info'] = df_test_original['state_info'].str.lower()
df_test_original['state_info'] = df_test_original['state_info'].apply(clean_state_info)


In [None]:
n = len(df_test_original)

processed_state_info = df_test_original['state_info'].copy()

processes_rows = []

for i in range(n):
    if pd.isna(df_test_original.loc[i, 'state_info']) or df_test_original.loc[i, 'state_info'].strip() == "":
        print(f"Processing row {i} of {n}")
        
        # Clasificar el estado
        classification = cliassify_state_info(df_test_original.loc[i])[0]
        
        # Almacenar el resultado en la lista
        processed_state_info[i] = classification
        processes_rows.append(i)

df_test_original['state_info'] = processed_state_info

df_test_original['state_info'] = df_test_original['state_info'].apply(
    lambda x: x if x in frequent_state_info else 'other'
)

print(f"Rows processed: {processes_rows}")

Processing row 0 of 3836
kasim-reed (political_organization) (affiliated with the democrat party) said: "Five members of [the Common Cause Georgia] board accepted maximum campaign contributions." The statement concerns the topics of: campaign-finance,ethics,government-regulation.
Processing row 6 of 3836
dick-armey (other) (affiliated with the republican party) said: "If you're over 65 years old in America today, you have no choice but to be in Medicare. Even if you want out of Medicare, you have to forfeit your Social Security to get out of it." The statement concerns the topics of: health-care.
Processing row 7 of 3836
sandra-stotsky (other) (affiliated with the none party) said: "Common Core expects English teachers to spend at least half of their reading instructional time at every grade level on informational texts." The statement concerns the topics of: corrections-and-updates,education.
Processing row 16 of 3836
forbes-blog (other) (affiliated with the none party) said: "The Uni

## 4.3 party_affiliation

In [None]:
n = len(df_test_original)
processes_rows = [] 

for i in range(n):
    if pd.isna(df_test_original.loc[i, 'party_affiliation']) or df_test_original.loc[i, 'party_affiliation'].strip() == "" or df_original.loc[i, 'party_affiliation'] == "none":
        classification = cliassify_party_affiliation(
            df_test_original.loc[i]
        )[0]

        df_test_original.loc[i, 'party_affiliation'] = classification

        processes_rows.append(i)
    else:
        df_test_original.loc[i, 'party_affiliation'] = group_party(df_test_original.loc[i, 'party_affiliation'])

print(f"Rows processed: {processes_rows}")

# 5. Exportar CSV

In [None]:
df_formated["speaker_job-llm"] = df_original["speaker_job"]
df_formated["state_info-llm"] = df_original["state_info"]
df_formated["party_affiliation-llm"] = df_original["party_affiliation"]
df_test_formated["speaker_job-llm"] = df_test_original["speaker_job"]
df_test_formated["state_info-llm"] = df_test_original["state_info"]
df_test_formated["party_affiliation-llm"] = df_test_original["party_affiliation"]

In [None]:
# Exportar el DataFrame 'dfn' a un archivo CSV
df_formated.to_csv('./formated/train_exportado_llm.csv', index=False)

df_test_formated.to_csv('./formated/test_exportado_llm.csv', index=False)

In [None]:
df_original["speaker_job_cod-llm"] = df_original['speaker_job-llm'].astype('category').cat.codes.astype('int64')
df_original["state_info_cod-llm"] = df_original['state_info-llm'].astype('category').cat.codes.astype('int64')
df_original["party_affiliation_cod-llm"] = df_original['party_affiliation-llm'].astype('category').cat.codes.astype('int64')

df_test_original["speaker_job_cod-llm"] = df_test_original['speaker_job-llm'].astype('category').cat.codes.astype('int64')
df_test_original["state_info_cod-llm"] = df_test_original['state_info-llm'].astype('category').cat.codes.astype('int64')
df_test_original["party_affiliation_cod-llm"] = df_test_original['party_affiliation-llm'].astype('category').cat.codes.astype('int64')

In [None]:
llm_train = pd.DataFrame({
    "speaker_job-llm": df_original["speaker_job-llm"],
    # "speaker_job_cod-llm": df_original["speaker_job_cod-llm"],
    "state_info-llm": df_original["state_info-llm"],
    # "state_info_cod-llm": df_original["state_info_cod-llm"],
    "party_affiliation-llm": df_original["party_affiliation-llm"],
    # "party_affiliation_cod-llm": df_original["party_affiliation_cod-llm"],
})

llm_test = pd.DataFrame({
    "speaker_job-llm": df_test_original["speaker_job-llm"],
    # "speaker_job_cod-llm": df_test_original["speaker_job_cod-llm"],
    "state_info-llm": df_test_original["state_info-llm"],
    # "state_info_cod-llm": df_test_original["state_info_cod-llm"],
    "party_affiliation-llm": df_test_original["party_affiliation-llm"],
    # "party_affiliation_cod-llm": df_test_original["party_affiliation_cod-llm"],
})


In [None]:
# Exportar el DataFrame 'dfn' a un archivo CSV
llm_train.to_csv('./formated/llm_train_backup.csv', index=False)

llm_test.to_csv('./formated/llm_test_backup.csv', index=False)