# 1. Importar Librerías

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime
import json


import matplotlib.pyplot as plt
import seaborn as sns


import spacy
# from spacy import displacy

import re
from multiprocessing import process
from transformers import pipeline


# 2. Carga del Dataset

In [None]:
# Train: raw training data; df_original is a working copy of the loaded frame.
url = "data/train.csv"
df = pd.read_csv(url)
df_original = df.copy()

# Previously exported, formatted training frame. It has to be read from
# `url2`; reading `url` here would just load the raw training CSV again.
# NOTE(review): the file name "train._exportado.csv" (dot before the
# underscore) does not follow the naming of the test file below -- confirm it
# matches the file on disk.
url2 = "formated/train._exportado.csv"
df_formated = pd.read_csv(url2)


# Test: raw test data and its formatted counterpart.
url3 = "data/test_nolabel.csv"
df_test_original = pd.read_csv(url3)

url4 = "formated/test_exportado.csv"
df_test_formated = pd.read_csv(url4)

# 3. LLM Features 

## 3.1 speaker_job

In [None]:
# Normalise speaker_job one step at a time: lower-case, trim the ends, then
# turn every run of whitespace into a single underscore.
df_original['speaker_job'] = df_original['speaker_job'].str.lower()
df_original['speaker_job'] = df_original['speaker_job'].str.strip()
df_original['speaker_job'] = df_original['speaker_job'].str.replace(r'\s+', '_', regex=True)

# Collect the most frequent normalised job titles (value_counts is already
# ordered from most to least frequent).
top_n = 13
speaker_job_counts = df_original['speaker_job'].value_counts()
frequent_speaker_jobs = set(speaker_job_counts.index[:top_n])
print(f"Frequent speaker jobs: {frequent_speaker_jobs}")

In [None]:
# Maps normalised speaker_job strings (lower-case, whitespace replaced by
# underscores) to a coarser job category.
# NOTE(review): in the code visible here only the *values* of this dict are
# used (as candidate labels for the zero-shot classifier); the mapping is not
# applied to existing speaker_job values -- confirm that is intended.
mapping_speaker_job_en = {
    "us_senator": "federal_legislator_senate",
    "president": "president",
    "governor": "governor",
    "us_representative": "federal_legislator_house",
    "president-elect": "president",
    "presidential_candidate": "presidential_candidate",
    "state_senator": "state_legislator_senate",
    "state_representative": "state_legislator_house",
    "former_governor": "governor",
    "senator": "federal_legislator_senate",  # Assuming federal context in absence of "state"
    "milwaukee_county_executive": "local_executive",
    "attorney": "legal_professional",
    "us_house_of_representatives": "federal_legislator_house",
    "social_media_posting": "public_communicator",  # Could be refined to "non_traditional_communicator" if relevant
    "governor_of_new_jersey": "governor",
    "congressman": "federal_legislator_house",
    "co-host_on_cnn's_\"crossfire\"": "public_communicator",
    "us_congressman": "federal_legislator_house",
    "congresswoman": "federal_legislator_house",
    "speaker_of_the_house_of_representatives": "federal_legislative_leader",
    "businessman": "business_professional",
    "governor_of_ohio_as_of_jan_10,_2011": "governor",
    "author": "public_communicator",
    "lawyer": "legal_professional",
    "candidate_for_us_senate_and_physician": "federal_legislative_candidate",
    "us_senator_from_ohio": "federal_legislator_senate",
    "lieutenant_governor": "state_executive",
    "house_majority_leader": "federal_legislative_leader",
    "us_representative,_florida_district_23": "federal_legislator_house",
    "consultant": "political_advisor",
    "mayor_of_providence": "local_executive",
    "former_president": "president",
    "columnist": "public_communicator",
    "political_action_committee": "political_organization",
    "radio_host": "public_communicator",
    "madison_school_board_member": "local_education_official",
    "secretary_of_state": "state_executive",
    "attorney_general": "state_executive",
    "us_house_member": "federal_legislator_house",
    "msnbc_host": "public_communicator",
    "senate_democratic_leader": "federal_legislative_leader",
    "senate_minority_leader": "federal_legislative_leader",
    "ohio_treasurer": "state_executive",
    "state_assemblyman": "state_legislator_house",
    "chairman,_republican_national_committee": "party_leader",
    "us_representative,_florida_district_22": "federal_legislator_house",
    "house_minority_leader": "federal_legislative_leader",
    "mayor_of_milwaukee": "local_executive",
    "lieutenant_governor-elect": "state_executive",
    "philanthropist": "public_communicator" # Could be considered "influencer" or "social_actor"
}

In [None]:
# Candidate labels for the zero-shot classifier: the distinct target
# categories found in mapping_speaker_job_en. The labels are sorted so the
# list order (and the printed output) is the same on every run; iterating a
# set of strings directly can give a different order per interpreter session
# because of string hash randomisation.
unique_mappings_speaker_job = sorted(set(mapping_speaker_job_en.values()))
print(unique_mappings_speaker_job)

In [None]:
# Zero-shot classifier created with the transformers `pipeline` factory on the
# 'roberta-large-mnli' model. The classification helpers in this notebook call
# it through the module-level name `classifier`.
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')


def crear_frase_detallada_speaker_job(row):
    """Build the sentence given to the classifier when inferring a speaker's job.

    ``row`` is indexed with the keys 'speaker', 'state_info',
    'party_affiliation', 'statement' and 'subject'. The speaker name and the
    quoted statement are always present in the result; the state, party and
    subject fragments are added only when ``pd.notna`` reports a value for
    them. 'speaker_job' is not read: it is the field being predicted.

    Returns:
        The fragments joined with single spaces.
    """
    pieces = [f"{row['speaker']}"]

    state = row['state_info']
    pieces += [f"from {state}"] if pd.notna(state) else []

    party = row['party_affiliation']
    pieces += [f"(affiliated with the {party} party)"] if pd.notna(party) else []

    pieces.append(f"said: \"{row['statement']}\"")

    topics = row['subject']
    pieces += [f"The statement concerns the topics of: {topics}."] if pd.notna(topics) else []

    return " ".join(pieces)

def cliassify_speaker_job(row):
    """Predict a speaker-job category for ``row`` with the zero-shot classifier.

    The row is rendered to text by ``crear_frase_detallada_speaker_job`` and
    scored against the labels in ``unique_mappings_speaker_job``.

    Returns:
        A ``(label, score)`` tuple built from the first entries of the
        classifier's 'labels' and 'scores' lists, or ``("other", 0.0)`` when
        that score is below 0.2.
    """
    sentence = crear_frase_detallada_speaker_job(row)
    prediction = classifier(sentence, unique_mappings_speaker_job)

    # Index 0 is taken as the best-scoring label.
    top_label = prediction['labels'][0]
    top_score = prediction['scores'][0]

    # Low-confidence predictions fall back to a catch-all bucket.
    if top_score < 0.2:
        return "other", 0.0
    return top_label, top_score

In [None]:
# Preview the first three rows of the working training frame (shown as the
# cell output when this is the last expression of a notebook cell).
df_original.head(3)

In [None]:
# Fill speaker_job on the training frame: every row whose value is missing
# (NaN) or blank receives the zero-shot prediction; other rows are untouched.
n = len(df_original)
processes_rows = []

for i in range(n):
    if pd.notna(df_original.loc[i, 'speaker_job']) and df_original.loc[i, 'speaker_job'].strip() != "":
        continue  # the row already has a usable value

    # Keep only the predicted label; the score is discarded.
    classification = cliassify_speaker_job(df_original.loc[i])[0]
    df_original.loc[i, 'speaker_job'] = classification
    processes_rows.append(i)

print(f"Rows processed: {processes_rows}")

## 3.2 state_info

In [None]:
def clean_state_info(text):
    """Strip punctuation and surrounding whitespace from a state string.

    Every character that is neither a word character nor whitespace is
    removed, then leading/trailing whitespace is trimmed. Values that are not
    ``str`` (e.g. NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\w\s]', '', text).strip()


In [None]:
# Lower-case state_info, then remove punctuation and outer whitespace with
# clean_state_info.
df_original['state_info'] = (
    df_original['state_info']
    .str.lower()
    .apply(clean_state_info)
)

# The 15 most frequent cleaned values form the set of known states
# (value_counts is already ordered from most to least frequent).
top_n = 15
state_info_counts = df_original['state_info'].value_counts()
frequent_state_info = set(state_info_counts.index[:top_n])
print(f"Frequent state info: {frequent_state_info}")

In [None]:
# Zero-shot classifier ('roberta-large-mnli') used by the state_info helpers
# through the module-level name `classifier`.
# NOTE(review): this rebinds `classifier` with the same arguments as the
# pipeline created in the speaker_job section -- presumably so this cell can
# be run by itself; confirm, since it builds the pipeline again.
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')

def crear_frase_detallada_state_info(row):
    """Build the sentence given to the classifier when inferring a speaker's state.

    ``row`` is indexed with the keys 'speaker', 'speaker_job',
    'party_affiliation', 'statement' and 'subject'. The speaker name and the
    quoted statement are always present in the result; the job, party and
    subject fragments are added only when ``pd.notna`` reports a value for
    them. 'state_info' is not read: it is the field being predicted.

    Returns:
        The fragments joined with single spaces.
    """
    pieces = [f"{row['speaker']}"]

    job = row['speaker_job']
    pieces += [f"({job})"] if pd.notna(job) else []

    party = row['party_affiliation']
    pieces += [f"(affiliated with the {party} party)"] if pd.notna(party) else []

    pieces.append(f"said: \"{row['statement']}\"")

    topics = row['subject']
    pieces += [f"The statement concerns the topics of: {topics}."] if pd.notna(topics) else []

    return " ".join(pieces)



def cliassify_state_info(row):
    """Predict a state for ``row`` with the zero-shot classifier.

    The row is rendered to text by ``crear_frase_detallada_state_info``
    (the text is also printed) and scored against the values in
    ``frequent_state_info``.

    Returns:
        A ``(label, score)`` tuple built from the first entries of the
        classifier's 'labels' and 'scores' lists, or ``("other", 0.0)`` when
        that score is below 0.2.
    """
    sentence = crear_frase_detallada_state_info(row)
    print(sentence)

    prediction = classifier(sentence, list(frequent_state_info))

    # Index 0 is taken as the best-scoring label.
    top_label = prediction['labels'][0]
    top_score = prediction['scores'][0]

    # Low-confidence predictions fall back to a catch-all bucket.
    if top_score < 0.2:
        return "other", 0.0
    return top_label, top_score

In [None]:
# Fill missing/blank state_info values on the training frame with the
# zero-shot prediction, then restrict the column to the frequent states.
n = len(df_original)

# Predictions are collected in a copy; the frame is updated after the loop.
processed_state_info = df_original['state_info'].copy()
processes_rows = []

for i in range(n):
    if pd.notna(df_original.loc[i, 'state_info']) and df_original.loc[i, 'state_info'].strip() != "":
        continue  # the row already has a usable value

    print(f"Processing row {i} of {n}")

    # Classify the state for this row and store the label in the copy.
    classification = cliassify_state_info(df_original.loc[i])[0]
    processed_state_info[i] = classification
    processes_rows.append(i)

# Anything outside the frequent set (including the "other" fallback label)
# ends up as 'other'.
df_original['state_info'] = processed_state_info.apply(
    lambda x: x if x in frequent_state_info else 'other'
)

print(f"Rows processed: {processes_rows}")

## 3.3 party_affiliation

In [None]:
# Raw party_affiliation value -> coarse party group. Entries are listed in the
# same order as before, grouped by their target category. 'none' has no entry
# here, so group_party sends it to 'other'.
mapping = {
    # Major parties
    'republican': 'republican',
    **dict.fromkeys(
        ['democrat', 'democratic-farmer-labor'],
        'democrat',
    ),
    # Independents
    'independent': 'independent_None',
    # Media
    **dict.fromkeys(
        ['newsmaker', 'journalist', 'columnist', 'activist', 'talk-show-host'],
        'media',
    ),
    # Third parties
    **dict.fromkeys(
        ['libertarian', 'green', 'constitution-party', 'liberal-party-canada'],
        'third_Party',
    ),
    # Organizations
    'organization': 'organization',
    # Officials
    **dict.fromkeys(
        [
            'state-official',
            'business-leader',
            'labor-leader',
            'education-official',
            'government-body',
        ],
        'official',
    ),
    # Tea Party (grouped with republican)
    **dict.fromkeys(
        ['tea-party-member', 'ocean-state-tea-party-action'],
        'republican',
    ),
}


def group_party(x):
    """Return the coarse party group for ``x``, or 'other' when it is unmapped."""
    return mapping[x] if x in mapping else 'other'



In [None]:
# Candidate labels for the zero-shot classifier: the distinct party groups
# found in `mapping`. The labels are sorted so the list order (and the printed
# output) is the same on every run; iterating a set of strings directly can
# give a different order per interpreter session because of string hash
# randomisation.
unique_mappings_party = sorted(set(mapping.values()))
print(unique_mappings_party)

In [None]:
# Zero-shot classifier ('roberta-large-mnli') used by the party_affiliation
# helpers through the module-level name `classifier`.
# NOTE(review): this rebinds `classifier` with the same arguments as the
# pipelines created in the earlier sections -- presumably so this cell can be
# run by itself; confirm, since it builds the pipeline again.
classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')

def crear_frase_detallada_party(row):
    """Build the sentence given to the classifier when inferring a party group.

    ``row`` is indexed with the keys 'speaker', 'speaker_job', 'state_info',
    'statement' and 'subject'. The speaker name and the quoted statement are
    always present in the result; the job, state and subject fragments are
    added only when ``pd.notna`` reports a value for them.
    'party_affiliation' is not read: it is the field being predicted.

    Returns:
        The fragments joined with single spaces.
    """
    pieces = [f"{row['speaker']}"]

    job = row['speaker_job']
    pieces += [f"({job})"] if pd.notna(job) else []

    state = row['state_info']
    pieces += [f"from {state}"] if pd.notna(state) else []

    pieces.append(f"said: \"{row['statement']}\"")

    topics = row['subject']
    pieces += [f"The statement concerns the topics of: {topics}."] if pd.notna(topics) else []

    return " ".join(pieces)

def cliassify_party_affiliation(row):
    """Predict a party group for ``row`` with the zero-shot classifier.

    The row is rendered to text by ``crear_frase_detallada_party`` and scored
    against the labels in ``unique_mappings_party``.

    Returns:
        A ``(label, score)`` tuple built from the first entries of the
        classifier's 'labels' and 'scores' lists, or ``("other", 0.0)`` when
        that score is below 0.2.
    """
    sentence = crear_frase_detallada_party(row)
    prediction = classifier(sentence, unique_mappings_party)

    # Index 0 is taken as the best-scoring label.
    top_label = prediction['labels'][0]
    top_score = prediction['scores'][0]

    # Low-confidence predictions fall back to a catch-all bucket.
    if top_score < 0.2:
        return "other", 0.0
    return top_label, top_score

In [None]:
# Group party_affiliation on the training frame: rows with a usable value are
# collapsed with group_party; rows whose value is missing, blank or the
# literal "none" receive a zero-shot prediction instead.
n = len(df_original)
processes_rows = []

for i in range(n):
    if (
        pd.notna(df_original.loc[i, 'party_affiliation'])
        and df_original.loc[i, 'party_affiliation'].strip() != ""
        and df_original.loc[i, 'party_affiliation'] != "none"
    ):
        df_original.loc[i, 'party_affiliation'] = group_party(df_original.loc[i, 'party_affiliation'])
        continue

    # Keep only the predicted label; the score is discarded.
    classification = cliassify_party_affiliation(df_original.loc[i])[0]
    df_original.loc[i, 'party_affiliation'] = classification
    processes_rows.append(i)

print(f"Rows processed: {processes_rows}")

# 4. Test

## 4.1 speaker_job

In [None]:
# Normalise speaker_job on the test frame one step at a time: lower-case,
# trim the ends, then turn every run of whitespace into a single underscore.
df_test_original['speaker_job'] = df_test_original['speaker_job'].str.lower()
df_test_original['speaker_job'] = df_test_original['speaker_job'].str.strip()
df_test_original['speaker_job'] = df_test_original['speaker_job'].str.replace(r'\s+', '_', regex=True)

In [None]:
# Fill speaker_job on the test frame: every row whose value is missing (NaN)
# or blank receives the zero-shot prediction; other rows are untouched.
n = len(df_test_original)
processes_rows = []

for i in range(n):
    if pd.notna(df_test_original.loc[i, 'speaker_job']) and df_test_original.loc[i, 'speaker_job'].strip() != "":
        continue  # the row already has a usable value

    # Keep only the predicted label; the score is discarded.
    classification = cliassify_speaker_job(df_test_original.loc[i])[0]
    df_test_original.loc[i, 'speaker_job'] = classification
    processes_rows.append(i)

print(f"Rows processed: {processes_rows}")

## 4.2 state_info

In [None]:
# Lower-case state_info on the test frame, then remove punctuation and outer
# whitespace with clean_state_info.
df_test_original['state_info'] = (
    df_test_original['state_info']
    .str.lower()
    .apply(clean_state_info)
)


In [None]:
# Fill missing/blank state_info values on the test frame with the zero-shot
# prediction, then restrict the column to the frequent states
# (frequent_state_info, computed from the training frame).
n = len(df_test_original)

# Predictions are collected in a copy; the frame is updated after the loop.
processed_state_info = df_test_original['state_info'].copy()
processes_rows = []

for i in range(n):
    if pd.notna(df_test_original.loc[i, 'state_info']) and df_test_original.loc[i, 'state_info'].strip() != "":
        continue  # the row already has a usable value

    print(f"Processing row {i} of {n}")

    # Classify the state for this row and store the label in the copy.
    classification = cliassify_state_info(df_test_original.loc[i])[0]
    processed_state_info[i] = classification
    processes_rows.append(i)

# Anything outside the frequent set (including the "other" fallback label)
# ends up as 'other'.
df_test_original['state_info'] = processed_state_info.apply(
    lambda x: x if x in frequent_state_info else 'other'
)

print(f"Rows processed: {processes_rows}")

## 4.3 party_affiliation

In [None]:
# Group party_affiliation on the test frame: rows whose value is missing,
# blank or the literal "none" receive a zero-shot prediction; every other row
# is collapsed with group_party.
n = len(df_test_original)
processes_rows = []

for i in range(n):
    party = df_test_original.loc[i, 'party_affiliation']

    # The "none" check has to look at the test frame itself. Looking up row i
    # of the training frame compares an unrelated row, and raises KeyError
    # when the test frame has labels the training frame lacks.
    if pd.isna(party) or party.strip() == "" or party == "none":
        # Keep only the predicted label; the score is discarded.
        classification = cliassify_party_affiliation(df_test_original.loc[i])[0]
        df_test_original.loc[i, 'party_affiliation'] = classification
        processes_rows.append(i)
    else:
        df_test_original.loc[i, 'party_affiliation'] = group_party(party)

print(f"Rows processed: {processes_rows}")

# 5. Exportar CSV

In [None]:
# Copy the completed columns from the raw frames onto the formatted frames
# under "-llm" names (pandas aligns the assigned Series on the row index).
for target, source in (
    ("speaker_job-llm", "speaker_job"),
    ("state_info-llm", "state_info"),
    ("party_affiliation-llm", "party_affiliation"),
):
    df_formated[target] = df_original[source]

for target, source in (
    ("speaker_job-llm", "speaker_job"),
    ("state_info-llm", "state_info"),
    ("party_affiliation-llm", "party_affiliation"),
):
    df_test_formated[target] = df_test_original[source]

In [None]:
# Write the formatted train and test frames (now carrying the "-llm" columns)
# to CSV without the index column.
for frame, destination in (
    (df_formated, './formated/train_exportado_llm.csv'),
    (df_test_formated, './formated/test_exportado_llm.csv'),
):
    frame.to_csv(destination, index=False)