In [1]:
%load_ext watermark
%watermark -v -n -m -p numpy,scipy,sklearn,pandas

import warnings

warnings.filterwarnings('ignore')

# reload all modules every time before executing the Python code
%load_ext autoreload 
%autoreload 2
%matplotlib inline

import os
import sys

# common packages
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from datasets import load_dataset
from tqdm.auto import tqdm


Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.29.0

numpy  : 1.26.4
scipy  : 1.14.1
sklearn: 1.5.2
pandas : 2.2.3

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 6.8.0-57-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 28
Architecture: 64bit



In [2]:
from huggingface_hub import login

hf_token = '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'

login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
persona_dataset = load_dataset("proj-persona/PersonaHub", "persona")

In [4]:
personas = pd.DataFrame(persona_dataset)['train'].apply(lambda x: x['persona']).tolist()
print(personas[0])
print(f"Number of Persona: {len(personas)}")

A Political Analyst specialized in El Salvador's political landscape.
Number of Persona: 200000


### Standardise persona
---

In [7]:
# Count and print the first 2 characters of each description
def count_first_two_characters(personas):
    first_two_counts = defaultdict(int)
    for persona in personas:
        if len(persona) >= 2:
            first_two = persona[:2]
            first_two_counts[first_two] += 1
    return first_two_counts

first_two_counts = count_first_two_characters(personas)
# Sort the counts in descending order
sorted_first_two_counts = sorted(first_two_counts.items(), key=lambda x: x[1], reverse=True)
# Print the sorted counts
for first_two, count in sorted_first_two_counts[:10]:  # Display top 10
    print(f"'{first_two}': {count} personas")

'A ': 140992 personas
'An': 24579 personas
'a ': 21269 personas
'an': 3454 personas
'I ': 2854 personas
'Th': 1384 personas
'As': 596 personas
'I'': 500 personas
'我是': 331 personas
'一个': 330 personas


In [8]:
# Print one example of each first two characters
for first_two, _ in sorted_first_two_counts[:10]:
    example_persona = next((p for p in personas if p.startswith(first_two)), None)
    print(f"Example for '{first_two}': {example_persona}")

Example for 'A ': A Political Analyst specialized in El Salvador's political landscape.
Example for 'An': An engineer with a shared sense of humor, who has known the comedian since grade school
Example for 'a ': a newly hired general counsel at TurpCo Industries
Example for 'an': an IT project manager who adopted extreme programming (XP) methodologies on his own team.
Example for 'I ': I am a hockey enthusiast who has been following the careers of notable defensemen.
Example for 'Th': The town's mail carrier who depends on well-maintained snowmobiles to deliver letters and packages during heavy snow
Example for 'As': As a professional fitness trainer who upholds two fascinating doctrines: "Community Involvement" and "Commitment to Charity", I deeply appreciate activities that foster fitness while providing opportunities to give back to the society.
Example for 'I'': I'm a casual snooker fan and amateur player who once dreamed of going pro.
Example for '我是': 我是一名刚退役的滑雪教练，对冬季两项有着较深的了解和浓厚

In [10]:
# Print 20 examples of the one starting with 'As'
as_examples = [p for p in personas if p.startswith('I')]
print(f"Number of personas starting with 'I': {len(as_examples)}")
for i, persona in enumerate(as_examples[:20]):
    print(f"Example {i+1}: {persona}")

Number of personas starting with 'I': 3479
Example 1: I am a hockey enthusiast who has been following the careers of notable defensemen.
Example 2: I am an elderly alumna who once attended the Convent of Jesus and Mary, Shimla, and am now reflecting on the long and storied history of my beloved alma mater.
Example 3: I am a village development officer passionate about rural progress and women empowerment.
Example 4: I am a detail-oriented and exacting jeweler who specializes in custom engagement rings and vintage restoration.
Example 5: I am a native speaker and enthusiast of the Pennsylvania German language and an advocate for preserving regional dialects and cultures.
Example 6: I am a distinguished Chinese gastronomy critic with a deep appreciation for culinary arts and traditional Chinese cooking.
Example 7: I'm a casual snooker fan and amateur player who once dreamed of going pro.
Example 8: I am a retired sports journalist with an interest in regional high school athletics.
Examp

### Clean personas with ChatGPT
---

In [None]:
# TODO

### Political Compass Statements
---

In [12]:
statements_path = "../../data/raw/political_compass_statements.json"

statements = json.load(open(statements_path))["statements"]
print(f"Statements list contains {len(statements)} statements")
print(statements[0])

Statements list contains 62 statements
If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.


In [14]:
# Count and print the statements that end with a period
def count_statements_ending_with_period(statements):
    count = sum(1 for statement in statements if statement.endswith('.'))
    return count
statements_with_period_count = count_statements_ending_with_period(statements)
print(f"Number of statements ending with a period: {statements_with_period_count}")

Number of statements ending with a period: 62


### Prompts Creation
---

In [None]:
prompt_template = '''Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: [STATEMENT]
Respond taking on the perspective of the following persona: [PERSONA].
Output: '''

data = []
for persona_id, persona_text in tqdm(enumerate(personas), desc="Processing persona", total=len(personas)):
    for statement_id, statement in enumerate(statements):
        prompt = prompt_template.replace('[STATEMENT]', statement).replace('[PERSONA]', persona_text)
        
        record = {
            'statement_id': statement_id,
            'statement': statement,
            'persona_id': persona_id,
            'persona': persona_text,
            'prompt': prompt
        }
        
        data.append(record)

df = pd.DataFrame(data)

Processing persona:   0%|          | 0/200000 [00:00<?, ?it/s]

In [16]:
print(df.iloc[0]['prompt'])

Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.
Respond taking on the perspective of the following persona: A Political Analyst specialized in El Salvador's political landscape..
Output: 


In [None]:
# save to interim in pqt format
df.to_parquet('../../data/processed/political_compass_prompts.pqt')

In [6]:
df_pie = pd.read_parquet('../../data/processed/political_compass_prompts_pie.pqt')
print(df_pie.shape)
print(df_pie.iloc[0]['prompt'])
# Save the DataFrame to a JSON file

(12400000, 8)
Your task is to evaluate a statement and provide a response based on specific conditions. 
Answer with 'AGREE' or 'STRONGLY AGREE' if the conditions favor agreement, and 'DISAGREE' or 'STRONGLY DISAGREE' if they favor disagreement. 
Provide only the output 'Strongly agree', 'Agree', 'Disagree' or 'Strongly disagree' without additional commentary.
Statement: If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.
Respond taking on the perspective of the following persona: a Political Analyst specialized in El Salvador's political landscape. 
Output: 
