In [8]:
from models import (get_gpt_argument)
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [2]:
def analyze_df(df):
    """Report how rows are distributed across claims and sources.

    Prints a short human-readable summary and returns the same numbers as
    plain Python containers.

    Args:
        df: DataFrame that has at least a 'claim' and a 'source' column.

    Returns:
        dict with three entries:
            'total_unique_claims': number of distinct claims counted by
                ``value_counts`` (missing values are not counted).
            'claim_frequency_distribution': ``{rows_per_claim: n_claims}``
                with native ``int`` keys and values.
            'source_counts': ``{source: n_rows}``.

    Raises:
        ValueError: if 'claim' or 'source' is not a column of ``df``.
    """
    # Fail early with a clear message instead of a KeyError further down.
    if 'claim' not in df.columns or 'source' not in df.columns:
        raise ValueError("CSV must contain 'claim' and 'source' columns")

    # Rows per claim, most frequent first.
    claim_counts = df['claim'].value_counts()

    # How many claims share each row count. ``tolist()`` converts the numpy
    # integers to native ints so the returned dict has plain-int keys
    # (numpy.int64 keys are rejected by json.dumps, for example).
    claim_frequency_distribution = Counter(claim_counts.tolist())

    # Rows per source.
    source_counts = df['source'].value_counts()

    total_unique_claims = len(claim_counts)

    # Display results
    print(f"Total unique claims: {total_unique_claims}")
    print("Claim frequency distribution:")
    for num_rows, num_claims in claim_frequency_distribution.items():
        print(f"{num_claims} claims have {num_rows} rows")

    print("\nSource counts:")
    print(source_counts)

    return {
        'total_unique_claims': total_unique_claims,
        'claim_frequency_distribution': dict(claim_frequency_distribution),
        'source_counts': source_counts.to_dict()
    }



In [3]:
# Load the persuasion dataset and keep only usable, distinct arguments.
# The path uses a forward slash: the original '\p' is an invalid escape
# sequence (a warning on recent Python versions) and a backslash separator
# only resolves on Windows.
df = (
    pd.read_csv('anthropic_persuasion/persuasion_data.csv')
    # 'worker_id' is not needed downstream; ignore it if already absent.
    .drop(columns=['worker_id'], errors='ignore')
    # Exclude the rows whose source is 'Control'.
    .loc[lambda frame: frame['source'] != 'Control']
    # One row per distinct argument text, then drop rows without an argument.
    .drop_duplicates(subset=['argument'])
    .dropna(subset=['argument'])
)
result = analyze_df(df)

Total unique claims: 56
Claim frequency distribution:
2 claims have 26 rows
54 claims have 23 rows

Source counts:
source
Claude 2              224
Claude 3 Haiku        224
Claude 3 Opus         224
Claude Instant 1.2    224
Claude 1.3            224
Human                 174
Name: count, dtype: int64


In [4]:
# Distinct claim texts, in order of first appearance in df, as a plain list.
claim_column = df['claim']
unique_claims = claim_column.unique().tolist()

In [12]:
# Sanity check: how many distinct claims will be sent to the models.
len(unique_claims)

56

# Getting GPT arguments

## GPT4

In [6]:
# Single trial call on the first claim before launching the full batch.
gpt_arg_compelling = get_gpt_argument(
    unique_claims[0],
    'compelling',
    "gpt-4-1106-preview",
)

In [9]:
# GPT-4, 'compelling' prompt: one generated argument per unique claim.
# The list is filled incrementally so results collected so far survive an
# interrupted run.
claim_type = 'compelling'
gpt4_compelling = []
for claim in tqdm(unique_claims, desc="Processing Items"):
    gpt4_compelling.append(
        get_gpt_argument(claim, claim_type, "gpt-4-1106-preview")
    )

Processing Items: 100%|██████████| 56/56 [10:00<00:00, 10.73s/it]


In [15]:
# Append one GPT-4 'Compelling Case' row per claim.
# NOTE: re-running this cell appends the same rows to df again.

# The arguments come from an earlier cell; make sure they still line up
# one-to-one with the claims before pairing them.
if len(gpt4_compelling) != len(unique_claims):
    raise ValueError(
        f"Expected {len(unique_claims)} arguments, got {len(gpt4_compelling)}"
    )

# Build every new row first and concatenate once; calling pd.concat inside
# the loop copies the whole frame on each iteration.
gpt4_compelling_rows = pd.DataFrame([
    {
        'claim': claim,
        'argument': argument,
        'source': 'GPT4',
        'prompt_type': 'Compelling Case'
    }
    for claim, argument in zip(unique_claims, gpt4_compelling)
])
df = pd.concat([df, gpt4_compelling_rows], ignore_index=True)


In [18]:
# GPT-4, 'role-play' prompt: one generated argument per unique claim.
# The list is filled incrementally so results collected so far survive an
# interrupted run.
claim_type = 'role-play'
gpt4_role = []
for claim in tqdm(unique_claims, desc="Processing Items"):
    gpt4_role.append(
        get_gpt_argument(claim, claim_type, "gpt-4-1106-preview")
    )

Processing Items: 100%|██████████| 56/56 [09:54<00:00, 10.62s/it]


In [19]:
# Append one GPT-4 'Expert Writer Rhetorics' row per claim.
# NOTE: re-running this cell appends the same rows to df again.

# The arguments come from an earlier cell; make sure they still line up
# one-to-one with the claims before pairing them.
if len(gpt4_role) != len(unique_claims):
    raise ValueError(
        f"Expected {len(unique_claims)} arguments, got {len(gpt4_role)}"
    )

# Build every new row first and concatenate once; calling pd.concat inside
# the loop copies the whole frame on each iteration.
gpt4_role_rows = pd.DataFrame([
    {
        'claim': claim,
        'argument': argument,
        'source': 'GPT4',
        'prompt_type': 'Expert Writer Rhetorics'
    }
    for claim, argument in zip(unique_claims, gpt4_role)
])
df = pd.concat([df, gpt4_role_rows], ignore_index=True)


In [21]:
# GPT-4, 'logical' prompt: generate one argument per unique claim, then add
# the results to df as 'Logical Reasoning' rows.
# NOTE: re-running this cell appends the same rows to df again.
claim_type = 'logical'
gpt4_logical = []
for claim in tqdm(unique_claims, desc="Processing Items"):
    gpt4_logical.append(
        get_gpt_argument(claim, claim_type, "gpt-4-1106-preview")
    )

# Build every new row first and concatenate once; calling pd.concat inside
# the loop copies the whole frame on each iteration.
gpt4_logical_rows = pd.DataFrame([
    {
        'claim': claim,
        'argument': argument,
        'source': 'GPT4',
        'prompt_type': 'Logical Reasoning'
    }
    for claim, argument in zip(unique_claims, gpt4_logical)
])
df = pd.concat([df, gpt4_logical_rows], ignore_index=True)

Processing Items: 100%|██████████| 56/56 [09:11<00:00,  9.86s/it]


In [23]:
# Save df as JSON Lines (one record per line).
# Forward slash in the path: the original '\g' is an invalid escape sequence
# and a backslash separator only resolves on Windows.
df.to_json('anthropic_persuasion/gpt4_data_clean.json', orient='records', lines=True)

## GPT3.5

In [24]:
# GPT-3.5, 'compelling' prompt: generate one argument per unique claim, then
# add the results to df as 'Compelling Case' rows.
# NOTE: re-running this cell appends the same rows to df again.
claim_type = 'compelling'
gpt35_compelling = []
for claim in tqdm(unique_claims, desc="Processing Items"):
    gpt35_compelling.append(
        get_gpt_argument(claim, claim_type, "gpt-3.5-turbo-1106")
    )

# Build every new row first and concatenate once; calling pd.concat inside
# the loop copies the whole frame on each iteration.
gpt35_compelling_rows = pd.DataFrame([
    {
        'claim': claim,
        'argument': argument,
        'source': 'GPT3.5',
        'prompt_type': 'Compelling Case'
    }
    for claim, argument in zip(unique_claims, gpt35_compelling)
])
df = pd.concat([df, gpt35_compelling_rows], ignore_index=True)


Processing Items: 100%|██████████| 56/56 [02:35<00:00,  2.78s/it]


In [26]:
# GPT-3.5, 'role-play' prompt: generate one argument per unique claim, then
# add the results to df as 'Expert Writer Rhetorics' rows.
# NOTE: re-running this cell appends the same rows to df again.
claim_type = 'role-play'
gpt35_role = []
for claim in tqdm(unique_claims, desc="Processing Items"):
    gpt35_role.append(
        get_gpt_argument(claim, claim_type, "gpt-3.5-turbo-1106")
    )

# Build every new row first and concatenate once; calling pd.concat inside
# the loop copies the whole frame on each iteration.
gpt35_role_rows = pd.DataFrame([
    {
        'claim': claim,
        'argument': argument,
        'source': 'GPT3.5',
        'prompt_type': 'Expert Writer Rhetorics'
    }
    for claim, argument in zip(unique_claims, gpt35_role)
])
df = pd.concat([df, gpt35_role_rows], ignore_index=True)


Processing Items: 100%|██████████| 56/56 [02:57<00:00,  3.17s/it]


In [28]:
# GPT-3.5, 'logical' prompt: generate one argument per unique claim, then
# add the results to df as 'Logical Reasoning' rows.
# NOTE: re-running this cell appends the same rows to df again.
claim_type = 'logical'
gpt35_logical = []
for claim in tqdm(unique_claims, desc="Processing Items"):
    gpt35_logical.append(
        get_gpt_argument(claim, claim_type, "gpt-3.5-turbo-1106")
    )

# Build every new row first and concatenate once; calling pd.concat inside
# the loop copies the whole frame on each iteration.
gpt35_logical_rows = pd.DataFrame([
    {
        'claim': claim,
        'argument': argument,
        'source': 'GPT3.5',
        'prompt_type': 'Logical Reasoning'
    }
    for claim, argument in zip(unique_claims, gpt35_logical)
])
df = pd.concat([df, gpt35_logical_rows], ignore_index=True)

Processing Items: 100%|██████████| 56/56 [02:28<00:00,  2.66s/it]


In [29]:
# Save df as JSON Lines (one record per line).
# Forward slash in the path: the original '\g' is an invalid escape sequence
# and a backslash separator only resolves on Windows.
df.to_json('anthropic_persuasion/gpt_data_clean.json', orient='records', lines=True)

In [31]:
# Re-run the summary on df now that the generated rows have been appended.
result = analyze_df(df)

Total unique claims: 56
Claim frequency distribution:
2 claims have 32 rows
54 claims have 29 rows

Source counts:
source
Claude 2              224
Claude 3 Haiku        224
Claude 3 Opus         224
Claude Instant 1.2    224
Claude 1.3            224
Human                 174
GPT4                  168
GPT3.5                168
Name: count, dtype: int64
