In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os

# Directory that receives every figure and the text summary produced by this cell.
output_folder = "prism_analysis_results"
# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(output_folder, exist_ok=True)

# Load the sampled PRISM conversations; the prints below treat each row as
# carrying a conversation_id and the user_id of the participant.
sample_prism = pd.read_csv('important_data_v3_metrics.csv')

print(f"PRISM Sample Dataset Shape: {sample_prism.shape}")
print(f"Number of unique conversations: {sample_prism['conversation_id'].nunique()}")
print(f"Number of unique participants: {sample_prism['user_id'].nunique()}")

# Number of distinct conversations contributed by each participant.
convo_per_user = sample_prism.groupby('user_id')['conversation_id'].nunique()
print("\nConversations per participant:")
print(f"  Mean: {convo_per_user.mean():.2f}")
print(f"  Median: {convo_per_user.median():.2f}")
print(f"  Min: {convo_per_user.min()}")
print(f"  Max: {convo_per_user.max()}")

# Conversation count every participant is checked against.
EXPECTED_CONVOS_PER_USER = 6

# Element-wise comparison instead of Series.equals() against a synthetic
# Series: equals() also requires identical dtypes, so it could report a
# mismatch even when every value is the expected count.
off_target = convo_per_user != EXPECTED_CONVOS_PER_USER
if not off_target.any():
    print(f"✓ Confirmed: All participants have exactly {EXPECTED_CONVOS_PER_USER} conversations.")
else:
    print(f"⚠ Note: Not all participants have exactly {EXPECTED_CONVOS_PER_USER} conversations.")
    print(f"Participants with != {EXPECTED_CONVOS_PER_USER} conversations: {off_target.sum()}")

# Participant attributes to profile; any name missing from the frame is skipped.
demographic_vars = [
    'gender', 'age', 'education', 'employment_status',
    'marital_status', 'english_proficiency', 'lm_familiarity',
    'lm_frequency_use', 'location', 'study_locale', 'religion', 'ethnicity',
]

print("\n=== DEMOGRAPHIC DISTRIBUTIONS ===")
for demographic in demographic_vars:
    if demographic not in sample_prism.columns:
        continue

    # One value per participant (first non-null), so people with several
    # conversations are not counted more than once.
    per_user_value = sample_prism.groupby('user_id')[demographic].first()
    dist = per_user_value.value_counts(normalize=True)
    print(f"\n{demographic.upper()} DISTRIBUTION:")
    print(dist)

    # Bar chart of the proportions, written to the output folder.
    fig, ax = plt.subplots(figsize=(10, 6))
    dist.plot(kind='bar', ax=ax)
    ax.set_title(f'{demographic.title()} Distribution')
    ax.set_ylabel('Proportion')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f'{demographic}_distribution.png'))
    plt.close(fig)

def summarize_metric(frame, metric, folder):
    """Print summary statistics for one numeric column and save its histogram.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the column ``metric``.
    metric : str
        Name of the numeric column to describe; also used in the printed
        header, the plot title and the output file name.
    folder : str
        Directory in which ``<metric>_distribution.png`` is written.
    """
    values = frame[metric]
    print(f"\n{metric.upper()}:")
    print(f"  Mean: {values.mean():.2f}")
    print(f"  Median: {values.median():.2f}")
    print(f"  Std Dev: {values.std():.2f}")
    print(f"  Min: {values.min():.2f}")
    print(f"  Max: {values.max():.2f}")

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(values, kde=True, ax=ax)
    ax.set_title(f'Distribution of {metric.replace("_", " ").title()}')
    fig.tight_layout()
    fig.savefig(os.path.join(folder, f'{metric}_distribution.png'))
    plt.close(fig)


print("\n=== CONVERSATION CHARACTERISTICS ===")

if 'conversation_history' in sample_prism.columns:
    # Newline count + 1, i.e. every line of the history is counted as a turn.
    # NOTE(review): assumes turns are newline-separated -- confirm against the data.
    sample_prism['num_turns'] = sample_prism['conversation_history'].str.count('\n') + 1
    # Whitespace-delimited token count of the whole history.
    sample_prism['convo_length_words'] = sample_prism['conversation_history'].str.split().str.len()
    sample_prism['avg_turn_length'] = sample_prism['convo_length_words'] / sample_prism['num_turns']
    convo_metrics = ['num_turns', 'convo_length_words', 'avg_turn_length']
    for metric in convo_metrics:
        summarize_metric(sample_prism, metric, output_folder)

if 'opening_prompt' in sample_prism.columns:
    print("\n=== OPENING PROMPT ANALYSIS ===")
    # Length in characters and in whitespace-delimited tokens.
    sample_prism['opening_prompt_length'] = sample_prism['opening_prompt'].str.len()
    sample_prism['opening_prompt_words'] = sample_prism['opening_prompt'].str.split().str.len()

    prompt_metrics = ['opening_prompt_length', 'opening_prompt_words']
    for metric in prompt_metrics:
        summarize_metric(sample_prism, metric, output_folder)

if 'opening_prompt' in sample_prism.columns:
    print("\n=== TOPIC ANALYSIS ===")

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    # Bag-of-words over the opening prompts; missing prompts become empty documents.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english', max_features=1000)
    dtm = vectorizer.fit_transform(sample_prism['opening_prompt'].fillna(''))
    features = vectorizer.get_feature_names_out()

    # Simple word frequency analysis.
    # Column sums are flattened to 1-D explicitly so the table is built the
    # same way whether the sparse sum comes back as a 2-D np.matrix or as a
    # plain 1-D array.
    word_freq = pd.DataFrame(
        {'frequency': np.asarray(dtm.sum(axis=0)).ravel()},
        index=features,
    )
    top_words = word_freq.sort_values('frequency', ascending=False).head(20)

    print("\nTop 20 words in opening prompts:")
    print(top_words)

    # The word cloud is optional: it is skipped when the package is missing.
    try:
        from wordcloud import WordCloud

        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
            word_freq['frequency'].to_dict())

        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(output_folder, 'wordcloud_topics.png'))
        plt.close()
    except ImportError:
        print("WordCloud package not installed. Skipping word cloud visualization.")

    # Topic modelling is best-effort: any failure is reported and the rest of
    # the analysis continues.
    try:
        lda = LatentDirichletAllocation(n_components=5, random_state=42)
        lda.fit(dtm)

        print("\nTop words per topic:")
        for topic_idx, topic in enumerate(lda.components_):
            # Indices of the 10 highest-weighted terms, strongest first.
            top_features_idx = topic.argsort()[:-11:-1]
            top_features = [features[i] for i in top_features_idx]
            print(f"Topic {topic_idx+1}: {', '.join(top_features)}")
    except Exception as e:
        print(f"Topic modeling skipped: {e}")

print("\n=== DEMOGRAPHIC BALANCE CHECK ===")
for demo1, demo2 in [('gender', 'age'), ('gender', 'education'), ('age', 'education')]:
    # Skip the pair unless both attributes are present in the frame.
    if not (demo1 in sample_prism.columns and demo2 in sample_prism.columns):
        continue

    # One row per participant (first non-null value of each attribute).
    user_demos = sample_prism.groupby('user_id')[[demo1, demo2]].first()

    # Joint distribution; normalize='all' makes the cells sum to 1 over the table.
    cross_tab = pd.crosstab(
        index=user_demos[demo1],
        columns=user_demos[demo2],
        normalize='all',
    )

    print(f"\nCross-tabulation of {demo1.title()} and {demo2.title()}:")
    print(cross_tab)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(cross_tab, annot=True, cmap='YlGnBu', fmt='.2%', ax=ax)
    ax.set_title(f'Distribution of Participants: {demo1.title()} vs {demo2.title()}')
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f'{demo1}_{demo2}_heatmap.png'))
    plt.close(fig)

if 'lm_familiarity' in sample_prism.columns:
    # One familiarity answer per participant (first non-null value).
    user_lm_familiarity = sample_prism.groupby('user_id')['lm_familiarity'].first()
    familiarity_counts = user_lm_familiarity.value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.pie(
        familiarity_counts,
        labels=familiarity_counts.index,
        autopct='%1.1f%%',
    )
    ax.set_title('LLM Familiarity Distribution')
    # Equal aspect ratio so the pie is drawn as a circle.
    ax.axis('equal')
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, 'lm_familiarity_pie.png'))
    plt.close(fig)

# Plain-text report: dataset sizes, headline demographic splits, and the list
# of figure files written to the output folder.
summary_path = os.path.join(output_folder, 'prism_sample_analysis_summary.txt')

# Explicit UTF-8 so category labels with non-ASCII characters are written the
# same way regardless of the platform's default encoding.
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("PRISM Sample Analysis Summary\n")
    f.write("============================\n\n")
    f.write(f"Dataset size: {sample_prism.shape[0]} rows\n")
    f.write(f"Unique participants: {sample_prism['user_id'].nunique()}\n")
    f.write(f"Unique conversations: {sample_prism['conversation_id'].nunique()}\n")
    f.write(f"Average conversations per participant: {convo_per_user.mean():.2f}\n\n")

    f.write("Key demographic distributions:\n")
    for demographic in ['gender', 'age', 'education']:
        if demographic in sample_prism.columns:
            f.write(f"\n{demographic.upper()}:\n")
            # Per-participant proportions (first non-null value per user).
            dist = sample_prism.groupby('user_id')[demographic].first().value_counts(normalize=True)
            for category, value in dist.items():
                f.write(f"  {category}: {value*100:.1f}%\n")

    f.write("\nVisualization files generated:\n")
    for demo in demographic_vars:
        if demo in sample_prism.columns:
            f.write(f"- {demo}_distribution.png\n")

    # Metric histograms are only produced when their source column exists, so
    # list them under the same condition instead of unconditionally.
    metric_sources = {
        'conversation_history': ['num_turns', 'convo_length_words', 'avg_turn_length'],
        'opening_prompt': ['opening_prompt_length', 'opening_prompt_words'],
    }
    for source_column, metrics in metric_sources.items():
        if source_column in sample_prism.columns:
            for metric in metrics:
                f.write(f"- {metric}_distribution.png\n")

    for demo1, demo2 in [('gender', 'age'), ('gender', 'education'), ('age', 'education')]:
        if demo1 in sample_prism.columns and demo2 in sample_prism.columns:
            f.write(f"- {demo1}_{demo2}_heatmap.png\n")

    # The pie chart and the word cloud are also written by this cell but were
    # missing from the list.
    if 'lm_familiarity' in sample_prism.columns:
        f.write("- lm_familiarity_pie.png\n")
    # The word cloud additionally depends on the optional wordcloud package,
    # so its presence on disk is checked.
    # NOTE(review): a file left over from an earlier run would also be listed.
    if 'opening_prompt' in sample_prism.columns and os.path.exists(
            os.path.join(output_folder, 'wordcloud_topics.png')):
        f.write("- wordcloud_topics.png\n")

print(f"\nAnalysis complete! All visualizations saved to '{output_folder}' folder")
print(f"Summary report saved to '{summary_path}'")

PRISM Sample Dataset Shape: (154, 41)
Number of unique conversations: 154
Number of unique participants: 137

Conversations per participant:
  Mean: 1.12
  Median: 1.00
  Min: 1
  Max: 3
⚠ Note: Not all participants have exactly 6 conversations.
Participants with != 6 conversations: 137

=== DEMOGRAPHIC DISTRIBUTIONS ===

GENDER DISTRIBUTION:
gender
Male                         0.496350
Female                       0.496350
Non-binary / third gender    0.007299
Name: proportion, dtype: float64

AGE DISTRIBUTION:
age
25-34 years old    0.277372
35-44 years old    0.189781
55-64 years old    0.175182
18-24 years old    0.160584
45-54 years old    0.124088
65+ years old      0.072993
Name: proportion, dtype: float64

EDUCATION DISTRIBUTION:
education
University Bachelors Degree       0.416058
Graduate / Professional degree    0.189781
Some University but no degree     0.160584
Completed Secondary School        0.153285
Vocational                        0.058394
Some Secondary             

In [20]:
import pandas as pd

# PRISM survey table, read as JSON Lines through the hf:// URL.
df = pd.read_json(
    path_or_buf="hf://datasets/HannahRoseKirk/prism-alignment/survey.jsonl",
    lines=True,
)
# Sampled conversations, from the same CSV file the analysis cell loads.
sample_df = pd.read_csv(filepath_or_buffer='important_data_v3_metrics.csv')

In [50]:
# Show the survey columns, then the sample columns, one Index per line.
print(df.columns, sample_df.columns, sep='\n')

Index(['user_id', 'survey_only', 'num_completed_conversations', 'consent',
       'consent_age', 'lm_familiarity', 'lm_indirect_use', 'lm_direct_use',
       'lm_frequency_use', 'self_description', 'system_string', 'age',
       'gender', 'employment_status', 'education', 'marital_status',
       'english_proficiency', 'study_id', 'study_locale', 'religion',
       'ethnicity', 'location', 'lm_usecases', 'stated_prefs',
       'order_lm_usecases', 'order_stated_prefs', 'generated_datetime',
       'timing_duration_s', 'timing_duration_mins', 'included_in_US_REP',
       'included_in_UK_REP', 'included_in_balanced_subset'],
      dtype='object')
Index(['conversation_id', 'user_id', 'opening_prompt', 'conversation_history',
       'lm_familiarity', 'lm_frequency_use', 'age', 'gender',
       'employment_status', 'education', 'marital_status',
       'english_proficiency', 'study_locale', 'religion', 'ethnicity',
       'location', 'human_lang', 'llm_lang', 'human_flesch_reading_ease',
  