In [1]:
# ============================================
# LETA TRANSFORMER VALIDATION - GOOGLE COLAB
# ============================================

# STEP 1: Install requirements
# The leading `!` is IPython/Colab shell syntax (not plain Python): the line is
# run as a shell command inside the notebook runtime. `-q` keeps pip quiet.
!pip install transformers torch pandas scipy tqdm -q

# STEP 2: Upload your files
# Each files.upload() call opens Colab's upload widget and blocks until the
# user has picked a file. The loading step later in this cell reads the CSVs by
# the exact file names printed in the prompts, so the uploaded files must carry
# those names.
from google.colab import files
print("Upload 'authors40_full_records.csv':")
# NOTE: the return value is overwritten by the second upload and is not read
# again anywhere in this cell; only the files written to disk are used.
uploaded = files.upload()
print("\nUpload 'pillar4_post_emotions.csv':")
uploaded = files.upload()

# STEP 3: Run validation
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
from transformers import pipeline
import torch

# Report whether a CUDA device is visible to torch, and which one.
gpu_present = torch.cuda.is_available()
gpu_label = torch.cuda.get_device_name(0) if gpu_present else 'None'
print(f"\nGPU Available: {gpu_present}")
print(f"GPU Name: {gpu_label}")

# Load data
# Both CSVs are read from the current working directory: the raw post texts
# first, then the LETA emotion scores they will be compared against.
print("\nLoading data...")
texts_df, leta_df = (
    pd.read_csv(csv_name)
    for csv_name in ("authors40_full_records.csv", "pillar4_post_emotions.csv")
)
print(f"Loaded {len(texts_df)} texts and {len(leta_df)} LETA scores")

# Load transformer model (on the GPU when one is available, otherwise on CPU)
print("\nLoading transformer model...")
# The pipeline's `device` argument takes a CUDA ordinal (0 = first GPU) or -1
# for CPU. Hard-coding 0 makes construction fail on a CPU-only runtime, so the
# device is chosen from what torch actually reports.
device_index = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,  # return a score for every label, not only the best one
    device=device_index,
)
if device_index == 0:
    print("Model loaded on GPU!")
else:
    print("Model loaded on CPU (no GPU available; processing will be slower)")

# Process texts
def get_emotions(text, max_length=500):
    """Score a single text with the module-level ``classifier`` pipeline.

    Args:
        text: The text to classify. It is coerced with ``str()``; missing
            values (``None``/NaN) and whitespace-only strings are skipped.
        max_length: Maximum number of *characters* of ``text`` passed to the
            classifier; anything beyond that is cut off.

    Returns:
        A dict mapping each label returned by the classifier to its score, or
        ``None`` when the text is missing/blank or classification fails.
    """
    try:
        if pd.isna(text) or str(text).strip() == '':
            return None
        text = str(text)[:max_length]
        # The character cap above does not bound the number of tokens, so the
        # tokenizer is also asked to truncate to the model's maximum input
        # length instead of letting an over-long sequence raise.
        result = classifier(text, truncation=True)[0]
        return {item['label']: item['score'] for item in result}
    except Exception:
        # Best-effort: a post that cannot be scored is skipped by the caller.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return None

n_texts = len(texts_df)
print(f"\nProcessing {n_texts} texts (estimated time: 20-40 minutes)...")

# Score every post in turn. Posts for which get_emotions() yields nothing are
# left out; the rest are tagged with their row label from texts_df under
# 'post_idx' so they can be joined to the LETA scores afterwards.
transformer_results = []
progress = tqdm(texts_df['text'].items(), total=n_texts)
for post_idx, post_text in progress:
    scores = get_emotions(post_text)
    if not scores:
        continue
    transformer_results.append({**scores, 'post_idx': post_idx})

transformer_df = pd.DataFrame(transformer_results)
print(f"\nProcessed {len(transformer_df)} posts successfully")

# Compute correlations
print("\nComputing correlations...")

# (transformer column, LETA column) pairs that are compared one-to-one.
emotion_pairs = [
    ('anger', 'score_anger'),
    ('disgust', 'score_disgust'),
    ('fear', 'score_fear'),
    ('joy', 'score_joy'),
    ('sadness', 'score_sadness'),
    ('surprise', 'score_surprise'),
]

# Inner join: only posts present in both tables take part in the comparison.
merged = transformer_df.merge(leta_df, on='post_idx', how='inner')
print(f"Matched {len(merged)} posts")

correlations = []
for model_col, lexicon_col in emotion_pairs:
    # Skip a pair when either side's column is absent from the joined table.
    if model_col not in merged.columns or lexicon_col not in merged.columns:
        continue
    # Keep only rows where both scores are present.
    paired = merged[[model_col, lexicon_col]].dropna()
    n_pairs = len(paired)
    # A correlation is only reported when more than 100 complete pairs exist.
    if n_pairs <= 100:
        continue
    r, p = stats.pearsonr(paired[model_col], paired[lexicon_col])
    correlations.append({
        'emotion': model_col,
        'pearson_r': round(r, 3),
        'p_value': f"{p:.2e}",
        'n': n_pairs,
    })

corr_df = pd.DataFrame(correlations)

# Display results
print("\n" + "="*60)
print("LETA vs TRANSFORMER CORRELATION RESULTS")
print("="*60)
if corr_df.empty:
    # With no correlation rows, corr_df has no 'pearson_r' column and the
    # summary lines below would raise KeyError; report the situation instead.
    print("No correlations were computed; nothing to summarize.")
    print("="*60)
else:
    print(corr_df.to_string(index=False))
    print("="*60)
    # The summary is taken over the per-emotion r values as stored in corr_df
    # (i.e. already rounded to three decimals).
    print(f"\nMean correlation: r = {corr_df['pearson_r'].mean():.3f}")
    print(f"Range: {corr_df['pearson_r'].min():.3f} to {corr_df['pearson_r'].max():.3f}")

# Save and download results
# Output file name -> table written to it. Dict order is preserved, so files
# are written, and then downloaded, in the order listed here.
outputs = {
    "transformer_emotions.csv": transformer_df,
    "leta_transformer_correlations.csv": corr_df,
}
for out_name, frame in outputs.items():
    frame.to_csv(out_name, index=False)

print("\nDownloading results...")
# Hand each saved file to the browser through Colab's download helper.
for out_name in outputs:
    files.download(out_name)

print("\nDONE!")

Upload 'authors40_full_records.csv':


Saving authors40_full_records.csv to authors40_full_records.csv

Upload 'pillar4_post_emotions.csv':


Saving pillar4_post_emotions.csv to pillar4_post_emotions.csv

GPU Available: True
GPU Name: Tesla T4

Loading data...
Loaded 18604 texts and 18604 LETA scores

Loading transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]



Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: j-hartmann/emotion-english-distilroberta-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Model loaded on GPU!

Processing 18604 texts (estimated time: 20-40 minutes)...



  0%|          | 0/18604 [00:00<?, ?it/s][A
  0%|          | 1/18604 [00:00<3:30:05,  1.48it/s][A
  0%|          | 8/18604 [00:00<23:28, 13.20it/s]  [AYou seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

  0%|          | 18/18604 [00:00<10:20, 29.94it/s][A
  0%|          | 29/18604 [00:00<06:32, 47.32it/s][A
  0%|          | 41/18604 [00:01<04:48, 64.39it/s][A
  0%|          | 54/18604 [00:01<03:51, 80.08it/s][A
  0%|          | 66/18604 [00:01<03:25, 90.13it/s][A
  0%|          | 79/18604 [00:01<03:06, 99.58it/s][A
  0%|          | 93/18604 [00:01<02:49, 109.32it/s][A
  1%|          | 106/18604 [00:01<02:52, 107.27it/s][A
  1%|          | 118/18604 [00:01<02:47, 110.16it/s][A
  1%|          | 130/18604 [00:01<02:44, 112.52it/s][A
  1%|          | 142/18604 [00:01<02:44, 111.97it/s][A
  1%|          | 160/18604 [00:02<02:21, 130.21it/s][A
  1%|          | 179/18604 [00:02<02:05, 146.47it/s][A
  1%|          | 197


Processed 18545 posts successfully

Computing correlations...
Matched 18545 posts

LETA vs TRANSFORMER CORRELATION RESULTS
 emotion  pearson_r   p_value     n
   anger      0.127  1.16e-67 18545
 disgust      0.081  4.07e-28 18545
    fear      0.120  4.83e-60 18545
     joy      0.276 1.38e-322 18545
 sadness      0.185 2.99e-142 18545
surprise     -0.054  2.00e-13 18545

Mean correlation: r = 0.123
Range: -0.054 to 0.276

Downloading results...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


DONE!
