In [1]:
import os
import pandas as pd
from tqdm import tqdm

# Path to BERT annotations
bert_path = 'BERT_annotations/BERT_annotations/'

# List to store all parsed lines
all_data = []

# Loop through all files
for filename in tqdm(os.listdir(bert_path)):
    filepath = os.path.join(bert_path, filename)
    
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            
            # Parse each line based on structure
            # Example: "dialog: Stay away from me!"
            if ':' in line:
                parts = line.split(':', 1)
                label = parts[0].strip()
                text = parts[1].strip()
                
                all_data.append({
                    'movie': filename,
                    'type': label,  # dialog, text, speaker_heading, scene_heading
                    'content': text
                })

# Convert to DataFrame
df = pd.DataFrame(all_data)
print(f"Total lines loaded: {len(df)}")
print(df.head(10))
print(df['type'].value_counts())

100%|██████████| 3996/3996 [00:07<00:00, 507.66it/s]


Total lines loaded: 9560546
                       movie             type         content
0  The Mask_0110475_anno.txt           dialog                
1  The Mask_0110475_anno.txt           dialog                
2  The Mask_0110475_anno.txt           dialog                
3  The Mask_0110475_anno.txt  speaker_heading        THE MASK
4  The Mask_0110475_anno.txt           dialog                
5  The Mask_0110475_anno.txt           dialog                
6  The Mask_0110475_anno.txt           dialog                
7  The Mask_0110475_anno.txt           dialog      Written by
8  The Mask_0110475_anno.txt           dialog                
9  The Mask_0110475_anno.txt  speaker_heading  Mark Verheiden
type
dialog             4609108
text               2666910
speaker_heading    1867723
scene_heading       416805
Name: count, dtype: int64


In [2]:
# Install transformers
!pip install transformers torch



In [3]:
from transformers import pipeline

# Load a pretrained emotion classifier
emotion_classifier = pipeline(
    "text-classification", 
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=False
)

# Emotions available:
# anger, disgust, fear, joy, sadness, surprise, neutral

# Test it on a single line
text = "Stay away from me!"
result = emotion_classifier(text)
print(result)  # Returns emotion label and confidence score

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


[{'label': 'anger', 'score': 0.6600427031517029}]


In [5]:
from utils import preprocess_screenplay_data

# Usage:
df_consolidated = preprocess_screenplay_data(df)
print(f"Total sentences: {len(df_consolidated)}")
print(df_consolidated.head(20))
df_consolidated.to_csv('sentences_final.csv', index=False)

Total sentences: 4792210
                                   movie  scene_id  \
0   10 Cloverfield Lane_1179933_anno.txt    389666   
1   10 Cloverfield Lane_1179933_anno.txt    389667   
2   10 Cloverfield Lane_1179933_anno.txt    389668   
3   10 Cloverfield Lane_1179933_anno.txt    389670   
4   10 Cloverfield Lane_1179933_anno.txt    389671   
5   10 Cloverfield Lane_1179933_anno.txt    389671   
6   10 Cloverfield Lane_1179933_anno.txt    389671   
7   10 Cloverfield Lane_1179933_anno.txt    389671   
8   10 Cloverfield Lane_1179933_anno.txt    389671   
9   10 Cloverfield Lane_1179933_anno.txt    389671   
10  10 Cloverfield Lane_1179933_anno.txt    389671   
11  10 Cloverfield Lane_1179933_anno.txt    389671   
12  10 Cloverfield Lane_1179933_anno.txt    389672   
13  10 Cloverfield Lane_1179933_anno.txt    389672   
14  10 Cloverfield Lane_1179933_anno.txt    389672   
15  10 Cloverfield Lane_1179933_anno.txt    389672   
16  10 Cloverfield Lane_1179933_anno.txt    389672   
17 

In [None]:
"""
No progress bar
"""


""" from transformers import pipeline
from time import time

num_test_batches = 7992
batch_size = 600
test_size = num_test_batches * batch_size

texts = df_consolidated['sentence_text'].tolist()[:test_size]  # Fixed: use sentence_text

print(f"Total sentences in corpus: {len(df_consolidated)}")
print(f"Sentences to label: {len(texts)}")

start_time = time()
results = emotion_classifier(
    texts,
    batch_size=batch_size,
    truncation=True,
    max_length=512
)
end_time = time()

print(f"Completed in {(end_time - start_time)/60:.2f} minutes")

df_test = df_consolidated.iloc[:test_size].copy()
df_test['emotion'] = [r['label'] for r in results]
df_test['confidence'] = [r['score'] for r in results]

df_test.to_csv('sentences_labeled_TEST.csv', index=False)
print(df_test['emotion'].value_counts()) """

In [None]:
"""
progress bar
"""

from transformers import pipeline
from tqdm import tqdm
from time import time

num_test_batches = 7992
batch_size = 600
test_size = num_test_batches * batch_size

texts = df_consolidated['sentence_text'].tolist()[:test_size]

print(f"Total sentences in corpus: {len(df_consolidated)}")
print(f"Sentences to label: {len(texts)}")

start_time = time()

results = []
pbar = tqdm(total=len(texts), desc="Labeling")

for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]

    out = emotion_classifier(
        batch,
        batch_size=batch_size,
        truncation=True,
        max_length=512
    )

    results.extend(out)
    pbar.update(len(batch))

pbar.close()

end_time = time()
print(f"Completed in {(end_time - start_time)/60:.2f} minutes")

df_test = df_consolidated.iloc[:test_size].copy()
df_test['emotion'] = [r['label'] for r in results]
df_test['confidence'] = [r['score'] for r in results]

df_test.to_csv('sentences_labeled_TEST_SEFSEF.csv', index=False)
print(df_test['emotion'].value_counts())


Total sentences in corpus: 4792210
Sentences to label: 4792210


Labeling:   0%|          | 5400/4792210 [00:03<43:16, 1843.87it/s]  You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Labeling: 100%|██████████| 4792210/4792210 [52:55<00:00, 1508.94it/s]  


Completed in 52.93 minutes
emotion
neutral     2769474
disgust      408864
anger        396386
surprise     362490
fear         321574
sadness      286987
joy          246435
Name: count, dtype: int64


In [None]:
""" # Look at the batch where it got stuck
stuck_position = len(results)
problem_batch = df_consolidated.iloc[stuck_position:stuck_position+600]

print(f"\nBatch that caused hang (starting at {stuck_position}):")
print(problem_batch[['movie', 'sentence_text']].head(20))

# Check for problematic sentences
print("\nSentence lengths in problem batch:")
print(problem_batch['sentence_text'].str.len().describe())

# Look for weird characters
print("\nFirst few sentences:")
for idx, sent in problem_batch['sentence_text'].head(5).items():
    print(f"{idx}: {repr(sent)[:200]}") """