In [1]:
import os
import pickle

# Switch the working directory to the data folder; every relative path used
# after this point is resolved against it.
os.chdir('../data')

# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load raw_text.pkl when it comes from a trusted source.
with open('raw_text.pkl', mode='rb') as pkl_handle:
    data = pickle.load(pkl_handle)

  data = pickle.load(file)


In [2]:
import random
# Seed the global RNG so the shuffled key order (and therefore the split)
# is the same on every run.
random.seed(42)
keys = [*data.keys()]
random.shuffle(keys)

# The first 80 shuffled keys form the training set; whatever is left is the
# held-out test set.
train_keys, test_keys = keys[:80], keys[80:]
train_data = dict((story, data[story]) for story in train_keys)
test_data = dict((story, data[story]) for story in test_keys)

In [3]:
def _collect_story_texts(stories):
    """Join each story's `.data` sequence into one space-separated string.

    Parameters
    ----------
    stories : dict
        Maps a story name to an object whose ``.data`` attribute is an
        iterable of strings.

    Returns
    -------
    tuple[list[str], list]
        ``(texts, names)``, aligned index-by-index. A story whose text cannot
        be built is reported on stdout and left out of both lists.
    """
    texts, names = [], []
    for name, sequence in stories.items():
        try:
            text = ' '.join(sequence.data)
        except Exception as e:
            # Deliberate best-effort: report the story and carry on with the rest.
            print(f"Could not extract text for {name}: {e}")
            continue
        texts.append(text)
        names.append(name)
    return texts, names


# Same extraction for both splits; one helper replaces two copy-pasted loops.
train_texts, train_story_names = _collect_story_texts(train_data)
test_texts, test_story_names = _collect_story_texts(test_data)

In [4]:
import numpy as np
from tqdm import tqdm
import requests
import zipfile

# Local GloVe location (relative, so it is resolved against the current
# working directory) and the remote archive it is fetched from.
glove_dir = "../data/glove"
glove_zip, glove_txt = (
    os.path.join(glove_dir, filename)
    for filename in ("glove.6B.zip", "glove.6B.100d.txt")
)
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"

In [5]:
# Download GloVe if the extracted vectors are not already present.
if not os.path.exists(glove_txt):
    # open(..., "wb") does not create missing parent directories.
    os.makedirs(glove_dir, exist_ok=True)

    print("Downloading GloVe to your directory...")
    # Stream the archive to disk rather than holding it in memory; the context
    # manager releases the connection even if writing fails. The timeout
    # bounds connect/read stalls, not the total transfer time.
    with requests.get(glove_url, stream=True, timeout=60) as r:
        # Fail fast on an HTTP error instead of saving an error page as the .zip.
        r.raise_for_status()
        with open(glove_zip, "wb") as f:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB chunks.
            for chunk in r.iter_content(chunk_size=1 << 20):
                if chunk:
                    f.write(chunk)

    print("Unzipping GloVe...")
    with zipfile.ZipFile(glove_zip, "r") as zip_ref:
        # Only the 100-d file is extracted; it lands at `glove_txt`.
        zip_ref.extract("glove.6B.100d.txt", path=glove_dir)

    print("GloVe download complete.")

Downloading GloVe to your lab3 directory...
Unzipping GloVe...
GloVe download complete.


In [6]:
# Load GloVe vectors into a dict mapping token -> float32 vector.
# Each line is parsed as "<token> <v1> <v2> ... <vN>" (whitespace separated).
glove = {}
with open(glove_txt, 'r', encoding='utf8') as f:
    for line in f:
        parts = line.split()
        if not parts:
            # Tolerate blank lines instead of raising IndexError on parts[0].
            continue
        word = parts[0]
        vec = np.array(parts[1:], dtype=np.float32)
        glove[word] = vec

# Read the dimension from the stored vectors rather than the leaked loop
# variable `vec`, which is undefined when the file yields no vectors.
glove_dim = next(iter(glove.values())).shape[0] if glove else 0
print(f"Loaded GloVe with {len(glove)} words, dim = {glove_dim}")

Loaded GloVe with 400000 words, dim = 100


In [10]:
# Move into the code directory before importing the local module --
# presumably so that `preprocessing` is importable by bare name; confirm.
os.chdir('../code')
import preprocessing as prep

# NOTE(review): units/meaning of TR_DURATION are not shown here -- confirm.
TR_DURATION = 2
# Delay offsets handed to prep.make_delayed further down.
delays = list(range(1, 5))

# Paths
_SUBJECT_ROOT = "/ocean/projects/mth240012p/shared/data"
subject_dirs = [f"{_SUBJECT_ROOT}/subject{idx}" for idx in (2, 3)]

In [34]:
# Build delayed GloVe features for every training story:
# per-entry embedding -> downsample -> trim -> add delays.
train_glove_dic = {}

# Width of the zero fallback vector, taken from the loaded embeddings instead
# of a hard-coded 100 (100 is kept only as a fallback for an empty table).
embed_dim = len(next(iter(glove.values()))) if glove else 100

for story, ds in train_data.items():
    try:
        if not all(hasattr(ds, attr) for attr in ("data", "data_times", "tr_times")):
            print(f"⚠️ Skipping {story}: missing attributes")
            continue

        # Build GloVe matrix: one row per entry of ds.data
        glove_matrix = []
        for entry in ds.data:
            # ds.data entries are strings here (an earlier cell joins them with
            # ' '.join). Iterating a string walks its characters, which would
            # average single-letter vectors, so split it into word tokens.
            tokens = entry.split() if isinstance(entry, str) else entry
            vecs = [glove[word] for word in tokens if word in glove]
            if vecs:
                glove_matrix.append(np.mean(vecs, axis=0))
            else:
                # No in-vocabulary token: fall back to a zero vector.
                glove_matrix.append(np.zeros(embed_dim))

        word_matrix = np.stack(glove_matrix)  # (num_entries, embed_dim)

        # Downsample
        downsampled = prep.downsample_word_vectors(
            stories=[story],
            word_vectors={story: word_matrix},
            wordseqs={story: ds}
        )[story]

        # Trim: drop the first 5 and last 10 rows
        # NOTE(review): presumably matches the trimming of the response data -- confirm.
        trimmed = downsampled[5 : -10, :]

        delayed = prep.make_delayed(trimmed, delays)
        train_glove_dic[story] = delayed
    except Exception as e:
        # Best-effort per story: report the failure and continue with the rest.
        print(f"ERROR processing {story}: {e}")

In [39]:
# Build delayed GloVe features for every test story:
# per-entry embedding -> downsample -> trim -> add delays.
test_glove_dic = {}

# Width of the zero fallback vector, taken from the loaded embeddings instead
# of a hard-coded 100 (100 is kept only as a fallback for an empty table).
embed_dim = len(next(iter(glove.values()))) if glove else 100

for story, ds in test_data.items():
    try:
        if not all(hasattr(ds, attr) for attr in ("data", "data_times", "tr_times")):
            print(f"⚠️ Skipping {story}: missing attributes")
            continue

        # Build GloVe matrix: one row per entry of ds.data
        glove_matrix = []
        for entry in ds.data:
            # ds.data entries are strings here (an earlier cell joins them with
            # ' '.join). Iterating a string walks its characters, which would
            # average single-letter vectors, so split it into word tokens.
            tokens = entry.split() if isinstance(entry, str) else entry
            vecs = [glove[word] for word in tokens if word in glove]
            if vecs:
                glove_matrix.append(np.mean(vecs, axis=0))
            else:
                # No in-vocabulary token: fall back to a zero vector.
                glove_matrix.append(np.zeros(embed_dim))

        word_matrix = np.stack(glove_matrix)  # (num_entries, embed_dim)

        # Downsample
        downsampled = prep.downsample_word_vectors(
            stories=[story],
            word_vectors={story: word_matrix},
            wordseqs={story: ds}
        )[story]

        # Trim: drop the first 5 and last 10 rows
        # NOTE(review): presumably matches the trimming of the response data -- confirm.
        trimmed = downsampled[5 : -10, :]

        delayed = prep.make_delayed(trimmed, delays)
        test_glove_dic[story] = delayed
    except Exception as e:
        # Best-effort per story: report the failure and continue with the rest.
        print(f"ERROR processing {story}: {e}")

In [41]:
# Spot check: display the `.shape` of the features stored for the training
# story 'sweetaspie' (fails with KeyError if that story is not in the training split).
train_glove_dic['sweetaspie'].shape

(157, 400)