In [5]:
import os
import pickle

# Switch the process-wide working directory to the sibling data folder.
# Later cells build further relative paths on top of this (e.g. the
# os.chdir('../code') call), so this side effect is load-bearing.
os.chdir('../data')
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load raw_text.pkl if it comes from a trusted source.
with open('raw_text.pkl', 'rb') as file:
    # `data` is used below as a mapping from story name to an object that
    # exposes its word sequence through a `.data` attribute.
    data = pickle.load(file)

In [6]:
import random
# Reproducible story-level split: seed the global RNG, shuffle the story
# names, then send the first 80 shuffled names to train and the rest to test.
random.seed(42)
keys = [*data.keys()]
random.shuffle(keys)
train_keys, test_keys = keys[:80], keys[80:]

# Rebuild per-split mappings that follow the shuffled key order.
train_data = dict((key, data[key]) for key in train_keys)
test_data = dict((key, data[key]) for key in test_keys)

In [7]:
def _collect_story_texts(stories):
    """Join each story's word sequence into a single space-separated string.

    Parameters
    ----------
    stories : mapping
        Story name -> object whose ``.data`` attribute is an iterable of
        word strings.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(texts, names)`` as parallel lists in the iteration order of
        ``stories``. Stories whose text cannot be extracted are reported
        with a printed message and left out of both lists, so the two lists
        always stay aligned.
    """
    texts, names = [], []
    for name, sequence in stories.items():
        try:
            text = ' '.join(sequence.data)
        except Exception as e:
            # Best effort: report the failing story and keep going.
            print(f"Could not extract text for {name}: {e}")
            continue
        texts.append(text)
        names.append(name)
    return texts, names


# The same extraction is applied to both splits through one helper, so the
# train and test paths cannot drift apart.
train_texts, train_story_names = _collect_story_texts(train_data)
test_texts, test_story_names = _collect_story_texts(test_data)

In [None]:
# Load the pretrained word-embedding model used by the later cells to embed
# every word of each story.
import numpy as np
import gensim.downloader as api
from tqdm import tqdm

# Load the pretrained Google News Word2Vec vectors through gensim's downloader.
# NOTE(review): the output recorded under this cell reports 400000 words and
# vector size 100, which does not match a model named "...-300", and the cell
# has no execution count. It was presumably edited after its last run;
# re-run it and confirm which model the downstream features were built from.
print("⏳ Loading pretrained Word2Vec from gensim...")
w2v = api.load("word2vec-google-news-300")  # 300-dimensional, going by the model name
print(f"Loaded Word2Vec with {len(w2v.key_to_index)} words, vector size = {w2v.vector_size}")

⏳ Loading pretrained Word2Vec from gensim...
Loaded Word2Vec with 400000 words, vector size = 100


In [12]:
# Change into the sibling code directory before importing `preprocessing`
# (presumably so the local module can be found on the import path).
# The path is relative to the current working directory, which an earlier
# cell set with os.chdir('../data'), so the statement order matters.
os.chdir('../code')
import preprocessing as prep

# NOTE(review): presumably the fMRI repetition time in seconds; it is not
# referenced by any cell shown here. Confirm the unit and whether it is used.
TR_DURATION = 2
# Delays handed to prep.make_delayed when building the delayed features
# (presumably expressed in TRs; confirm against preprocessing.make_delayed).
delays = [1, 2, 3, 4]

# Paths
# Absolute paths to the per-subject data directories (subject2, subject3).
subject_dirs = [
    "/ocean/projects/mth240012p/shared/data/subject2",
    "/ocean/projects/mth240012p/shared/data/subject3"
]

In [14]:
# Build the delayed Word2Vec feature matrix for every training story.
train_word2vec_dic = {}

# Single zero vector reused for every out-of-vocabulary (OOV) word. np.stack
# copies its inputs into a new array, so sharing one instance is safe and
# avoids allocating a fresh array per OOV word.
# NOTE(review): np.zeros defaults to float64; if the model's vectors use a
# narrower dtype, any story containing an OOV word is upcast when stacked.
# Confirm that this is acceptable.
oov_vector = np.zeros(w2v.vector_size)

for name, ds in train_data.items():
    try:
        # ds.data may be a NumPy array or a plain sequence of words
        word_list = ds.data.tolist() if isinstance(ds.data, np.ndarray) else ds.data

        # One embedding row per word, in story order; OOV words get zeros
        word_matrix = np.stack(
            [w2v[word] if word in w2v else oov_vector for word in word_list]
        )  # shape: (num_words, embedding_dim)

        # Downsample to TR resolution
        downsampled = prep.downsample_word_vectors(
            stories=[name],
            word_vectors={name: word_matrix},
            wordseqs={name: ds}
        )[name]

        # Drop the first 5 and the last 10 rows. After downsampling the rows
        # are presumably TRs, so these counts are in TRs, not seconds.
        trimmed = downsampled[5 : -10, :]

        # Create delayed embedding
        delayed = prep.make_delayed(trimmed, delays)
        train_word2vec_dic[name] = delayed
    except Exception as e:
        # Best effort: report the failing story and continue with the rest
        print(f"ERROR processing {name}: {e}")

In [18]:
# Build the delayed Word2Vec feature matrix for every held-out test story.
test_word2vec_dic = {}

# Single zero vector reused for every out-of-vocabulary (OOV) word. np.stack
# copies its inputs into a new array, so sharing one instance is safe and
# avoids allocating a fresh array per OOV word.
# NOTE(review): np.zeros defaults to float64; if the model's vectors use a
# narrower dtype, any story containing an OOV word is upcast when stacked.
# Confirm that this is acceptable.
oov_vector = np.zeros(w2v.vector_size)

for name, ds in test_data.items():
    try:
        # ds.data may be a NumPy array or a plain sequence of words
        word_list = ds.data.tolist() if isinstance(ds.data, np.ndarray) else ds.data

        # One embedding row per word, in story order; OOV words get zeros
        word_matrix = np.stack(
            [w2v[word] if word in w2v else oov_vector for word in word_list]
        )  # shape: (num_words, embedding_dim)

        # Downsample to TR resolution
        downsampled = prep.downsample_word_vectors(
            stories=[name],
            word_vectors={name: word_matrix},
            wordseqs={name: ds}
        )[name]

        # Drop the first 5 and the last 10 rows. After downsampling the rows
        # are presumably TRs, so these counts are in TRs, not seconds.
        trimmed = downsampled[5 : -10, :]

        # Create delayed embedding
        delayed = prep.make_delayed(trimmed, delays)
        test_word2vec_dic[name] = delayed
    except Exception as e:
        # Best effort: report the failing story and continue with the rest
        print(f"ERROR processing {name}: {e}")

In [17]:
# Sanity check: shape of the delayed feature matrix for one training story.
# NOTE(review): this raises KeyError if 'sweetaspie' is not a key of
# train_word2vec_dic (e.g. it landed in the test split or failed processing).
# NOTE(review): with four delays, the recorded 400 columns would mean 100
# features per delay if make_delayed concatenates one copy per delay. That
# does not fit a 300-dimensional model, so confirm the output is not stale.
train_word2vec_dic['sweetaspie'].shape

(157, 400)