In [22]:
import os
# Work from the sibling ../data directory so that the relative filename opened
# in the next cell ('raw_text.pkl') resolves there.
os.chdir('../data')

In [23]:
import pickle

# Load the pickled story collection from the current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
with open('raw_text.pkl', 'rb') as pickle_file:
    data = pickle.Unpickler(pickle_file).load()

In [24]:
# List the keys of the loaded dictionary (one identifier per story).
print(data.keys())

dict_keys(['sweetaspie', 'thatthingonmyarm', 'tildeath', 'indianapolis', 'lawsthatchokecreativity', 'golfclubbing', 'jugglingandjesus', 'shoppinginchina', 'cocoonoflove', 'hangtime', 'beneaththemushroomcloud', 'dialogue4', 'thepostmanalwayscalls', 'stumblinginthedark', 'kiksuya', 'haveyoumethimyet', 'theinterview', 'againstthewind', 'tetris', 'canplanetearthfeedtenbillionpeoplepart2', 'alternateithicatom', 'goldiethegoldfish', 'seedpotatoesofleningrad', 'onapproachtopluto', 'canplanetearthfeedtenbillionpeoplepart1', 'bluehope', 'superheroesjustforeachother', 'howtodraw', 'myfirstdaywiththeyankees', 'thumbsup', 'avatar', 'mayorofthefreaks', 'gangstersandcookies', 'breakingupintheageofgoogle', 'forgettingfear', 'waitingtogo', 'firetestforlove', 'goingthelibertyway', 'thefreedomridersandme', 'exorcism', 'itsabox', 'inamoment', 'afearstrippedbare', 'swimmingwithastronauts', 'ifthishaircouldtalk', 'whenmothersbullyback', 'vixenandtheussr', 'adollshouse', 'catfishingstrangerstofindmyself', '

In [25]:
import random
# Reproducible split: seed the global RNG, shuffle the story identifiers, then
# take the first 80 shuffled stories for training and the remainder for testing.
random.seed(42)
keys = list(data)
random.shuffle(keys)
train_keys, test_keys = keys[:80], keys[80:]
train_data = dict((key, data[key]) for key in train_keys)
test_data = dict((key, data[key]) for key in test_keys)

In [26]:
def _collect_story_texts(stories):
    """Join each story's `.data` items into one space-separated string.

    Parameters
    ----------
    stories : dict
        Maps a story name to an object exposing a `.data` iterable of strings.

    Returns
    -------
    (texts, names) : tuple of two lists
        Parallel lists holding the joined text and the name of every story
        whose text could be extracted, in the iteration order of `stories`.

    Extraction is best-effort: a story whose text cannot be built is reported
    on stdout and left out of both lists instead of aborting the run.
    """
    texts, names = [], []
    for name, sequence in stories.items():
        try:
            text = ' '.join(sequence.data)
        except Exception as e:
            print(f"Could not extract text for {name}: {e}")
        else:
            # Append to both lists together so they stay index-aligned.
            texts.append(text)
            names.append(name)
    return texts, names


# Same extraction for both splits; the name lists record which stories succeeded.
train_texts, train_story_names = _collect_story_texts(train_data)
test_texts, test_story_names = _collect_story_texts(test_data)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training texts only (one document per story):
# English stop words are removed and a term must occur in at least 5 training
# documents to be kept.
vectorizer = CountVectorizer(stop_words='english', min_df=5).fit(train_texts)
vocabulary = vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(vocabulary)}")

Vocabulary size: 1532


In [33]:
# Word-level bag-of-words for every training story, keyed by story name.
train_word_level_bow_vectors = {}
for story_name in train_data:
    try:
        # Every element of `.data` is transformed as its own document, so the
        # dense result has one row per element and one column per vocabulary term.
        story_words = train_data[story_name].data
        story_bow = vectorizer.transform(story_words).toarray()
        train_word_level_bow_vectors[story_name] = story_bow
        print(f"{story_name}: BoW shape = {story_bow.shape}")
    except Exception as err:
        # Best-effort: report the failing story and continue with the rest.
        print(f"Skipping {story_name} due to error: {err}")

penpal: BoW shape = (1592, 1532)
hangtime: BoW shape = (1927, 1532)
thatthingonmyarm: BoW shape = (2073, 1532)
theshower: BoW shape = (1383, 1532)
dialogue6: BoW shape = (1986, 1532)
haveyoumethimyet: BoW shape = (2985, 1532)
souls: BoW shape = (1868, 1532)
undertheinfluence: BoW shape = (1641, 1532)
cautioneating: BoW shape = (1587, 1532)
superheroesjustforeachother: BoW shape = (1440, 1532)
wheretheressmoke: BoW shape = (1839, 1532)
thetriangleshirtwaistconnection: BoW shape = (1448, 1532)
avatar: BoW shape = (1469, 1532)
gangstersandcookies: BoW shape = (1547, 1532)
adventuresinsayingyes: BoW shape = (2309, 1532)
fromboyhoodtofatherhood: BoW shape = (2755, 1532)
gpsformylostidentity: BoW shape = (1650, 1532)
becomingindian: BoW shape = (2619, 1532)
theclosetthatateeverything: BoW shape = (1928, 1532)
thesurprisingthingilearnedsailingsoloaroundtheworld: BoW shape = (2855, 1532)
dialogue2: BoW shape = (1835, 1532)
adollshouse: BoW shape = (1656, 1532)
escapingfromadirediagnosis: BoW s

In [34]:
# Word-level bag-of-words for every held-out story, using the vocabulary that
# was fitted on the training texts.
test_word_level_bow_vectors = {}
for story_name in test_data:
    try:
        # Every element of `.data` is transformed as its own document, so the
        # dense result has one row per element and one column per vocabulary term.
        story_words = test_data[story_name].data
        story_bow = vectorizer.transform(story_words).toarray()
        test_word_level_bow_vectors[story_name] = story_bow
        print(f"{story_name}: BoW shape = {story_bow.shape}")
    except Exception as err:
        # Best-effort: report the failing story and continue with the rest.
        print(f"Skipping {story_name} due to error: {err}")

lifereimagined: BoW shape = (1800, 1532)
odetostepfather: BoW shape = (2675, 1532)
wildwomenanddancingqueens: BoW shape = (1218, 1532)
bluehope: BoW shape = (1941, 1532)
lifeanddeathontheoregontrail: BoW shape = (2389, 1532)
findingmyownrescuer: BoW shape = (1498, 1532)
listo: BoW shape = (2371, 1532)
dialogue1: BoW shape = (934, 1532)
thumbsup: BoW shape = (3083, 1532)
howtodraw: BoW shape = (1964, 1532)
buck: BoW shape = (1677, 1532)
canplanetearthfeedtenbillionpeoplepart3: BoW shape = (2066, 1532)
lawsthatchokecreativity: BoW shape = (2084, 1532)
threemonths: BoW shape = (2062, 1532)
canadageeseandddp: BoW shape = (2559, 1532)
dialogue4: BoW shape = (1692, 1532)
leavingbaghdad: BoW shape = (1976, 1532)
backsideofthestorm: BoW shape = (1964, 1532)
whyimustspeakoutaboutclimatechange: BoW shape = (2336, 1532)
stumblinginthedark: BoW shape = (2681, 1532)
notontheusualtour: BoW shape = (1431, 1532)
againstthewind: BoW shape = (838, 1532)
myfirstdaywiththeyankees: BoW shape = (2786, 1532)

In [35]:
# `preprocessing` is imported right after switching into ../code, so the module
# is presumably located there (not visible from this notebook) — confirm.
# The chdir has to run before the import for that to work.
os.chdir('../code')
import preprocessing as prep

# Not referenced in the visible cells; presumably the fMRI repetition time in
# seconds — TODO confirm against preprocessing.py.
TR_DURATION = 2
# Delay list handed to prep.make_delayed when the per-story features are saved.
delays = [1, 2, 3, 4]

# Paths
# Absolute per-subject data directories on the shared filesystem; not used in
# the visible cells.
subject_dirs = [
    "/ocean/projects/mth240012p/shared/data/subject2",
    "/ocean/projects/mth240012p/shared/data/subject3"
]

In [36]:
# Downsample the word-level training matrices with the project helper.
# The result is consumed later as a dict of per-story 2-D arrays.
train_downsampled = prep.downsample_word_vectors(
    train_story_names,
    train_word_level_bow_vectors,
    train_data,
)

In [37]:
# Downsample the word-level held-out matrices with the project helper.
# The result is consumed later as a dict of per-story 2-D arrays.
test_downsampled = prep.downsample_word_vectors(
    test_story_names,
    test_word_level_bow_vectors,
    test_data,
)

In [38]:
# Drop the first 5 and the last 10 rows of every downsampled story matrix; all
# columns are kept.
# NOTE(review): presumably this mirrors the trimming applied to the recorded
# responses — confirm.
trim_rows = slice(5, -10)
train_trimmed = {story: matrix[trim_rows, :] for story, matrix in train_downsampled.items()}
test_trimmed = {story: matrix[trim_rows, :] for story, matrix in test_downsampled.items()}

In [39]:
import numpy as np

# Write one compressed .npz of delayed features per story, for both splits.
# Each archive holds a single positional array, so it is stored under numpy's
# default key 'arr_0'.
for split_label, split_names, split_trimmed in (
    ("train", train_story_names, train_trimmed),
    ("test", test_story_names, test_trimmed),
):
    split_dir = f"../data/bow/{split_label}"
    # np.savez_compressed does not create missing parent directories, so make
    # sure the output folder exists (no-op when it already does).
    os.makedirs(split_dir, exist_ok=True)
    for name in split_names:
        # Build the delayed feature matrix from the trimmed matrix and `delays`.
        delayed = prep.make_delayed(split_trimmed[name], delays)
        np.savez_compressed(f"{split_dir}/delayed_{name}.npz", delayed)

In [40]:
# Spot-check: reopen one of the archives written for the training split.
# NOTE(review): this only works when 'sweetaspie' ended up in the training
# split — confirm if the seed or the split size changes.
embedding = np.load(
    "../data/bow/train/delayed_sweetaspie.npz"
)

In [41]:
# 'arr_0' is the key numpy assigns to the first positional array stored in the
# archive. In the recorded run the 6128 columns equal 4 delays x 1532
# vocabulary terms.
embedding['arr_0'].shape

(157, 6128)