In [1]:
import sys
import os
import random
import pickle
import numpy as np
from preprocessing import make_delayed
from preprocessing import downsample_word_vectors
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Add the root project folder to sys.path (so ridge_utils becomes importable)
project_root = os.path.abspath('..')  # moves up from 'code/'
sys.path.append(project_root)

In [2]:
# Load the raw_text.pkl file
path_to_data = '/ocean/projects/mth240012p/shared/data'
with open(f'{path_to_data}/raw_text.pkl', 'rb') as f:
    raw_text = pickle.load(f)

print(type(raw_text))
print(len(raw_text)) # total 109 stories


<class 'dict'>
109


Before splitting the stories into train and test, since there are 8 more stories in the raw_text file than in the stories in subject 2 and 3, we exclude these stories since they cannot be used for both test or train. 

In [3]:
all_stories = set(raw_text.keys())
subj2_stories = set(os.path.splitext(f)[0] for f in os.listdir(f'{path_to_data}/subject2') if f.endswith('.npy'))
subj3_stories = set(os.path.splitext(f)[0] for f in os.listdir(f'{path_to_data}/subject3') if f.endswith('.npy'))
print(subj2_stories == subj3_stories) #fortunately, subject 2 and 3 has same stories
valid_stories = sorted(list(all_stories & subj2_stories & subj3_stories))
print(len(valid_stories)) 


True
101


Now we split test stories, train stories (3:7)

In [4]:
#we will just rename valid_stories as all_stories for the sake of simplicity.
all_stories = valid_stories
random.seed(42)
random.shuffle(all_stories)
split_idx = int(0.7 * len(all_stories))  # 70% for training

train_stories = all_stories[:split_idx]
test_stories = all_stories[split_idx:]

print(f"Train stories: {len(train_stories)}")
print(f"Test stories: {len(test_stories)}")
train_stories.sort()
test_stories.sort()

Train stories: 70
Test stories: 31


**Bag of Words Method**

1. For each story in train, we compute a matrix of one-hot vectors for each word in story.

Below, we first get a list of all the unique words in all the stories in train_stories. 

In [5]:
#compute and append all words in train_stories
allwords = []
for story in train_stories:
    temp_text = raw_text[story].data
    allwords += temp_text

print(len(allwords))

#compute unique words in train_stories
unique_words = list(set(allwords))
len(unique_words)


137105


10360

In [6]:
#compute and append all words in train_stories
allwords = []
for story in train_stories:
    temp_text = raw_text[story].data
    allwords += temp_text

print(len(allwords))

#compute unique words in train_stories
unique_words = list(set(allwords))
len(unique_words)


137105


10360

In [7]:
#make one-hot matrix for each story in training stories

word_to_index = {word: idx for idx, word in enumerate(unique_words)} #match each word to index
vocab_size = len(unique_words)

onehot_matrices = {}  # Storing one-hot matrix per story

for story in train_stories:
    words = raw_text[story].data
    num_words = len(words)
    
    # Initialize zero matrix
    onehot = np.zeros((num_words, vocab_size), dtype=np.int8)
    
    # Fill in the one-hot vectors
    for i, word in enumerate(words):
        if word in word_to_index:
            j = word_to_index[word]
            onehot[i, j] = 1
    
    onehot_matrices[story] = onehot #store matrix as value and story name as key

len(onehot_matrices) #70 keys == 70 train stories

70

In [8]:
#check if one-hot matrices are calculated well for first training story
for key, value in onehot_matrices.items():
    print("First key:", key)
    print(type(value))
    print("original shape of matrix for first story is:", value.shape) #row:number of total words in story
    #column: number of unique words in all training stories
    break


print("total length of words in first story is:", len(raw_text['itsabox'].data)) 
#number of rows of matrix matches the total length of words in first story

First key: adventuresinsayingyes
<class 'numpy.ndarray'>
original shape of matrix for first story is: (2309, 10360)
total length of words in first story is: 1708


2. Downsampling word vectors (one-hot matrices) to number of FMRIs taken for each story
Now we call downsample word vectors from the preprocessing.py to get the dimensions to match. 

In [9]:
#made dictionary whose keys are name of train story and values are raw_text[story]
wordseqs = {story: raw_text[story] for story in train_stories}

downsampled_vectors_1 = downsample_word_vectors(
    stories=train_stories,
    word_vectors=onehot_matrices,
    wordseqs=wordseqs
)


In [10]:
print(len(downsampled_vectors_1)) #matches the total number of training stories

#check the downsampled matrix of first two stories, 'adollshouse' and 'adventuresinsayingyes'
i=0
for key, value in downsampled_vectors_1.items():
    print("First key:", key)
    print("downsampled matrix for first story is", value.shape)
    i += 1
    if i == 2: break
#we can see that row of the matrix shrunk from 1708 to 370

70
First key: adventuresinsayingyes
downsampled matrix for first story is (406, 10360)
First key: afatherscover
downsampled matrix for first story is (327, 10360)


In [11]:
trimmed_vectors_1 = {} #trimmed matrices for each train story 

for story, matrix in downsampled_vectors_1.items():
    trimmed_matrix = matrix[5:-10]
    trimmed_vectors_1[story] = trimmed_matrix

# check for first trimmed matrix, which corresponds to story 
for key, value in trimmed_vectors_1.items():
    print("First key:", key)
    print("trimmed matrix for first story is", value.shape)
    break


First key: adventuresinsayingyes
trimmed matrix for first story is (391, 10360)


In [12]:
#We can see that the number of rows of trimmed matrices matches
# the number of rows in the FMRI scan for 'adollshouse' story
itsabox_s2 = np.load(f'{path_to_data}/subject2/adollshouse.npy')

print("Type:", type(itsabox_s2))
print("Shape:", itsabox_s2.shape)

Type: <class 'numpy.ndarray'>
Shape: (241, 94251)


3. Create lagged versions of the features using make_delayed with delays ranging form [1, 4] inclusive

Because the BOLD response lags behind the stimulus. If a word is spoken at time t, the associated brain response might show up 4–6 seconds later (2–3 TRs later).
So we remake each row vector for each TR so that the row vector for that particular TR
includes the information of the previous 4 TRs. 

Hence, for each matrix, the number of columns will increase by 4 folds. 

In [13]:
delayed_vectors_1 = {}

for story, trimmed_matrix in trimmed_vectors_1.items():
    X_lagged = make_delayed(trimmed_matrix, delays=[1, 2, 3, 4])
    delayed_vectors_1[story] = X_lagged

# check for first trimmed matrix, which corresponds to story 'penpal'
for key, value in delayed_vectors_1.items():
    print("First key:", key)
    print("delayed matrix for first story is", value.shape)
    break

First key: adventuresinsayingyes
delayed matrix for first story is (391, 41440)


Each row now contains the features from the previous 1–4 TRs.
This is using the four previous TR's chunk to predict the current TR. 

Now we make delayed vectors for test stories as well.

In [14]:
vocab_size = len(unique_words) #unique words are from TRAIN STORIES

onehot_matrices_test = {}  # Storing one-hot matrix per story

for story in test_stories:
    words = raw_text[story].data
    num_words = len(words)
    
    # Initialize zero matrix
    onehot = np.zeros((num_words, vocab_size), dtype=np.int8)
    
    # Fill in the one-hot vectors
    for i, word in enumerate(words):
        if word in word_to_index:
            j = word_to_index[word]
            onehot[i, j] = 1
    
    onehot_matrices_test[story] = onehot #store matrix as value and story name as key

print(len(onehot_matrices_test)) #30 keys

wordseqs2 = {story: raw_text[story] for story in test_stories} #wordseqs for test stories
downsampled_vectors_1_test = downsample_word_vectors(
    stories=test_stories,
    word_vectors=onehot_matrices_test,
    wordseqs=wordseqs2
)

trimmed_vectors_1_test = {} #trimmed matrices for each test story 

for story, matrix in downsampled_vectors_1_test.items():
    trimmed_matrix = matrix[5:-10]
    trimmed_vectors_1_test[story] = trimmed_matrix

delayed_vectors_1_test = {}

for story, trimmed_matrix in trimmed_vectors_1_test.items():
    X_lagged = make_delayed(trimmed_matrix, delays=[1, 2, 3, 4])
    delayed_vectors_1_test[story] = X_lagged



31


Each row now contains the features from the previous 1–4 TRs.
This is using the four previous TR's chunk to predict the current TR. 

**Word2Vec method**
Now we do the same process for Word2Vec method.

1. Make vectors for each word in train_stories
Create matrices for each stories. 

We use w2v pretrained model that gives a vector of length 300 for each word in train stories. 

In [15]:
# Load pre-trained Google News Word2Vec (300-dim)
# You need to download this separately
# Download from: https://code.google.com/archive/p/word2vec/

#unzip the gz file and put the bin file INSIDE the DATA folder
model_path = '../data/GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)


First, we check if the W2V model includes all the unique words for all the words in train_stories.
We use unique_words that we defined earlier. 
We see below that about 5% of the words are not defined in w2v, but since 5% is negligible, we just put 0 vectors for the words that aren't defined in w2v model. 

In [16]:
in_model = [word for word in unique_words if word in w2v]
not_in_model = [word for word in unique_words if word not in w2v]
print(f"Percentage of words Not in w2v model: {len(not_in_model)/len(unique_words):.4f}")

Percentage of words Not in w2v model: 0.0535


In [17]:
w2v['apple']

array([-0.06445312, -0.16015625, -0.01208496,  0.13476562, -0.22949219,
        0.16210938,  0.3046875 , -0.1796875 , -0.12109375,  0.25390625,
       -0.01428223, -0.06396484, -0.08056641, -0.05688477, -0.19628906,
        0.2890625 , -0.05151367,  0.14257812, -0.10498047, -0.04736328,
       -0.34765625,  0.35742188,  0.265625  ,  0.00188446, -0.01586914,
        0.00195312, -0.35546875,  0.22167969,  0.05761719,  0.15917969,
        0.08691406, -0.0267334 , -0.04785156,  0.23925781, -0.05981445,
        0.0378418 ,  0.17382812, -0.41796875,  0.2890625 ,  0.32617188,
        0.02429199, -0.01647949, -0.06494141, -0.08886719,  0.07666016,
       -0.15136719,  0.05249023, -0.04199219, -0.05419922,  0.00108337,
       -0.20117188,  0.12304688,  0.09228516,  0.10449219, -0.00408936,
       -0.04199219,  0.01409912, -0.02111816, -0.13476562, -0.24316406,
        0.16015625, -0.06689453, -0.08984375, -0.07177734, -0.00595093,
       -0.00482178, -0.00089264, -0.30664062, -0.0625    ,  0.07

In [18]:
#define function that returns matrix given word_list, model=w2v, 
#vector_size = 300 which is from the pretrained model

def embed_story_words(word_list, model, vector_size):
    embedded = []
    for word in word_list:
        if word in model:
            embedded.append(model[word])
        else:
            embedded.append(np.zeros(vector_size))  # unknown word
    return np.vstack(embedded)

In [19]:
vector_size = 300 #of pretrained model 
story_vectors_w2v = {} 
#Dictionary where key is story name and value is matrix, whose vectors correspond to each word in story

for story in train_stories:
    words = raw_text[story].data
    story_vectors_w2v[story] = embed_story_words(words, w2v, vector_size)

2. Downsample and Trim for matrices made by W2V model

In [20]:
# we already defines wordseqs in BoW method.
downsampled_vectors_2 = downsample_word_vectors(
    stories=train_stories,
    word_vectors=story_vectors_w2v,
    wordseqs=wordseqs
)

In [21]:
trimmed_vectors_2 = {} #trimmed matrices for each train story 

for story, matrix in downsampled_vectors_2.items():
    trimmed_matrix = matrix[5:-10]
    trimmed_vectors_2[story] = trimmed_matrix


3. Creating lagged versions

Since the current number of columns for each matrix ix 300, 
the new matrices will have 300*4=1200 columns.

In [22]:
delayed_vectors_2 = {}

for story, trimmed_matrix in trimmed_vectors_2.items():
    X_lagged = make_delayed(trimmed_matrix, delays=[1, 2, 3, 4])
    delayed_vectors_2[story] = X_lagged

# check for first trimmed,delayed matrix, which corresponds to first story
for key, value in delayed_vectors_2.items():
    print("First key:", key)
    print("delayed matrix for first story is", value.shape)
    break

First key: adventuresinsayingyes
delayed matrix for first story is (391, 1200)


Now we make embeddings for test stories.

In [23]:
story_vectors_w2v_test = {}
for story in test_stories:
    words = raw_text[story].data
    story_vectors_w2v_test[story] = embed_story_words(words, w2v, vector_size)

wordseqs2 = {story: raw_text[story] for story in test_stories}
downsampled_vectors_2_test = downsample_word_vectors(
    stories=test_stories,
    word_vectors=story_vectors_w2v_test,
    wordseqs=wordseqs2
)

trimmed_vectors_2_test = {} #trimmed matrices for each test story 
for story, matrix in downsampled_vectors_2_test.items():
    trimmed_matrix = matrix[5:-10]
    trimmed_vectors_2_test[story] = trimmed_matrix

delayed_vectors_2_test = {}
for story, trimmed_matrix in trimmed_vectors_2_test.items():
    X_lagged = make_delayed(trimmed_matrix, delays=[1, 2, 3, 4])
    delayed_vectors_2_test[story] = X_lagged

print(len(delayed_vectors_2_test))

31


In [24]:
import gc

del w2v
gc.collect()


0

**GloVe method** 
Basically we do the same as the two methods above. 


In [25]:
#Download a pre-trained GloVe file in https://nlp.stanford.edu/projects/glove/
#glove.6B.zip <- unzip this file and MOVE the glove.6B.300d.txt to DATA folder

#Since glove file is txt file, we will manually make a dictionary, glove_dict
#whose keys are words and values are 300 dim. vectors corresponding for each word of Glove Model. 
glove_file = '../data/glove.6B.300d.txt'

glove_dict = {}
with open(glove_file, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = [float(val) for val in values[1:]]
        glove_dict[word] = vector

We use embed_story_words function defined earlier, used in previous model.

In [26]:
vector_size = 300 #of pretrained model (Glove)
story_vectors_glv = {} 

for story in train_stories:
    words = raw_text[story].data
    story_vectors_glv[story] = embed_story_words(words, glove_dict, vector_size)

2. Downsample and trim matrices

In [27]:
# we already defined wordseqs in BoW method.
downsampled_vectors_3 = downsample_word_vectors(
    stories=train_stories,
    word_vectors=story_vectors_glv,
    wordseqs=wordseqs
)

trimmed_vectors_3 = {} #trimmed matrices for each train story 

for story, matrix in downsampled_vectors_3.items():
    trimmed_matrix = matrix[5:-10]
    trimmed_vectors_3[story] = trimmed_matrix

3. Creating lagged version of matrices

Same as W2V, 300 * 4 =1200 will be the number of columns.

In [28]:
delayed_vectors_3 = {}

for story, trimmed_matrix in trimmed_vectors_3.items():
    X_lagged = make_delayed(trimmed_matrix, delays=[1, 2, 3, 4])
    delayed_vectors_3[story] = X_lagged

# check for first trimmed, delayed matrix, which corresponds to story 'itsabox'
for key, value in delayed_vectors_3.items():
    print("First key:", key)
    print("delayed matrix for first story is", value.shape)
    break

First key: adventuresinsayingyes
delayed matrix for first story is (391, 1200)


In [29]:
vector_size = 300 #of pretrained model (Glove)
story_vectors_glv_test = {} 

for story in test_stories:
    words = raw_text[story].data
    story_vectors_glv_test[story] = embed_story_words(words, glove_dict, vector_size)

# we already defined wordseqs2 for test stories. 
downsampled_vectors_3_test = downsample_word_vectors(
    stories=test_stories,
    word_vectors=story_vectors_glv_test,
    wordseqs=wordseqs2
)

trimmed_vectors_3_test = {} #trimmed matrices for each test story 

for story, matrix in downsampled_vectors_3_test.items():
    trimmed_matrix = matrix[5:-10]
    trimmed_vectors_3_test[story] = trimmed_matrix

delayed_vectors_3_test = {}

for story, trimmed_matrix in trimmed_vectors_3_test.items():
    X_lagged = make_delayed(trimmed_matrix, delays=[1, 2, 3, 4])
    delayed_vectors_3_test[story] = X_lagged

print(len(delayed_vectors_3_test))
#check for first matrix for story
for key, value in delayed_vectors_3_test.items():
    print("First key:", key)
    print("delayed matrix for first story is", value.shape)
    break

31
First key: adollshouse
delayed matrix for first story is (241, 1200)


In [30]:
#check if dimensions match with response matrix
story_s2 = np.load(f"{path_to_data}/subject2/adollshouse.npy")
print("Shape:", story_s2.shape)

Shape: (241, 94251)


In [31]:
del glove_dict
del raw_text
gc.collect()

11