In [1]:
import sys
import os
import random
import pickle
import numpy as np
from preprocessing import make_delayed
from preprocessing import downsample_word_vectors

# Make the project root (the parent of the current working directory, i.e. one
# level up from 'code/') importable so that ridge_utils can be found.
project_root = os.path.abspath('..')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)


In [2]:
# Read the pickled story transcripts from the shared project directory.
# NOTE(review): pickle.load can execute arbitrary code while loading; this file
# is assumed to be trusted because it comes from the shared course data.
raw_text_path = '/ocean/projects/mth240012p/shared/data/raw_text.pkl'
with open(raw_text_path, 'rb') as f:
    raw_text = pickle.load(f)

print(type(raw_text))   # the container is a dict
print(len(raw_text))    # number of stories (109)
print(raw_text.keys())  # each key is a story name

<class 'dict'>
109
dict_keys(['sweetaspie', 'thatthingonmyarm', 'tildeath', 'indianapolis', 'lawsthatchokecreativity', 'golfclubbing', 'jugglingandjesus', 'shoppinginchina', 'cocoonoflove', 'hangtime', 'beneaththemushroomcloud', 'dialogue4', 'thepostmanalwayscalls', 'stumblinginthedark', 'kiksuya', 'haveyoumethimyet', 'theinterview', 'againstthewind', 'tetris', 'canplanetearthfeedtenbillionpeoplepart2', 'alternateithicatom', 'goldiethegoldfish', 'seedpotatoesofleningrad', 'onapproachtopluto', 'canplanetearthfeedtenbillionpeoplepart1', 'bluehope', 'superheroesjustforeachother', 'howtodraw', 'myfirstdaywiththeyankees', 'thumbsup', 'avatar', 'mayorofthefreaks', 'gangstersandcookies', 'breakingupintheageofgoogle', 'forgettingfear', 'waitingtogo', 'firetestforlove', 'goingthelibertyway', 'thefreedomridersandme', 'exorcism', 'itsabox', 'inamoment', 'afearstrippedbare', 'swimmingwithastronauts', 'ifthishaircouldtalk', 'whenmothersbullyback', 'vixenandtheussr', 'adollshouse', 'catfishingstrang

In [3]:
# Use the story 'penpal' as an example and list the attributes and methods
# its story object exposes.
penpal_story = raw_text['penpal']
dir(penpal_story)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'chunk_to_data_ind',
 'chunkmeans',
 'chunks',
 'chunksums',
 'copy',
 'data',
 'data_times',
 'data_to_chunk_ind',
 'from_chunks',
 'from_grid',
 'mapdata',
 'split_inds',
 'tr_times']

In [4]:
# .data holds the transcript of the story as a list of words.
penpal_words = raw_text['penpal'].data
print(type(penpal_words))
print(len(penpal_words))  # 1592 words in total
penpal_words[:10]  # preview of the first ten words


<class 'list'>
1592


['', 'i', 'was', 'overseas', 'not', 'that', 'long', 'ago', 'on', 'a']

In [5]:
# .data_times is a 1-D array with one entry per word in .data
# (presumably the time at which each word occurs -- confirm the unit).
penpal_word_times = raw_text['penpal'].data_times
print(type(penpal_word_times))
print(penpal_word_times.shape)  # 1-D vector
penpal_word_times[:20]

<class 'numpy.ndarray'>
(1592,)


array([6.23582766e-03, 1.24467120e+00, 1.49410431e+00, 2.00294785e+00,
       2.41201814e+00, 2.59659864e+00, 2.83106576e+00, 3.17528345e+00,
       3.71405896e+00, 4.08408275e+00, 4.30358388e+00, 4.38253968e+00,
       4.60702948e+00, 4.95623583e+00, 5.34036281e+00, 5.62970522e+00,
       5.91904762e+00, 6.29319728e+00, 6.56258503e+00, 7.25600907e+00])

In [6]:
# .tr_times has one entry per fMRI scan (TR) recorded for the story.
penpal_tr_times = raw_text['penpal'].tr_times
print(type(penpal_tr_times))
print(penpal_tr_times.shape)  # number of fMRI scans for this story
penpal_tr_times[:10]
# Consecutive entries are 2 apart (presumably seconds), i.e. one fMRI scan
# was taken every 2 seconds.

<class 'numpy.ndarray'>
(270,)


array([-9., -7., -5., -3., -1.,  1.,  3.,  5.,  7.,  9.])

In [7]:
# .split_inds lists the indices at which the word list is cut into
# per-scan word chunks.
penpal_split_inds = raw_text['penpal'].split_inds
print(type(penpal_split_inds))
print(len(penpal_split_inds))  # one fewer split index than tr_times entries
penpal_split_inds[:15]
# The first five split indices are all 0, so the first five word chunks are
# empty: those fMRI scans were taken before the story started.

<class 'list'>
269


[0, 0, 0, 0, 0, 4, 10, 17, 23, 30, 36, 44, 48, 54, 61]

In [12]:
# Average number of words per fMRI scan (TR) for 'penpal'. Computed from the
# data rather than from hard-coded counts (1592 words / 270 TRs) so the value
# stays correct if the story or the data changes.
len(raw_text['penpal'].data) / len(raw_text['penpal'].tr_times)

5.896296296296296

Some stories in the raw_text file are not available for subjects 2 and 3. 
We exclude them. 

In [8]:
def list_story_names(subject_dir):
    """Return the set of story names (file names without extension) of the .npy files in subject_dir."""
    return {
        os.path.splitext(fname)[0]
        for fname in os.listdir(subject_dir)
        if fname.endswith('.npy')
    }


all_stories = set(raw_text.keys())
subj2_stories = list_story_names('/ocean/projects/mth240012p/shared/data/subject2')
subj3_stories = list_story_names('/ocean/projects/mth240012p/shared/data/subject3')
# True means subjects 2 and 3 have recordings for exactly the same stories
print(subj2_stories == subj3_stories)
# Keep only stories that have a transcript and a recording for both subjects
valid_stories = sorted(all_stories & subj2_stories & subj3_stories)
print(len(valid_stories))

True
101


In [9]:
# Stories that have a transcript in raw_text but were dropped from
# valid_stories because they lack a recording for both subjects
all_stories.difference(valid_stories)

{'dialogue1',
 'dialogue2',
 'dialogue3',
 'dialogue4',
 'dialogue5',
 'dialogue6',
 'myfirstdaywiththeyankees',
 'onlyonewaytofindout'}

Now, we explore the npy files for subjects 2 and 3.

In [11]:
# Load subject 2's fMRI recording for one example story ('penpal')
penpal_s2 = np.load('/ocean/projects/mth240012p/shared/data/subject2/penpal.npy')

# Print basic info: container type, dimensions, and the first 5 rows
for label, value in (
    ("Type:", type(penpal_s2)),
    ("Shape:", penpal_s2.shape),
    ("Data (preview):", penpal_s2[:5]),
):
    print(label, value)

# Rows: 255 time points (fMRI time steps while the subject listens to 'penpal')
# Columns: 94,251 values per row, the number of voxels (3D brain regions) measured
# NOTE(review): raw_text['penpal'].tr_times has 270 entries but this array has
# 255 rows -- presumably the recording was trimmed; confirm before aligning.

Type: <class 'numpy.ndarray'>
Shape: (255, 94251)
Data (preview): [[ 1.2453781  -0.41628303  0.40856305 ... -0.24667304 -0.04651232
  -1.07498571]
 [ 0.08730935 -0.20358665 -0.26335135 ... -0.02177695 -1.43287916
   0.43917887]
 [ 1.18076728  1.12036485  0.40589049 ... -0.41354723 -0.23335376
  -0.9543282 ]
 [-0.08483419 -0.98204067  0.3281862  ...  0.18280717 -0.08003498
  -0.22902656]
 [-0.01154614 -0.8344062  -0.02151216 ...  0.70086765  0.32259273
   1.06684996]]


In [10]:
# Load subject 3's fMRI recording for the same example story ('penpal')
penpal_s3 = np.load('/ocean/projects/mth240012p/shared/data/subject3/penpal.npy')

# Print basic info: container type, dimensions, and the first 5 rows
for label, value in (
    ("Type:", type(penpal_s3)),
    ("Shape:", penpal_s3.shape),
    ("Data (preview):", penpal_s3[:5]),
):
    print(label, value)

# Rows: 255 time points (fMRI time steps while the subject listens to the story)
# Columns: 95,556 values per row, the number of voxels (brain regions) measured

Type: <class 'numpy.ndarray'>
Shape: (255, 95556)
Data (preview): [[-0.38695819  0.13897068  1.08238927 ...  1.15880633  0.30080054
   0.10387555]
 [ 1.62235331  0.12342972 -0.94471412 ... -0.95414812 -0.40348039
   1.9248084 ]
 [-0.49907409 -0.18462957  0.13827181 ...  0.30105886  1.58567952
   0.52668597]
 [-0.4617879   0.75625333  1.75015338 ... -2.12386876 -0.62403329
   0.14806919]
 [-1.33391335 -0.64554708  1.54707993 ... -0.83972679  0.26833241
   1.04564885]]


We can see that the number of voxels differs between subjects 2 and 3 (94,251 vs. 95,556), while the number of TRs for the same story is identical (255). 