Colab doesn't have enough RAM to even create a empty 25,000x89527 matrix that would store each word occurance for every datapoint. 
My solution is to sort the vocab list by their weight and then split that into groups of roughly equal size. The groups or 'buckets' are words with similar weights and the weight assigned to the whole bucket is the mean of its contents.

In [7]:
import numpy as np
import pandas as pd

words_file = open('/content/drive/MyDrive/B455/Project 4/Data/imdb.vocab')
words = words_file.readlines()
words_file.close()

weights_file = open('/content/drive/MyDrive/B455/Project 4/Data/imdbEr.txt')
word_weights = weights_file.readlines()
weights_file.close()

with_zeros = False

#entires come with a lingering \n from the file
words = [item.replace('\n', '') for item in words]
#converts the entries from 'str' to 'float'
word_weights = [float(item) for item in word_weights]

print(f'Total number of words: {len(words)}')
#total number of word buckets to be created
num_buckets = 500

df = pd.DataFrame({'words': words, 'weights': word_weights})
#sorts the words by their weight
df = df.sort_values(by=['weights'])

#separates vocab by words with non-zero and zero weights
neutral_words = df[df['weights'] == 0]
weighted_words = df[df['weights'] != 0]

#splits the DataFrame into a python list with num_buckets parts of roughly equal size
x = np.array_split(weighted_words, num_buckets, axis=0)
#constructs a list of tuples containing (pandas Series containing words, mean weight of all words in the pandas Series)
vocab_list = [(frame['words'], np.mean(frame['weights'])) for frame in x]

#append zero weighted words
if with_zeros:
  vocab_list.append((neutral_words['words'], np.mean(neutral_words['weights'])))

print(f'Number of \'buckets\': {num_buckets}\nAverage words per \'bucket\': {len(vocab_list[0][0])}')
print(f'Example bucket:\n{vocab_list[250]}')

# import pickle

# f = open('/content/drive/MyDrive/B455/Project 4/Data/vocab_list.p', 'wb')
# pickle.dump(vocab_list, f)
# f.close()

Total number of words: 89527
Number of 'buckets': 500
Average words per 'bucket': 113
Example bucket:
(22309         oily
11373     meatball
34117      tunisia
1918          sean
35762    pinkerton
           ...    
53257      colburn
45786        dozor
38889         berg
7410        rebels
678         easily
Name: words, Length: 112, dtype: object, 0.30500190816735717)


In [8]:
def read_feat_file(feat_list, vocab_list):
  yy = np.array([1 if int(line[0:2]) > 6 else 0 for line in feat_list])
  X_temp = [item[2:].split() for item in feat_list]

  #multiple occurances of the same word are not taken into account
  X_temp = [[int(item.split(':')[0]) for item in lst] for lst in X_temp] #list comprehension is magic

  xx = np.zeros((len(feat_list), len(vocab_list)))

  #takes about an hour to run this part because there's 3 loops going on for 50,000x500 operations plus extra
  #there's probably a better way to do this
  for i in range(xx.shape[0]):
    if i % 1000 == 0:
      print(f'Iter {i}/50000')
    for j in range(xx.shape[1]):
      #note: multiple occurances of the same word are counted as 1 (from before)
      #the value of xx[i,j] represents the number of words from bucket 'j' that appear in item 'i' (which can be more than 1)
      xx[i,j] = np.sum([1 if item in vocab_list[j][0] else 0 for item in X_temp[i]])

  return (xx, yy)


train = open('/content/drive/MyDrive/B455/Project 4/Data/train/labeledBow.feat')
test = open('/content/drive/MyDrive/B455/Project 4/Data/test/labeledBow.feat')

#bundles all 50,000 samples together to process at once
lines = train.readlines()
lines.extend(test.readlines())

train.close()
test.close()

data = read_feat_file(lines, vocab_list)

#storing the preprocessed data for later because this compuation takes a LONG time
if with_zeros:
  dump_file = open('/content/drive/MyDrive/B455/Project 4/Data/preprocessed_data.p', 'wb')
else:
  dump_file = open('/content/drive/MyDrive/B455/Project 4/Data/preprocessed_data_no_zeros.p', 'wb')

pickle.dump(data, dump_file)
dump_file.close()

Iter 0/50000
Iter 1000/50000
Iter 2000/50000
Iter 3000/50000
Iter 4000/50000
Iter 5000/50000
Iter 6000/50000
Iter 7000/50000
Iter 8000/50000
Iter 9000/50000
Iter 10000/50000
Iter 11000/50000
Iter 12000/50000
Iter 13000/50000
Iter 14000/50000
Iter 15000/50000
Iter 16000/50000
Iter 17000/50000
Iter 18000/50000
Iter 19000/50000
Iter 20000/50000
Iter 21000/50000
Iter 22000/50000
Iter 23000/50000
Iter 24000/50000
Iter 25000/50000
Iter 26000/50000
Iter 27000/50000
Iter 28000/50000
Iter 29000/50000
Iter 30000/50000
Iter 31000/50000
Iter 32000/50000
Iter 33000/50000
Iter 34000/50000
Iter 35000/50000
Iter 36000/50000
Iter 37000/50000
Iter 38000/50000
Iter 39000/50000
Iter 40000/50000
Iter 41000/50000
Iter 42000/50000
Iter 43000/50000
Iter 44000/50000
Iter 45000/50000
Iter 46000/50000
Iter 47000/50000
Iter 48000/50000
Iter 49000/50000
