In [None]:
## Mount Google Drive at /content/gdrive so later cells can read files stored under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
## Unzip the newsbooks archive and collect the path of every extracted file.
import os
import zipfile

path = '/content/gdrive/MyDrive/newsbooks_project_36490/newsbooks-1.zip'
with zipfile.ZipFile(path, 'r') as zip_ref:
    # No target directory is given, so the archive is extracted into the
    # current working directory.
    zip_ref.extractall()

print('unzipped')
path_pref = 'booktxt/'
# path_pref already ends with '/', so os.path.join is used instead of adding a
# second '/' by hand (which produced 'booktxt//<name>').
# NOTE(review): the directory is listed through the absolute path
# '/content/booktxt' while the stored paths are relative -- this assumes the
# working directory is /content; confirm.
# os.listdir returns names in arbitrary order, so a position in filepaths does
# not identify a particular document.
filepaths = [os.path.join(path_pref, fname) for fname in os.listdir('/content/booktxt')]
print(len(filepaths)) # number of files in 'booktxt', where the unzipped contents should be

unzipped
1179


In [None]:
mk_str = str  # short alias for the built-in str
import pickle
import numpy as np
pkl_get = pickle.load  # alias: read one pickled object from an open binary file
pkl_put = pickle.dump  # alias: write one object as a pickle to an open binary file

# Sentinel returned by get_file_lines when a file cannot be decoded as text.
ERROR_LIST = [-1]

def get_file_lines (fname):
  """Read a text file and return its lines, skipping the first two.

  Args:
    fname: path of the file to read (opened in text mode with the default
      encoding).

  Returns:
    A list with one string per line, without the trailing newline, starting
    at the third line of the file. If the file cannot be decoded, the
    module-level sentinel ERROR_LIST is returned instead.
  """
  try:
    # The with-block closes the file even when decoding fails part-way.
    with open(fname, 'r') as f:
      raw_lines = f.readlines()
  except UnicodeDecodeError:
    return ERROR_LIST # Error

  # Only remove a newline that is really there: the last line of a file may
  # have none, and slicing blindly would cut off its final character.
  return [line[:-1] if line.endswith('\n') else line for line in raw_lines[2:]]

def flatten_lines (A):
  """Return a shallow copy of the list ``A``.

  Despite the name, no nesting is removed: every element of ``A`` is carried
  over unchanged, in order. An empty list is returned as-is.

  The copy is made in one step rather than by recursing on ``A[1:]``, so long
  lists no longer hit Python's recursion limit or pay for repeated slicing.
  """
  if A == []: return A
  return list(A)

def flatten(A):
  """Concatenate the sub-lists of ``A`` into a single list (one level deep).

  Args:
    A: a list whose elements are lists.

  Returns:
    A new list with the elements of every sub-list, in order. An empty ``A``
    is returned as-is.

  The result is built iteratively rather than by recursing on ``A[1:]``, so
  inputs with many sub-lists no longer hit Python's recursion limit.
  """
  if A == []: return A
  flat = []
  for sub in A:
    flat.extend(sub)
  return flat

def lines_2_words(lines):
  """Split every line on single spaces, giving one word list per line.

  Args:
    lines: a list of strings, or the ERROR_LIST sentinel.

  Returns:
    A list with one list of words per input line (still nested -- the result
    is not flattened here), or an empty list when ``lines`` is ERROR_LIST.
  """
  # The sentinel holds no text, so there is nothing to split.
  if lines == ERROR_LIST:
    return []
  # The leftover debug print of the whole document was removed; the list is
  # built directly, without an intermediate copy.
  return [line.split(' ') for line in lines]
  

def jaccard_between(doc1_lines, doc2_lines):
  """Jaccard index of two documents, each treated as a set of its lines.

  Args:
    doc1_lines: iterable of hashable items (whole lines) of the first document.
    doc2_lines: iterable of hashable items (whole lines) of the second document.

  Returns:
    |intersection| / |union| of the two sets as a float. When both documents
    are empty the union is empty too; 1.0 is returned in that case (two empty
    sets are identical) instead of raising ZeroDivisionError.
  """
  # set() accepts the line lists directly, so no intermediate copy is made.
  doc1, doc2 = set(doc1_lines), set(doc2_lines)
  doc1_intersect_doc2_N = len(doc1 & doc2)
  # Inclusion-exclusion gives the union size without building the union.
  doc1_union_doc2_N = len(doc1) + len(doc2) - doc1_intersect_doc2_N

  if doc1_union_doc2_N == 0:
    return 1.0
  return doc1_intersect_doc2_N / doc1_union_doc2_N

# baseline test and demo of jaccard index
lines_file1 = get_file_lines(filepaths[5])
lines_file2 = get_file_lines(filepaths[7])
# words_file1 / words_file2 are built here but are not passed to
# jaccard_between below, which receives the line lists.
words_file1 = flatten(lines_2_words(lines_file1))
words_file2 = flatten(lines_2_words(lines_file2))
# A document compared with itself.
print(jaccard_between(lines_file1, lines_file1))
# Two different files.
# NOTE(review): if both files fail to decode, both line lists are ERROR_LIST
# and this prints 1.0 without comparing any real text.
print(jaccard_between(lines_file1, lines_file2))



[-1]
[-1]
1.0
1.0


In [None]:
def pair_up(A):
  """Return every unordered pair of elements of ``A`` as a two-item list.

  Pairs are ``[A[i], A[j]]`` with ``i < j``, ordered by ``i`` first and then
  by ``j``. Fewer than two elements yields an empty list.
  """
  count = len(A)
  pairs = []
  for first in range(count):
    # Only later positions are paired with ``first``.
    for second in range(first + 1, count):
      pairs.append([A[first], A[second]])
  return pairs

def mk_result_str(elt1, elt2):
  """Build the report line for the Jaccard index of two indexed documents.

  Args:
    elt1: a two-item sequence ``(doc_index, lines)`` for the first document.
    elt2: a two-item sequence ``(doc_index, lines)`` for the second document.

  Returns:
    A string naming both indices followed by the value returned by
    jaccard_between for the two line collections.
  """
  first_id, first_lines = elt1
  second_id, second_lines = elt2
  return ''.join([
    'jaccard between doc ', mk_str(first_id),
    ' and ', mk_str(second_id),
    ' is ', mk_str(jaccard_between(first_lines, second_lines)),
  ])

def jaccard_all_test():
  """Compute the Jaccard report line for every pair of readable documents.

  Reads every path in the module-level ``filepaths``, drops the files for
  which get_file_lines returned ERROR_LIST, prints how many were dropped, and
  returns one result string (see mk_result_str) per remaining pair.
  """
  # Keep each document's position in filepaths next to its lines.
  indexed_docs = []
  for idx, fpath in enumerate(filepaths):
    indexed_docs.append([idx, get_file_lines(fpath)])

  total = len(indexed_docs)
  readable_docs = [doc for doc in indexed_docs if doc[1] != ERROR_LIST] # filter out errors
  print('num errors', total - len(readable_docs))

  # Every pair of readable documents is compared (not a small sample).
  return [mk_result_str(left, right) for left, right in pair_up(readable_docs)]

# test_doc_results = jaccard_all_test()
# _ = [print(test_doc_results1) for test_doc_results1 in test_doc_results]


In [None]:
# _ = [print(result) for result in test_doc_results if float(result.split(' ')[-1]) > .1 and int(result.split(' ')[3]) < 606 and int(result.split(' ')[5]) < 606 ]

jaccard between doc 70 and 84 is 0.16046511627906976
jaccard between doc 93 and 537 is 0.10159651669085631
jaccard between doc 401 and 437 is 0.4789053591790194
jaccard between doc 435 and 580 is 0.23411978221415608
jaccard between doc 474 and 554 is 0.456359102244389


# **TF-IDF**


In [None]:
# Cite: Anup's data preprocessing code

import os
import zipfile

# Extract the archive into the current working directory.
path = '/content/gdrive/MyDrive/newsbooks_project_36490/newsbooks-1.zip'
with zipfile.ZipFile(path, 'r') as zip_ref:
    zip_ref.extractall()
print('unzipped')

path_pref = 'booktxt/'
# path_pref already ends with '/', so each file name is appended directly
# (no extra separator).
filepaths = [f'{path_pref}{entry_name}' for entry_name in os.listdir('/content/booktxt')]
print(len(filepaths)) # number of files in 'booktxt', where the unzipped contents should be
# filepaths

unzipped
1179


### 1. Toy Example

In [None]:
# Toy Example with only two files
# res_lst = [" ".join(open(path, encoding="utf8", errors='ignore').read().splitlines()) for path in filepaths[:2]]

# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# countVectorizer = CountVectorizer(analyzer= 'word', stop_words='english')
# tfidfVectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')

# count_matrix = countVectorizer.fit_transform(res_lst)
# tfidf_matrix = tfidfVectorizer.fit_transform(res_lst)
# count_names = countVectorizer.get_feature_names()
# tfidf_names = tfidfVectorizer.get_feature_names()

# import pandas as pd
# pd.DataFrame.sparse.from_spmatrix(count_matrix, columns = count_names)
# pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns = tfidf_names)

### 2. Run TF-IDF on all Text files

In [None]:
def _read_doc_as_one_line(doc_path):
  """Return the text of ``doc_path`` with its lines joined by single spaces."""
  # errors='ignore' drops undecodable bytes instead of raising; the with-block
  # closes each file as soon as it has been read, so no handles are leaked.
  with open(doc_path, encoding="utf8", errors='ignore') as doc_file:
    return " ".join(doc_file.read().splitlines())

# One string per document, in the same order as filepaths.
res_lst = [_read_doc_as_one_line(path) for path in filepaths]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Word-level TF-IDF with the built-in English stop-word list. The custom
# token_pattern only matches runs of characters that are neither digits, nor
# non-word characters, nor '_', so tokens never contain digits or underscores.
tfidfVectorizer = TfidfVectorizer(analyzer='word',stop_words='english', token_pattern=r'(?u)\b[^\d\W_+*&@]+\b') # we exclude numbers

In [None]:
# Learn the vocabulary from all documents and build the document-term matrix.
tfidf_matrix = tfidfVectorizer.fit_transform(res_lst)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when it exists and
# fall back to the old one on older installations. list() keeps tfidf_names a
# plain list in both cases.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    tfidf_names = list(tfidfVectorizer.get_feature_names_out())
else:
    tfidf_names = tfidfVectorizer.get_feature_names()

In [None]:
# TF-IDF scores for each word per document
import pandas as pd
# One column per vocabulary term, built from the sparse TF-IDF matrix.
df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns = tfidf_names)
# doc_name is the second '/'-separated component of each path, cut at its first '.'.
# NOTE(review): assumes every path looks like 'booktxt/<name>.<ext>' -- confirm.
df.insert(0, 'doc_name', [path.split("/")[1].split(".")[0] for path in filepaths])
# Order the rows by doc_name and renumber them, so that a row's position
# follows the sorted names.
df.sort_values(by='doc_name', inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,doc_name,aa,aaccounts,aacordier,aad,aadm,aae,aaen,aaff,aais,aaken,aamentem,aamra,aanee,aanvers,aao,aarbour,aaren,aargile,aaron,aaubcate,aaw,aay,ab,aba,abah,abalter,abandio,abandon,abandoned,abandoneth,abandoning,abandonn,abardeen,abary,abase,abased,abasement,abasing,abassadors,...,zur,zurich,zurick,zurickers,zurzach,zurzack,zutphen,zutsen,zuy,zuylensteen,zvirczin,zvoez,zvooz,zvouz,zweybruck,zweybrugg,zweyer,zwich,zwiszerland,zwitzerland,zwoll,zwyer,zyburg,zωotomia,à,â,æ,æbrevi,æmilius,ænexum,ætermittendam,è,ô,üan,œ,ś,בך,ובוא,ךך,שןא
0,0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0001,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0002,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0003,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0004,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174,1174,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1175,1175,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1176,1176,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1177,1177,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# def transform(row, n):
#   # our original rows are sparse matrix, so we first obtain the nonzero indices
#   # we don't want to include the first column - the Doc Name
#   row = row.to_numpy()
#   inds = row.nonzero()
#   new_series = pd.Series(row[inds]).astype('float')
#   n_largest_inds = new_series.nlargest(n).index
#   return n_largest_inds

# new_df = df.apply(lambda row,n: transform(row,n), axis=1, n=5)


In [None]:
# pandas apply is too slow, so I installed a package here
# !pip install pandarallel

In [None]:
# from pandarallel import pandarallel
# pandarallel.initialize()

In [None]:
def getTopKImportantWordsFromDoc(df, doc_name, n):
  """Return the positions of the ``n`` highest scores in one row of ``df``.

  Args:
    df: frame whose first column is skipped and whose remaining columns hold
      values convertible to float.
    doc_name: value accepted by ``int``; used as the row *position* in ``df``.
    n: number of top entries to keep.

  Returns:
    A pair ``(top_positions, scores)``: the index of the ``n`` largest scores
    (positions within ``scores``) and the row's values without the first
    column, as a NumPy array.
  """
  # NOTE(review): the row is picked by position, not by matching the doc_name
  # column -- presumably rows are ordered so position equals document number; confirm.
  position = int(doc_name)
  scores = df.iloc[position, :].to_numpy()[1:] # drop the leading doc_name entry
  # Could be optimised later by ranking only the non-zero entries.
  ranked = pd.Series(scores).astype('float').nlargest(n)
  return ranked.index, scores

In [None]:
# Column labels of df without the leading doc_name column.
words_lst = df.columns[1:] # the first column is doc_name, we don't want that
words_lst

Index(['aa', 'aaccounts', 'aacordier', 'aad', 'aadm', 'aae', 'aaen', 'aaff',
       'aais', 'aaken',
       ...
       'ætermittendam', 'è', 'ô', 'üan', 'œ', 'ś', 'בך', 'ובוא', 'ךך', 'שןא'],
      dtype='object', length=119285)

In [None]:
# row = df.iloc[462, :]
# row = row.to_numpy()[1:] # we don't want the doc_name to be in our array
# inds = row.nonzero()
# len(row[inds])

In [None]:
# we call getTopKImportantWordsFromDoc to get the top k important words from a specific document
# inds: positions of the 10 highest scores; row: the values of the row at position 462 of df, without doc_name
inds, row = getTopKImportantWordsFromDoc(df, "0462", 10)
print(inds)
print(row[inds])  # the scores at those positions
print(words_lst[inds])  # the column labels at those positions

Int64Index([84162, 58880, 118715, 32527, 49918, 938, 93794, 91361, 97373,
            80479],
           dtype='int64')
[0.19020957946961123 0.16643990072740597 0.16390151733549407
 0.15572010944317322 0.15550012805642052 0.14534716569115844
 0.13946031090760586 0.1137610222642816 0.11039333874172874
 0.10368373650105066]
Index(['premises', 'june', 'ye', 'dito', 'hath', 'act', 'said', 'reversions',
       'shall', 'persons'],
      dtype='object')


In [None]:
# Same lookup for the row at position 1001 of df; only the column labels are printed.
inds, row = getTopKImportantWordsFromDoc(df, "1001", 10)
print(words_lst[inds])

Index(['october', 'octob', 'colen', 'unr', 'having', 'men', 'hath', 'vigo',
       'franckford', 'frigats'],
      dtype='object')


# Jaccard with TF-IDF

In [None]:
# get top k=300 important words of each doc

def id_to_str(n):
  """Render ``n`` as text, left-padded with '0' to at least four characters."""
  return mk_str(n).rjust(4, '0')

# Computes the top 300 entries for row positions 0-599 of df directly.
# NOTE(review): a later cell reassigns all_doc_top_300 from the pickle cache via
# get_tfidf(), and its comments put this computation at about 11 minutes.
all_doc_top_300 = [getTopKImportantWordsFromDoc(df, id_to_str(doc_id), 300) for doc_id in range(600)]

In [None]:
# tempcode to store computation
# store_result(all_doc_top_300, '/content/gdrive/MyDrive/newsbooks_project_36490/storage.pkl')


In [None]:
def store_result (A, fname):
  """Pickle the object ``A`` into the file ``fname`` (overwriting it).

  Args:
    A: any picklable object.
    fname: path of the file to write.
  """
  # The with-block closes the file even if pickling raises. pickle.dump is
  # the function behind the module-level alias pkl_put.
  with open(fname, 'wb') as f:
    pickle.dump(A, f)

def get_result (fname):
  """Load and return the object pickled in the file ``fname``.

  Args:
    fname: path of a file written with pickle (e.g. by store_result).

  Returns:
    The unpickled object.
  """
  # SECURITY: unpickling can execute arbitrary code -- only load files that
  # this project wrote itself.
  # The with-block closes the file even if unpickling raises. pickle.load is
  # the function behind the module-level alias pkl_get.
  with open(fname, 'rb') as f:
    return pickle.load(f)

def run_and_put_tfidf(n_docs=600, top_k=300, fname='/content/gdrive/MyDrive/newsbooks_project_36490/storage.pkl'):
  """Compute the top entries of the first ``n_docs`` rows of ``df`` and pickle them.

  Args:
    n_docs: how many row positions (0 .. n_docs-1) to process. Defaults to 600.
    top_k: how many top entries to keep per row. Defaults to 300.
    fname: path of the pickle file to write. Defaults to the project's
      storage.pkl on the mounted drive.

  Called with no arguments, this behaves exactly as before the parameters
  were introduced.
  """
  top_words_per_doc = [getTopKImportantWordsFromDoc(df, id_to_str(doc_id), top_k) for doc_id in range(n_docs)]
  store_result(top_words_per_doc, fname)

def get_tfidf(fname='/content/gdrive/MyDrive/newsbooks_project_36490/storage.pkl'):
  """Load the pickled result from ``fname``.

  Args:
    fname: path of the pickle file to read. Defaults to the project's
      storage.pkl on the mounted drive, so calling with no arguments behaves
      exactly as before the parameter was introduced.

  Returns:
    Whatever object get_result unpickles from the file.
  """
  return get_result(fname)

# First run only (takes about 11 minutes): compute the result and write the pickle cache.
# This has ALREADY BEEN DONE -- do not uncomment unless the cache has to be rebuilt.
# run_and_put_tfidf()
# all_doc_top_300 = get_tfidf()

# Every other run: load the cached result from the pickle file.
all_doc_top_300 = get_tfidf()

In [None]:
# relies on the module-level all_doc_top_300 and words_lst
def result_idx_to_words (idx):
  """Return the entries of ``words_lst`` selected by the stored positions at ``idx``.

  Each entry of ``all_doc_top_300`` is unpacked as a ``(positions, scores)``
  pair; only the positions are used here.
  """
  top_positions, _scores = all_doc_top_300[idx]
  return words_lst[top_positions]

In [None]:
# demo of words
# Show the first 20 of the stored top words for the entry at index 0.
print(result_idx_to_words(0)[:20])

Index(['page', 'chapters', 'act', 'shuttleworth', 'ordered', 'great', 'house',
       'committee', 'unr', 'time', 'marq', 'trustees', 'taken', 'sonldiers',
       'trustee', 'banber', 'skill', 'sea', 'shall', 'service'],
      dtype='object')


In [None]:
def jaccard_between_tfidf(doc1_important_words, doc2_important_words):
  """Jaccard index of two collections of important words.

  Args:
    doc1_important_words: iterable of hashable words for the first document.
    doc2_important_words: iterable of hashable words for the second document.

  Returns:
    |intersection| / |union| of the two word sets as a float. When both
    collections are empty the union is empty too; 1.0 is returned in that
    case (two empty sets are identical) instead of raising ZeroDivisionError.
  """
  # set() consumes any iterable directly, so no intermediate list is needed.
  doc1, doc2 = set(doc1_important_words), set(doc2_important_words)
  doc1_intersect_doc2_N = len(doc1 & doc2)
  # Inclusion-exclusion gives the union size without building the union.
  doc1_union_doc2_N = len(doc1) + len(doc2) - doc1_intersect_doc2_N

  if doc1_union_doc2_N == 0:
    return 1.0
  return doc1_intersect_doc2_N / doc1_union_doc2_N

# demo
# Similarity between the stored top-word collections of entries 0 and 1.
print(jaccard_between_tfidf(result_idx_to_words(0), result_idx_to_words(1)))

0.0582010582010582


In [None]:
# all pairwise results
def mk_print(val, q1, q2):
  """Format one pairwise result, or return '' when ``val`` is not above 0.1.

  Args:
    val: the similarity value to report.
    q1: identifier of the first document.
    q2: identifier of the second document.
  """
  # Anything that is not strictly greater than .1 is reported as an empty string.
  if not val > .1:
    return ''
  return ''.join(['tfidf jaccard between ', mk_str(q1), ' and ', mk_str(q2), ' is ', mk_str(val)])

# Look up each entry's top words once, instead of repeating the same lookup
# for every pair it takes part in.
_top_words_per_doc = [result_idx_to_words(q) for q in range(600)]

# A[q1] holds one message (possibly '') for every q2 < q1, so each unordered
# pair of the 600 entries is compared exactly once.
A = [[ mk_print(jaccard_between_tfidf(_top_words_per_doc[q1], _top_words_per_doc[q2]), q1, q2) for q2 in range(q1)] for q1 in range(600)]

In [None]:
# A is a list of rows (lists of message strings), so it has to be flattened
# one level before the messages can be filtered. The flat list gets its own
# name, which leaves A untouched and makes this cell safe to re-run; a row that
# is already a plain string is kept as a single message.
A_flat = [msg for row in A for msg in ([row] if isinstance(row, str) else row)]
# Keep the non-empty messages whose trailing value is above .25.
print('\n'.join([a for a in A_flat if a != '' and float(a.split(' ')[-1]) > .25 ]))

tfidf jaccard between 48 and 47 is 0.7094017094017094
tfidf jaccard between 72 and 56 is 0.3363028953229399
tfidf jaccard between 77 and 61 is 0.31868131868131866
tfidf jaccard between 81 and 57 is 0.2903225806451613
tfidf jaccard between 82 and 63 is 0.36054421768707484
tfidf jaccard between 88 and 30 is 0.31868131868131866
tfidf jaccard between 95 and 37 is 0.2765957446808511
tfidf jaccard between 98 and 27 is 0.28205128205128205
tfidf jaccard between 100 and 88 is 0.32450331125827814
tfidf jaccard between 101 and 89 is 0.29310344827586204
tfidf jaccard between 102 and 90 is 0.26582278481012656
tfidf jaccard between 104 and 85 is 0.2578616352201258
tfidf jaccard between 105 and 92 is 0.284796573875803
tfidf jaccard between 106 and 96 is 0.26582278481012656
tfidf jaccard between 158 and 155 is 0.34831460674157305
tfidf jaccard between 300 and 299 is 0.5544041450777202
tfidf jaccard between 396 and 340 is 0.3303769401330377
tfidf jaccard between 399 and 343 is 0.279317697228145
tfidf j