In [58]:
import numpy as np
from stop_list import closed_class_stop_words
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [59]:
def tokenize_clean(text, vocabulary, output_tokens_list):
    lemmatizer = WordNetLemmatizer()
    tokens = []
    
    if not text:
        return tokens
    
    dirty_tokens = word_tokenize(text)
    for dirty_token in dirty_tokens:
        if '-' in dirty_token:
            dirty_tokens.extend(dirty_token.split('-'))
        if dirty_token not in closed_class_stop_words and dirty_token not in punctuation and dirty_token.isalpha() and len(dirty_token) > 2:
            cleaned_token = dirty_token.lower().strip(punctuation)

            lemmatized_token = lemmatizer.lemmatize(cleaned_token)
            tokens.append(lemmatized_token)
            vocabulary.add(lemmatized_token)

    output_tokens_list.append(tokens)

In [60]:
with open('.\Cranfield_collection_HW\cran.qry') as file:
    queries = file.read().split('.I')

query_tokens_list = []
vocab = set()

for query in queries:
    query = query.split('.W')[-1].strip()
    tokenize_clean(query, vocab, query_tokens_list)


In [61]:
with open('.\Cranfield_collection_HW\cran.all.1400') as file:
    documents = file.read().split('.I')

document_tokens_list = []

for document in documents:
    document = document.split('.W')[-1].strip()
    tokenize_clean(document, vocab, document_tokens_list)

In [62]:
len(vocab)

5989

In [63]:
# create dictionary from vocabulary
sorted_vocab = sorted(vocab)
vocab2index = {word: i for i, word in enumerate(sorted_vocab)}
vocab2index.items()



In [64]:
# query tokens matrix, rows are queries, columns are tokens
query_matrix = np.zeros((len(query_tokens_list), len(vocab)))
print('dimension of query matrix:', query_matrix.shape)
document_matrix = np.zeros((len(document_tokens_list), len(vocab)))
print('dimension of document matrix:', document_matrix.shape)

dimension of query matrix: (225, 5989)
dimension of document matrix: (1398, 5989)


In [65]:
def populate_matrix(tokens_list, matrix, vocab2index):
    for text_row, tokens in enumerate(tqdm(tokens_list)):
        for token in tokens:
            token_col = vocab2index[token]
            tf = tokens.count(token) / len(tokens)
            df = sum([1 for query in tokens_list if token in query])
            idf = np.log(len(tokens_list) / df)
            matrix[text_row, token_col] = tf * idf
    return None

In [66]:
# populate query matrix with tf-idf values
populate_matrix(query_tokens_list, query_matrix, vocab2index)
# populate document matrix with tf-idf values
populate_matrix(document_tokens_list, document_matrix, vocab2index)

100%|██████████| 225/225 [00:00<00:00, 2368.43it/s]
  5%|▌         | 72/1398 [00:14<04:29,  4.93it/s]


KeyboardInterrupt: 

In [None]:
# def populate_matrix_log(tokens_list, matrix, vocab2index):
#     for text_row, tokens in enumerate(tqdm(tokens_list)):
#         for token in tokens:
#             token_col = vocab2index[token]
#             tf = tokens.count(token) / len(tokens)
#             df = sum([1 for query in tokens_list if token in query])
#             idf = np.log(len(tokens_list)/df)
#             matrix[text_row, token_col] = (1 + np.log(tf)) * idf
#     return None

In [None]:
# # populate query matrix with tf-idf values
# populate_matrix_log(query_tokens_list, query_matrix, vocab2index)
# # populate document matrix with tf-idf values
# populate_matrix_log(document_tokens_list, document_matrix, vocab2index)

100%|██████████| 225/225 [00:00<00:00, 2419.36it/s]
100%|██████████| 1398/1398 [03:37<00:00,  6.43it/s]


# performed poorlyo(╥﹏╥)o

In [None]:
Q = query_matrix
D = document_matrix
Q_norm = Q / np.linalg.norm(Q, axis=1, keepdims=True)
D_norm = D / np.linalg.norm(D, axis=1, keepdims=True)

similarity_matrix = Q_norm @ D_norm.T
similarity_matrix

array([[0.        , 0.03132523, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01351929, 0.04564637, 0.        , ..., 0.01087861, 0.        ,
        0.        ],
       [0.01280079, 0.00727007, 0.        , ..., 0.01030045, 0.        ,
        0.        ],
       ...,
       [0.        , 0.05141163, 0.1479218 , ..., 0.07620213, 0.18350054,
        0.04607565],
       [0.00256453, 0.03821808, 0.00554205, ..., 0.        , 0.        ,
        0.        ],
       [0.09396131, 0.        , 0.        , ..., 0.        , 0.04846848,
        0.03851096]])

In [None]:
similarity_matrix[0,:].argsort()[::-1]

array([663, 661, 484, ..., 818, 817,   0], dtype=int64)

In [None]:
with open('output.txt', encoding='utf8', mode='w') as file:
    for query_idx in range(similarity_matrix.shape[0]):
        for document_idx in similarity_matrix[query_idx,:].argsort()[::-1]:
            score = similarity_matrix[query_idx, document_idx]
            if score > 0:
                file.write(f'{query_idx+1} {document_idx+1} {score}\n')

In [None]:
vocab2index

{'abbreviated': 0,
 'ability': 1,
 'ablated': 2,
 'ablating': 3,
 'ablation': 4,
 'ablative': 5,
 'able': 6,
 'abrupt': 7,
 'abruptly': 8,
 'absence': 9,
 'absent': 10,
 'absolute': 11,
 'absorbed': 12,
 'absorbing': 13,
 'absorption': 14,
 'abstract': 15,
 'abundantly': 16,
 'academic': 17,
 'accelerated': 18,
 'accelerates': 19,
 'accelerating': 20,
 'acceleration': 21,
 'accelerator': 22,
 'accelerometer': 23,
 'accentuated': 24,
 'acceptability': 25,
 'acceptable': 26,
 'acceptably': 27,
 'accepted': 28,
 'accessible': 29,
 'accidental': 30,
 'accommodate': 31,
 'accommodated': 32,
 'accommodation': 33,
 'accompanied': 34,
 'accompanies': 35,
 'accompany': 36,
 'accompanying': 37,
 'accomplish': 38,
 'accomplished': 39,
 'accord': 40,
 'accordance': 41,
 'according': 42,
 'accordingly': 43,
 'account': 44,
 'accountable': 45,
 'accounted': 46,
 'accounting': 47,
 'accrue': 48,
 'accumulated': 49,
 'accumulation': 50,
 'accuracy': 51,
 'accurate': 52,
 'accurately': 53,
 'acetate': 

In [None]:
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('ablated', pos='v')

'ablate'