# Chapter 1

In [None]:
# This dataset includes the top cited papers in machine learning.
# title|summary|authors|year
# You can read more on https://www.doradolist.com/papers/21-most-cited-machine-learning-papers
# Created by A.Nazari for NLP tasks

### Boolean Retrieval

### Inverted Index

### AND (intersect), OR (union), AND NOT (difference)

In [2]:
# Open file and create vocab
vocab = []
docs = []
filename = "dataSet.txt"

with open(filename, encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        # Word tokenization
        title = line.split("|")[0].lower()
        docs.append(title)
        for word in title.split():
            if word not in vocab:
                vocab.append(word)

print(len(vocab), "\n", vocab, "\n", len(docs), "\n", docs)

85 
 ['deep', 'residual', 'learning', 'for', 'image', 'recognition', 'adam:', 'a', 'method', 'stochastic', 'optimization', 'imagenet', 'classification', 'with', 'convolutional', 'neural', 'networks', 'random', 'forests', 'very', 'large-scale', 'scikit-learn:', 'machine', 'in', 'python', 'support-vector', 'generative', 'adversarial', 'nets', 'faster', 'r-cnn:', 'towards', 'real-time', 'object', 'detection', 'region', 'proposal', 'libsvm:', 'library', 'support', 'vector', 'machines', 'gradient-based', 'applied', 'to', 'document', 'imagenet:', 'hierarchical', 'database', 'going', 'deeper', 'convolutions', 'latent', 'dirichlet', 'allocation', 'batch', 'normalization:', 'accelerating', 'network', 'training', 'by', 'reducing', 'internal', 'covariate', 'shift', 'tensorflow:', 'system', 'dropout:', 'simple', 'way', 'prevent', 'from', 'overfitting', 'large', 'scale', 'visual', 'challenge', 'mapreduce:', 'simplified', 'data', 'processing', 'on', 'clusters', 'bagging', 'predictors'] 
 21 
 ['deep

In [2]:
# Term-document matrix
import numpy as np
V = len(vocab)
D = len(docs)
matrix = np.zeros((V, D))

for term_id, term in enumerate(vocab):
    for doc_id, doc in enumerate(docs):
        if term in doc:
            matrix[term_id, doc_id] = 1

print(matrix)

[[1. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [None]:
query = "Deep learning"

# Query representation
query_terms = query.lower().split()
query_representation = [matrix[vocab.index(i), :] for i in query_terms]

operator = "and"  # and/or
if operator == "and":
    relevant_docs_idx = np.ones(D)
else:
    relevant_docs_idx = np.zeros(D)

for rep in query_representation:
    if operator == "and":
        relevant_docs_idx = np.logical_and(relevant_docs_idx, rep)
    else:
        relevant_docs_idx = np.logical_or(relevant_docs_idx, rep)
        
print(np.where(relevant_docs_idx == True)[0])
relevant_docs = [docs[i] for i in np.where(relevant_docs_idx == True)[0]]
for doc in relevant_docs:
    print(doc)

[0 6]
deep residual learning for image recognition
deep learning


In [4]:
# Inverted index
inverted_index = {}

for term_id, term in enumerate(vocab):
    for doc_id, doc in enumerate(docs):
        if term in doc:
            if term in inverted_index.keys():
                inverted_index[term] = inverted_index[term] + [doc_id]
            else:
                inverted_index[term] = [doc_id]

print(inverted_index)

{'deep': [0, 2, 4, 6, 13, 15], 'residual': [0], 'learning': [0, 5, 6, 11, 16], 'for': [0, 1, 3, 4, 10, 16], 'image': [0, 2, 4, 12, 18], 'recognition': [0, 4, 11, 18], 'adam:': [1], 'a': [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20], 'method': [1], 'stochastic': [1], 'optimization': [1], 'imagenet': [2, 12, 18], 'classification': [2], 'with': [2, 9, 13], 'convolutional': [2, 4], 'neural': [2, 17], 'networks': [2, 4, 7, 9, 17], 'random': [3], 'forests': [3], 'very': [4], 'large-scale': [4, 12, 16], 'scikit-learn:': [5], 'machine': [5, 10, 16], 'in': [0, 5, 6, 10, 11, 13, 15, 16, 17, 19, 20], 'python': [5], 'support-vector': [7], 'generative': [8], 'adversarial': [8], 'nets': [8], 'faster': [9], 'r-cnn:': [9], 'towards': [9], 'real-time': [9], 'object': [9], 'detection': [9], 'region': [9], 'proposal': [9], 'libsvm:': [10], 'library': [10], 'support': [7, 10], 'vector': [7, 10], 'machines': [10], 'gradient-based': [11], 'applied': [11], 'to': [1, 7, 9, 10, 11, 17, 20

In [5]:
# Intersect
def intersect(p, q):
    intersect_list = []
    i = j = 0
    while i < len(p) and j < len(q):
        if p[i] == q[j]:
            intersect_list.append(p[i])
            i += 1
            j += 1
        elif p[i] < q[j]:
            i += 1
        else:
            j += 1
    return intersect_list

intersect([1, 2, 3, 4, 5], [1, 3, 5, 7, 9])

[1, 3, 5]

In [6]:
query = "Deep learning image"

# Query preprocessing
query_terms = query.lower().split()

relevant_docs_idx = inverted_index[query_terms[0]]
for idx in range(1, len(query_terms)):
    relevant_docs_idx = intersect(relevant_docs_idx, inverted_index[query_terms[idx]])

print(relevant_docs_idx)
relevant_docs = [docs[i] for i in relevant_docs_idx]
for doc in relevant_docs:
    print(doc)

[0]
deep residual learning for image recognition


In [7]:
# Union
def union(p, q):
    i = j = 0
    union_list = []
    while i < len(p) and j < len(q):
        if p[i] < q[j]:
            union_list.append(p[i])
            i += 1
        elif p[i] > q[j]:
            union_list.append(q[j])
            j += 1
        else:
            union_list.append(p[i])
            i += 1
            j += 1
    while i < len(p):
        union_list.append(p[i])
        i += 1
    while j < len(q):
        union_list.append(q[j])
        j += 1
    return union_list

union([1, 2, 3, 4, 5], [1, 3, 5, 7, 9])

[1, 2, 3, 4, 5, 7, 9]

In [8]:
# Difference
def difference(p, q):
    i, j = 0, 0
    difference_list = []
    while i < len(p) and j < len(q):
        if p[i] < q[j]:
            difference_list.append(p[i])
            i += 1
        elif p[i] > q[j]:
            j += 1
        else:
            i += 1
            j += 1
    while i < len(p):
        difference_list.append(p[i])
        i += 1
    return difference_list

difference([1, 2, 3, 4, 5], [1, 3, 5, 7, 9])

[2, 4]

## Homework

### Homework 1: Optimal Boolean Retrieval

In [43]:
# Homework 1: Optimal Boolean Retrieval

def optimal_intersect(query_terms, inverted_index):
    sorted_terms = sorted(query_terms, key=lambda x: len(inverted_index.get(x, [])))
    result = []
    for term in sorted_terms:
        postings = inverted_index.get(term, [])
        if not postings:
            return []
        if not result:
            result = postings.copy()
        else:
            result = intersect(result, postings)
            if not result:
                break  
    return result

def optimal_union(query_terms, inverted_index):
    sorted_terms = sorted(query_terms, key=lambda x: len(inverted_index.get(x, [])))
    result = []
    for term in sorted_terms:
        postings = inverted_index.get(term, [])
        if not result:
            result = postings.copy()
        else:
            result = union(result, postings)
    return result

def optimal_difference(term1, term2, inverted_index):
    list1 = inverted_index.get(term1, [])
    list2 = inverted_index.get(term2, [])
    return difference(list1, list2)

def optimal_not(term, inverted_index, total_docs):
    return list(set(range(total_docs)) - set(inverted_index.get(term, [])))

# Example usage
query_terms = ["deep", "learning", "image"]
print([f"{term}: {inverted_index.get(term, [])}" for term in query_terms])

print("\nOptimal Union Result:", optimal_union(query_terms, inverted_index))
print("Optimal Intersect Result:", optimal_intersect(query_terms, inverted_index))
print("Optimal Difference Result:", optimal_difference("deep", "learning", inverted_index))

['deep: [0, 2, 4, 6, 13, 15]', 'learning: [0, 5, 6, 11, 16]', 'image: [0, 2, 4, 12, 18]']

Optimal Union Result: [0, 2, 4, 5, 6, 11, 12, 13, 15, 16, 18]
Optimal Intersect Result: [0]
Optimal Difference Result: [2, 4, 13, 15]


In [56]:
# Homework 2: Query Processing

query = inverted_index

# 1. Deep AND (NOT Learning)
not_learning_docs = optimal_not("learning", inverted_index, D)
query["NOT_learning"] = not_learning_docs 
deep_and_not_learning = optimal_intersect(["deep", "NOT_learning"], query)
query["deep_and_not_learning"] = deep_and_not_learning 

print("Relevant Docs for 'Deep AND (NOT Learning)':")
for doc_id in query["deep_and_not_learning"]:
    print(docs[doc_id])

# 2. Deep OR Learning
deep_or_learning = optimal_union(["deep", "learning"], query) 
query["deep_or_learning"] = deep_or_learning

print("\nRelevant Docs for 'Deep OR Learning':")
for doc_id in query["deep_or_learning"]:
    print(docs[doc_id])

# 3. (Deep AND (NOT Learning)) OR image
deep_and_not_learning_or_image = optimal_union(["deep_and_not_learning", "image"], query)

print("\nRelevant Docs for '(Deep AND (NOT Learning)) OR image':")
for doc_id in deep_and_not_learning_or_image:
    print(docs[doc_id])

Relevant Docs for 'Deep AND (NOT Learning)':
imagenet classification with deep convolutional neural networks
very deep convolutional networks for large-scale image recognition
going deeper with convolutions
batch normalization: accelerating deep network training by reducing internal covariate shift

Relevant Docs for 'Deep OR Learning':
deep residual learning for image recognition
imagenet classification with deep convolutional neural networks
very deep convolutional networks for large-scale image recognition
scikit-learn: machine learning in python
deep learning
gradient-based learning applied to document recognition
going deeper with convolutions
batch normalization: accelerating deep network training by reducing internal covariate shift
tensorflow: a system for large-scale machine learning

Relevant Docs for '(Deep AND (NOT Learning)) OR image':
deep residual learning for image recognition
imagenet classification with deep convolutional neural networks
very deep convolutional networ

In [60]:
# Homework 3: Boolean Retrieval Example

docs_hw3 = [
    "cat sat on the mat",
    "the dog chased the cat",
    "the mat was on the floor"
]

inverted_index_hw3 = {}
for doc_id, doc in enumerate(docs_hw3):
    for term in doc.split():
        if term not in inverted_index_hw3:
            inverted_index_hw3[term] = []
        inverted_index_hw3[term].append(doc_id)

# Query: cat AND (mat OR floor)
query = {}

query["mat"] = inverted_index_hw3["mat"]
query["floor"] = inverted_index_hw3["floor"]
query["cat"] = inverted_index_hw3["cat"]

mat_or_floor = optimal_union(["mat", "floor"], query)
query["mat_or_floor"] = mat_or_floor
cat_and_mat_or_floor = optimal_intersect(["cat", "mat_or_floor"], query)

print("Relevant Docs for 'cat AND (mat OR floor)':")
for doc_id in cat_and_mat_or_floor:
    print(docs_hw3[doc_id])

Relevant Docs for 'cat AND (mat OR floor)':
cat sat on the mat
