In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [114]:
import nltk
import csv
import numpy as np
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import os
import re
from functools import reduce
from tqdm import tqdm
import pandas as pd
from collections import Counter
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import heapq

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Search Engine

In [115]:
col_names = ['placeName', 'placeTags', 'placePeopleVisited',
       'placePeopleWant', 'placeDesc', 'placeShortDesc', 'placeNearby',
       'placeAddress', 'placeAlt', 'placeLong', 'placeEditors',
       'placePubDate', 'placeRelatedLists', 'placeRelatedPlaces',
       'placeURL']
df = pd.read_csv('/content/drive/MyDrive/ADM/Homework3/merged.tsv', sep='\t', names = col_names)
df = df.drop(0)

We decided to remove numbers because they don't carry any information. However, looking at the website, we see that many places are linked to a specific historical period (e.g. World War 2), so it might be a criterion a user takes into account when searching for a query. Due to this reason we leaved the dates, e.g. numbers of four digits that starts with a 1 or a 2. Then each year is converted in the corresponding century (1634 becomes 1600) since it is more likely to serch for a generic century rather than a specific date. 

In [116]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def text_preprocessing(str):
  # remove punctuation and tokenize only words and numbers
  tokens = RegexpTokenizer(r'\w+')
  tokens = list(tokens.tokenize(str))
  # execute stemming
  tokens = [ps.stem(word.lower()) for word in tokens if not word.lower() in stop_words]
  # remove numbers and leave dates
  tokens = [el for el in tokens if not re.match(r'^\d{1,3}$|^[^12]{1}\d{3}$|^\d{5,}$', el)]
  # process dates
  tokens = list(map(lambda x: x[:2] + '00' if re.match(r'^\d+$', x) else x, tokens))
  # remove mixted string (letters and numbers)
  tokens = [el for el in tokens if re.match(r'^\d+$|^[a-z]+$', el)]
  return tokens

## 2.1. Conjunctive query

### 2.1.1) Create your index!

In [117]:
# CREATE VOCABULARY

# apply the preprocessing to the place description
df.placeDesc = df.placeDesc.apply(text_preprocessing)
# compute vocabulary
words_count = Counter(df.placeDesc.sum())
total_words = sorted(list(set(words_count)))
vocabulary = dict(zip(total_words, list(range(1, len(total_words) + 1))))

In [118]:
# SAVE VOCABULARY

# with open('/content/drive/MyDrive/ADM/Homework3/vocabulary.pkl', 'wb') as f:
#   pickle.dump(vocabulary, f)

In [32]:
# LOAD VOCABULARY

# with open('/content/drive/MyDrive/ADM/Homework3/vocabulary.pkl', 'rb') as f:
#     vocabulary = pickle.load(f)

In [119]:
# COMPUTE INVERTED INDEX

inverted_index = {}
for word, index in tqdm(vocabulary.items()):
  inverted_index[index] = list(df[df.placeDesc.apply(lambda x: word in x)].index)  

100%|██████████| 37562/37562 [18:45<00:00, 33.38it/s]


In [120]:
# SAVE INVERTED INDEX


with open('/content/drive/MyDrive/ADM/Homework3/inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_index, f)

In [None]:
# LOAD INVERTED INDEX

# with open('/content/drive/MyDrive/ADM/Homework3/inverted_index.pkl', 'rb') as f:
#     inverted_index = pickle.load(f)

### 2.1.2) Execute the query

In [121]:
# store inverted index into dataframe
inverted_index_df = pd.DataFrame(inverted_index.items(), columns = ['term_index', 'document'])

What we do is preprocess the query with the same function we used for the documents and then computing the intersection of all the posting lists corresponding to the tokens of the query.

In [122]:
query = 'american museum'
query = text_preprocessing(query)
query = [vocabulary[q] for q in query]
a, b = inverted_index_df[inverted_index_df.term_index.isin(query)].document.values
results = list(set(a) & set(b)) # document ids of the results

In [125]:
#display results
display = df.loc[results][['placeName', 'placeDesc', 'placeURL']]
display.head(10)

Unnamed: 0,placeName,placeDesc,placeURL
6656,Evel Knievel Museum,"[evel, knievel, museum, take, thrill, spill, s...",https://www.atlasobscura.com/places/evel-knie...
4097,McNutt Sculpture Garden,"[hidden, sculptur, garden, exist, san, antonio...",https://www.atlasobscura.com/places/mcnutt-sc...
2561,The Wolfsonian-FIU,"[wolfsonian, florida, intern, univers, wolfson...",https://www.atlasobscura.com/places/wolfsonia...
6657,Old Time Wooden Nickel Company,"[adag, goe, take, wooden, nickel, one, fit, po...",https://www.atlasobscura.com/places/old-time-...
1538,National Museum of Health and Medicine,"[hous, downtown, washington, c, nation, museum...",https://www.atlasobscura.com/places/national-...
2054,Soumaya Museum,"[hous, whop, piec, predominantli, central, ame...",https://www.atlasobscura.com/places/soumaya-m...
4103,World's Largest Shuttlecocks,"[across, expans, ground, nelson, atkin, museum...",https://www.atlasobscura.com/places/world-s-l...
5126,"Harriet Beecher Stowe, Slavery to Freedom Museum","[earli, brick, georgian, townhous, sit, incons...",https://www.atlasobscura.com/places/harriet-b...
1543,Rock Art Ranch,"[rock, art, ranch, near, winslow, arizona, pri...",https://www.atlasobscura.com/places/rock-art-...
3086,The Athenian Agora,"[lie, right, beneath, northern, slope, acropol...",https://www.atlasobscura.com/places/the-athen...


## 2.2) Conjunctive query & Ranking score


### 2.2.1) Inverted index

In [126]:
#reload dataset
col_names = ['placeName', 'placeTags', 'placePeopleVisited',
       'placePeopleWant', 'placeDesc', 'placeShortDesc', 'placeNearby',
       'placeAddress', 'placeAlt', 'placeLong', 'placeEditors',
       'placePubDate', 'placeRelatedLists', 'placeRelatedPlaces',
       'placeURL']
df = pd.read_csv('/content/drive/MyDrive/ADM/Homework3/merged.tsv', sep='\t', names = col_names)
df = df.drop(0)

In [127]:
k = 10

We compute the df-idf using scikit-learn library.

In [128]:
model = TfidfVectorizer(input='content', lowercase=False, tokenizer=text_preprocessing)
tf_idf = model.fit_transform(df.placeDesc)

2.2.2) Execute the query

As before we give to the query the same treatment we gave to the documents, in this case computing the embedding, then we find all the documents that contains all the tokens of the query, then we compute the cosine similarity with respect to the query and, using a heap for efficiency, we find the top k documents. 

In [129]:
query = 'american museum'
query_vec = model.transform([query])
# find all documents that contains all queries
documents_with_all_query = reduce(np.intersect1d, tuple([np.nonzero(tf_idf[:, vocabulary[q] - 1].toarray())[0] for q in text_preprocessing(query)]))
# find similarities between query and all the documents
similarities = cosine_similarity(tf_idf, query_vec)
# case 1: number of results less than k
if len(documents_with_all_query) < k:
  similar_pages = enumerate(-similarities)
  similar_pages = [el[::-1] for el in similar_pages]
  heapq.heapify(similar_pages)
  k = len(np.nonzero(similarities))
else:
# case 2: otherwise 
  similar_pages = [(-similarities[b], b) for b in documents_with_all_query]  
  heapq.heapify(similar_pages)

In [130]:
# get top k documents
top_k_elements = []
for i in range(k):
  top_k_elements.append(heapq.heappop(similar_pages))

In [131]:
#display results
display = df.loc[[el[1] + 1 for el in top_k_elements]][['placeName', 'placeDesc', 'placeURL']]
display['scores'] = [-el[0][0] for el in top_k_elements]
display.head(k)

Unnamed: 0,placeName,placeDesc,placeURL,scores
141,Museum of the Weird,The dime or dime store museum is by all accou...,https://www.atlasobscura.com/places/museum-we...,0.298123
6204,Sweet Home Cafe,Thomas Downing was the oyster king. In 19th-c...,https://www.atlasobscura.com/places/sweet-hom...,0.292803
1199,Harvard Museum of Natural History,Collecting three different institutions into ...,https://www.atlasobscura.com/places/harvard-m...,0.283666
6220,Siriraj Medical Museum,The Siriraj Medical Museum abounds with medic...,https://www.atlasobscura.com/places/siriraj-m...,0.273827
4463,American Writers Museum,The American Writers Museum—tucked away on th...,https://www.atlasobscura.com/places/american-...,0.269401
2669,Milwaukee Art Museum,Like the Guggenheim in New York and the Oakla...,https://www.atlasobscura.com/places/milwaukee...,0.257663
6310,Self-Taught Genius Gallery,"In 2017, the American Folk Art Museum in Manh...",https://www.atlasobscura.com/places/self-taug...,0.250283
1904,National World War II Museum,"Perhaps once thought too narrowly focused, t...",https://www.atlasobscura.com/places/national-...,0.241701
2109,Museum of Psychphonics,The Museum of Psychphonics is a modern-day wu...,https://www.atlasobscura.com/places/museum-of...,0.231118
3441,Geppi's Entertainment Museum,It’s a unique place that can create a sentime...,https://www.atlasobscura.com/places/geppi-s-e...,0.228142
