# Introduction to text indexing techniques
See the extracted dataset sample [here](https://unimi2013.sharepoint.com/:u:/s/InformationRetrieval/EaL7kid2qzdCmAA8RO-m5iQBsvCl5cuNIdn0rsJN1FUhSg?e=fdXkkB)

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import os
import nltk

In [3]:
from collections import defaultdict, Counter

In [4]:
# Load every plain-text recipe found in `folder`, one string per file.
folder = "/Users/flint/Data/recipe/text-sample/"
# os.listdir() returns names in arbitrary order; sorting makes the position of
# each recipe (used as its document id below) reproducible across runs.
files = sorted(f for f in os.listdir(folder) if f.endswith('.txt'))
recipes = []
for file in files:
    # Explicit encoding: the default depends on the platform locale, so the
    # same files could decode differently on Windows and macOS/Linux.
    # NOTE(review): assumes the sample files are UTF-8 -- confirm.
    with open(os.path.join(folder, file), 'r', encoding='utf-8') as data:
        recipes.append(data.read())

FileNotFoundError: [WinError 3] Impossibile trovare il percorso specificato: '/Users/flint/Data/recipe/text-sample/'

In [None]:
from string import punctuation

In [None]:
print(recipes[0])

In [None]:
def nltk_tokenize(text):
    """Tokenize *text* with NLTK and return lower-cased, non-punctuation tokens.

    A token is discarded when every one of its characters is ASCII
    punctuation. Testing ``token not in punctuation`` would be a substring
    check against the punctuation string, which lets multi-character
    punctuation tokens (such as the `` and '' quote tokens or "...") through.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list of lower-cased token strings.
    """
    return [
        token.lower()
        for token in nltk.word_tokenize(text)
        if not all(char in punctuation for char in token)
    ]

In [None]:
len(recipes)

In [None]:
s = 'This is a sentence for a test'
nltk_tokenize(s)

## Bag of words and Inverted Index
The occurrences of words in a corpus can be represented by the *document-term* matrix, a matrix $T^{docs \times vocabulary}$. However, such a matrix is sparse and has a huge dimensionality. We can address this issue by using supporting data structures.

**BOW**
```
doc_id: {w: frequency, ...}
```
**Inverted Index**
```
w: [doc_id, ...]
```
**Rich inverted Index**
```
w: [(doc_id, position, freq, ...), ...]
```

In [None]:
# Bag of words: doc_id -> {term: count}; a term not seen yet counts as 0.
BOW = defaultdict(lambda: defaultdict(int))
# Inverted index: term -> set of the doc_ids in which the term occurs.
I = defaultdict(set)

In [None]:
# Index the corpus: fill the bag of words and the inverted index in one pass.
# Each recipe's position in `recipes` is used as its document id.
corpus = list(enumerate(recipes))
for i, document in tqdm(corpus):
    # Touch BOW[i] up front so that a document producing no tokens still gets
    # an (empty) entry. Otherwise it would be absent from BOW and every
    # structure built from it would have fewer entries than the corpus,
    # so positions would stop matching document ids.
    doc_counts = BOW[i]
    for token in nltk_tokenize(document.lower()):
        doc_counts[token] += 1
        I[token].add(i)

In [None]:
list(BOW[0].items())[:10]

In [None]:
list(I['couscous'])[:10]

In [None]:
len(I['a'])

### To document-term Matrix

In [None]:
# Document-term matrix: pd.DataFrame(BOW) has one column per document, so
# transpose to get one row per document and one column per term, then
# replace the gaps (term absent from the document) with 0.
T = pd.DataFrame(BOW).T.fillna(0)

In [None]:
T.head()

In [None]:
T.shape

## Naive IR

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Encode the query as a term-count vector over the columns of T.
q = 'pasta with broccoli and cheese and parmesan'
V = list(T.columns)
# Map each vocabulary word to its column once: a dict lookup per token
# replaces a linear V.index() scan plus a raised ValueError for every
# out-of-vocabulary word.
column_of = {word: position for position, word in enumerate(V)}
qv = np.zeros(len(V))
for token in nltk_tokenize(q):
    word_index = column_of.get(token)
    if word_index is not None:  # words not in the vocabulary are ignored
        qv[word_index] += 1

In [None]:
Sigma = cosine_similarity(qv.reshape(1, -1), T)

In [None]:
# Rank (row_position, score) pairs by decreasing similarity; ties keep
# their original relative order.
answers = sorted(enumerate(Sigma[0]), key=lambda pair: pair[1], reverse=True)

In [None]:
print(corpus[answers[0][0]][1])

## Word relevance
Take the top 5 relevant words for document 0. What problems do you see in there?

In [None]:
# For documents 0 and 1: the five most frequent terms with their raw counts,
# followed by the total number of tokens in the document.
for doc_id in (0, 1):
    term_counts = BOW[doc_id]
    most_frequent = sorted(term_counts.items(), key=lambda pair: pair[1], reverse=True)[:5]
    print(most_frequent, sum(term_counts.values()))

### Term Frequency (TF)

In [None]:
def tf(term, bow):
    """Return the augmented term frequency of *term* within one document.

    Computes ``0.5 + 0.5 * f / f_max`` where ``f`` is the count of *term* in
    *bow* and ``f_max`` is the largest count in *bow*.

    Args:
        term: the word to score.
        bow: mapping ``word -> count`` for a single document.

    Returns:
        The augmented frequency as a float; 0.5 when *term* is absent, or
        when *bow* is empty or holds only zero counts.
    """
    # .get() instead of bow[term]: indexing a defaultdict would insert the
    # missing term and silently grow the document's bag of words.
    count = bow.get(term, 0)
    # default=0 covers the empty document, where max() would raise ValueError.
    peak = max(bow.values(), default=0)
    if peak == 0:
        return 0.5
    return 0.5 + (0.5 * count / peak)

In [None]:
# (word, term frequency) pairs for every word occurring in document 0.
tfBOW0 = [(word, tf(word, BOW[0])) for word in BOW[0]]

In [None]:
# For documents 0 and 1: the five words with the highest term frequency.
for doc_id in (0, 1):
    tf_scores = [(word, tf(word, BOW[doc_id])) for word in BOW[doc_id]]
    print(sorted(tf_scores, key=lambda pair: pair[1], reverse=True)[:5])

### Inverse Document Frequency (IDF)

In [None]:
def idf(word, *, min_df=10, index=None, n_docs=None):
    """Return the inverse document frequency ``log(N / df)`` of *word*.

    ``N`` is the number of documents and ``df`` the number of documents that
    contain *word*. Words occurring in fewer than *min_df* documents get 0.

    Args:
        word: the term to score.
        min_df: minimum document frequency for a word to get a non-zero idf.
        index: inverted index ``word -> collection of doc_ids``; defaults to
            the notebook-level ``I``.
        n_docs: number of documents ``N``; defaults to ``len(corpus)``.

    Returns:
        ``np.log(n_docs / df)``, or 0.0 when the word is unknown or rarer
        than *min_df*.
    """
    if index is None:
        index = I
    if n_docs is None:
        # N must be the number of documents; len(I) is the number of distinct
        # terms in the inverted index, which is a different quantity.
        n_docs = len(corpus)
    # .get() avoids inserting unknown words into a defaultdict index.
    df = len(index.get(word, ()))
    if df == 0 or df < min_df:
        return 0.0
    return np.log(n_docs / df)

In [None]:
# idf of every vocabulary word, as a Series indexed by word.
idf_list = pd.Series({word: idf(word) for word in V})

In [None]:
idf_list.sort_values(ascending=True).iloc[:10]

### TfIdf

In [None]:
def tfidf(word, bow):
    """Return the TF-IDF weight of *word* for the document described by *bow*.

    The weight is the product of ``tf(word, bow)`` and ``idf(word)``.
    """
    term_frequency = tf(word, bow)
    inverse_document_frequency = idf(word)
    return term_frequency * inverse_document_frequency

In [None]:
# For documents 0 and 1: the five words with the highest TF-IDF weight.
for doc_id in (0, 1):
    tfidf_scores = [(word, tfidf(word, BOW[doc_id])) for word in BOW[doc_id]]
    print(sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)[:5])

### TfIdf by scikit-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build the TF-IDF matrix with scikit-learn, reusing the notebook's tokenizer.
# token_pattern is ignored whenever a custom tokenizer is supplied; setting it
# to None says so explicitly and avoids scikit-learn's "token_pattern will
# not be used" warning.
vectorizer = TfidfVectorizer(tokenizer=nltk_tokenize, token_pattern=None)
X = vectorizer.fit_transform(recipes)

In [None]:
X.shape

In [None]:
type(X)

In [None]:
vocabulary = list(vectorizer.get_feature_names_out())

In [None]:
vocabulary.index('broccoli')

In [None]:
# TF-IDF weight of 'broccoli' in document 0. The column is looked up by word
# rather than hard-coded, because column numbers change whenever the corpus
# or the tokenizer changes.
X[0, vocabulary.index('broccoli')]

In [None]:
vectorizer.inverse_transform(X)[0]

In [None]:
X[0,vocabulary.index('cup')]

In [None]:
# For documents 0 and 1: the five (column, weight) pairs with the largest
# TF-IDF weight.
for doc_id in (0, 1):
    row_weights = X[doc_id].toarray()[0]
    print(sorted(enumerate(row_weights), key=lambda pair: pair[1], reverse=True)[:5])

In [None]:
# Same ranking as by column index, with each column translated back to its
# vocabulary word.
for doc_id in (0, 1):
    ranked_columns = sorted(
        enumerate(X[doc_id].toarray()[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print([(vocabulary[column], weight) for column, weight in ranked_columns[:5]])

## Vector Space Visualization

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=2)

In [None]:
Xa = pca.fit_transform(X.toarray())

In [None]:
# Scatter all documents in the 2-D PCA space (grey) and highlight
# document 0 (red) and document 1 (blue).
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(Xa[:, 0], Xa[:, 1], alpha=0.4, c='#cccccc')
for row, colour in ((0, '#cc0000'), (1, '#0000cc')):
    ax.scatter(Xa[row, 0], Xa[row, 1], alpha=0.9, c=colour, s=100)
plt.tight_layout()
plt.show()

## Query

In [None]:
query = 'teriyaki rice'
q = vectorizer.transform([query]).toarray()

In [None]:
qa = pca.transform(q)

In [None]:
qa

In [None]:
# Same scatter as before (all documents grey, document 0 red, document 1
# blue), plus the projected query drawn as a dark-green square.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(Xa[:, 0], Xa[:, 1], alpha=0.4, c='#cccccc')
for row, colour in ((0, '#cc0000'), (1, '#0000cc')):
    ax.scatter(Xa[row, 0], Xa[row, 1], alpha=0.9, c=colour, s=100)
ax.scatter(qa[0, 0], qa[0, 1], alpha=0.9, c='#003300', s=100, marker='s')
plt.tight_layout()
plt.show()

In [None]:
query = 'pasta with broccoli and cheese and parmesan'
q = vectorizer.transform([query])

In [None]:
Sigma = cosine_similarity(q, X)

In [None]:
Sigma.shape

In [None]:
# Rank (doc_id, score) pairs by decreasing cosine similarity to the query;
# ties keep their original relative order.
answers = sorted(enumerate(Sigma[0]), key=lambda pair: pair[1], reverse=True)

In [None]:
answers[:5]

In [None]:
# Print every recipe in ranking order, each followed by a divider line.
divider = "==============="
for doc_id, score in answers:
    print(recipes[doc_id], divider, sep="\n")