In [167]:
import scipy.sparse.linalg
from datasets import load_dataset
import pickle
from nltk.stem import PorterStemmer
import string
import enchant
from scipy import sparse
from math import log
from sklearn.preprocessing import normalize
from sklearn.utils.extmath import randomized_svd

In [2]:
from nltk.corpus import stopwords
english_dict = enchant.Dict("en_US")
stopwords = stopwords.words("english")

### Download all english wikipedia articles

In [2]:
dataset = load_dataset("wikipedia", "20220301.en", beam_runner="DirectRunner")

Found cached dataset wikipedia (C:/Users/k2002/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

### Download english stopwords

In [22]:
import nltk
nltk.download('stopwords')
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\k2002\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\k2002\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
dataset = dataset["train"]

### Keep articles containing one of those words: ["fruit", "fruits", "vegetable", "vegetables", "meat"]

In [4]:
new_dataset = []
my_words = ["fruit", "fruits", "vegetable", "vegetables", "meat"]
for site in dataset:
    tmp_text = site["text"].split(" ")
    for word in my_words:
        if word in tmp_text:
            new_dataset.append(site)
            break

In [5]:
print(len(new_dataset))

77605


In [6]:
dataset = new_dataset

### Keep articles containing one of those words: ["eat", "food"]

In [13]:
new_dataset = []
for site in dataset:
    tmp_text = site["text"].split(" ")
    if "eat" in tmp_text or "food" in tmp_text:
        new_dataset.append(site)

In [None]:
print(f"Number of elements in my database {len(new_dataset)}")

NameError: name 'new_dataset' is not defined

In [15]:
dataset = new_dataset

### Save database to file

In [25]:
with open('dataset/my_dataset.bin', 'wb') as file:
    pickle.dump(dataset, file)

In [3]:
with open('dataset/my_dataset.bin', 'rb') as file:
    dataset = pickle.load(file)

### Reset id of every article

In [4]:
for i in range(len(dataset)):
    dataset[i]["id"] = i

### Return list of simplified words for given text (article)

In [5]:
def simplify_string(text: str) -> list:
    def my_filter(x):
        if len(x) < 2 or any(char.isdigit() for char in x) or not english_dict.check(x):
            return False
        return True

    ps = PorterStemmer()
    text = text.lower().translate(str.maketrans("\n", " ", string.punctuation)).split(" ")
    text = [word for word in text if word not in stopwords]
    text = list(filter(my_filter, text))
    text = list(map(ps.stem, text))
    return text

### Creating bag of words for every article (and save progress to file every 100 articles)

In [127]:
all_words = set()
words_per_article = list()  #[id] -> dict of words
word_occurance = dict()   # word -> in_how_many_articles

In [None]:
for i in range(len(dataset)):
    text = simplify_string(dataset[i]["text"])
    word_counter = dict() # TODO better deafult_dict liblary
    for word in text:
        if word not in word_counter:
            word_counter[word] = 1
        else:
            word_counter[word] += 1
    for word in word_counter:
        all_words.add(word)
        if word not in word_occurance:
            word_occurance[word] = 1
        else:
            word_occurance[word] += 1
    words_per_article.append(word_counter)
    if i % 100 == 0:
        print(f"SAVING for i = {i}")
        with open('dataset/all_words.bin', 'wb') as file:
            pickle.dump(all_words, file)
        with open('dataset/words_per_article.bin', 'wb') as file:
            pickle.dump(words_per_article, file)
        with open('dataset/word_occurance.bin', 'wb') as file:
            pickle.dump(word_occurance, file)
    print(f"{i} / { len(dataset)}")

In [136]:
xd = False
if xd: # anty misslick
    with open('dataset/all_words.bin', 'wb') as file:
        pickle.dump(all_words, file)
    with open('dataset/words_per_article.bin', 'wb') as file:
        pickle.dump(words_per_article, file)
    with open('dataset/word_occurance.bin', 'wb') as file:
        pickle.dump(word_occurance, file)

In [7]:
with open('dataset/all_words.bin', 'rb') as file:
    all_words = pickle.load(file)
with open('dataset/words_per_article.bin', 'rb') as file:
    words_per_article = pickle.load(file)  #[id] -> dict of words
with open('dataset/word_occurance.bin', 'rb') as file:
    word_occurance = pickle.load(file)     # word -> in_how_many_articles

### Sorting list of all words from every article

In [8]:
all_words = sorted(list(all_words))
all_words_indexes = dict()
for i, word in enumerate(all_words):
    all_words_indexes[word] = i

In [10]:
N = len(dataset)
M = len(all_words)
print(f"I have {N} articles and {M} words")

I have 21052 articles and 41688 words


### Helpful functions

In [None]:
def save_sparse_mat_file(sparse_matrix, filename):
    sparse.save_npz(f"dataset/{filename}", sparse_matrix)

def create_sparse_matrix_for_all_articles(use_idf = False):
    row = []
    col = []
    word_counter = []
    for i in range(N):
        dict_of_words = words_per_article[i]
        for word in dict_of_words:
            col.append(i)
            row.append(all_words_indexes[word])
            if use_idf:  # TODO make this if nicer (and not repeted 1kkkkkk times)
                word_counter.append(dict_of_words[word] * log(N / word_occurance[word]))
            else:
                word_counter.append(dict_of_words[word])
        # print(f"{i} / {N}")
    return sparse.csr_matrix((word_counter, (row, col)), shape=(M, N))


def create_single_vector(text: str):  # already transposition
    text = simplify_string(text)
    tmp = []
    for word in text:
        if word in all_words:
            tmp.append(word)
    text = tmp

    word_counter = dict()
    for word in text:
        if word not in word_counter:
            word_counter[word] = 1
        else:
            word_counter[word] += 1

    row = []
    col = []

    data = []
    for word in word_counter:
        row.append(0)
        col.append(all_words_indexes[word])
        data.append(word_counter[word])
    return sparse.csr_matrix((data, (row, col)), shape=(1, M))


def return_k_nearest_articles(searched_phrase, matrix_of_words, k = 10):
    q = create_single_vector(searched_phrase)
    q_norm = sparse.linalg.norm(q, axis=1)
    ans = []
    min_val = 0
    for i in range(N):
        col = matrix_of_words.getcol(i)
        val = q * col
        if not val:
            continue
        val = (val.data / (q_norm * sparse.linalg.norm(col, axis=0)))[0]
        if len(ans) < k:
            ans.append((i, val))
        elif val > min_val:
            ans.append((i, val))
            ans.sort(key = lambda x: x[1], reverse=True)
            ans.pop(-1)
            min_val = ans[-1][1]
        # print(f"{i} / {N}")
    ans = [index[0] for index in ans]
    return ans

def return_k_nearest_articles_normalized(searched_phrase, k = 10):
    q = create_single_vector(searched_phrase)
    q_normalized = normalize(q, norm="l1", axis=1)
    ans_mat = q_normalized @ sparse_mat_normalized
    cx = ans_mat.tocoo()
    ans = []
    min_val = 0
    for i, val in zip(cx.col, cx.data):
        if len(ans) < k:
            ans.append((i, val))
        elif val > min_val:
            ans.append((i, val))
            ans.sort(key = lambda x: x[1], reverse=True)
            ans.pop(-1)
            min_val = ans[-1][1]
        # print(f"{i} / {N}")
    ans = [index[0] for index in ans]
    return ans

def return_k_nearest_articles_svd(searched_phrase, svd_size, k = 10):
    q = create_single_vector(searched_phrase)
    q_normalized = normalize(q, norm="l1", axis=1)
    svd_U, svd_D, svd_V = load_svd_from_file(svd_size)
    #              1 x M       M x SVD     SVD x SVD         SVD x N
    ans_mat = ((q_normalized @ svd_U) @ sparse.diags(svd_D)) @ svd_V
    ans_mat = sparse.csr_matrix(ans_mat)
    cx = ans_mat.tocoo()
    ans = []
    min_val = 0
    for i, val in zip(cx.col, cx.data):
        if len(ans) < k:
            ans.append((i, val))
        elif val > min_val:
            ans.append((i, val))
            ans.sort(key = lambda x: x[1], reverse=True)
            ans.pop(-1)
            min_val = ans[-1][1]
        # print(f"{i} / {N}")
    ans = [index[0] for index in ans]
    return ans

def load_svd_from_file(k):
    with open(f'dataset/sparse_svd_U_{k}.bin', 'rb') as file:
        mat_S = pickle.load(file)
    with open(f'dataset/sparse_svd_D_{k}.bin', 'rb') as file:
        mat_D = pickle.load(file)
    with open(f'dataset/sparse_svd_V_{k}.bin', 'rb') as file:
        mat_V = pickle.load(file)
    return mat_S, mat_D, mat_V

### Creating sparse matrix for every article

In [12]:
xd = False
if xd:  # anty missclick
    sparse_mat = create_sparse_matrix_for_all_articles(False)
    save_sparse_mat_file(sparse_mat, "sparse_mat_default.npz")

### Inverse document frequency matrix

In [13]:
xd = False
if xd:  # anty missclick
    sparse_mat_idf = create_sparse_matrix_for_all_articles(True)
    save_sparse_mat_file(sparse_mat_idf, "sparse_mat_idf.npz")

### Normalised idf matrix

In [277]:
xd = False
if xd:  # anty missclick
    sparse_mat_normalized = normalize(sparse_mat_idf, norm="l1", axis=0)
    save_sparse_mat_file(sparse_mat_normalized, "sparse_mat_normalized.npz")

### Normalised idf matrixes after svd (I tested different numbers of components)

In [171]:
xd = True
k = 100
if xd:  # anty missclick
    # maybe use sklearn.decomposition.TruncatedSVD
    mat_S, mat_D, mat_V = randomized_svd(sparse_mat_normalized, n_components=k, random_state=None)
    with open(f'dataset/sparse_svd_U_{k}.bin', 'wb') as file:
        pickle.dump(mat_S, file)
    with open(f'dataset/sparse_svd_D_{k}.bin', 'wb') as file:
        pickle.dump(mat_D, file)
    with open(f'dataset/sparse_svd_V_{k}.bin', 'wb') as file:
        pickle.dump(mat_V, file)

In [12]:
sparse_mat = sparse.load_npz("dataset/sparse_mat_idf.npz")
sparse_mat_idf = sparse.load_npz("dataset/sparse_mat_default.npz")
sparse_mat_normalized = sparse.load_npz("dataset/sparse_mat_normalized.npz")

# Testing search engine

## Test 1 -> "round dish with mozarella, tomato sauce and mushrooms"

In [267]:
k_nearest = return_k_nearest_articles("round dish with mozarella, tomato sauce and mushrooms", sparse_mat)
for i, index in enumerate(k_nearest):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of mushroom dishes | https://en.wikipedia.org/wiki/List%20of%20mushroom%20dishes
2: Tomato | https://en.wikipedia.org/wiki/Tomato
3: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
4: Sauce | https://en.wikipedia.org/wiki/Sauce
5: À la zingara | https://en.wikipedia.org/wiki/%C3%80%20la%20zingara
6: Stewed tomatoes | https://en.wikipedia.org/wiki/Stewed%20tomatoes
7: Edible mushroom | https://en.wikipedia.org/wiki/Edible%20mushroom
8: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
9: Worcestershire sauce | https://en.wikipedia.org/wiki/Worcestershire%20sauce
10: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments


In [111]:
k_nearest_idf = return_k_nearest_articles("round dish with mozarella, tomato sauce and mushrooms", sparse_mat_idf)
for i, index in enumerate(k_nearest_idf):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of mushroom dishes | https://en.wikipedia.org/wiki/List%20of%20mushroom%20dishes
2: À la zingara | https://en.wikipedia.org/wiki/%C3%80%20la%20zingara
3: Sauce | https://en.wikipedia.org/wiki/Sauce
4: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
5: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments
6: Tomato | https://en.wikipedia.org/wiki/Tomato
7: Worcestershire sauce | https://en.wikipedia.org/wiki/Worcestershire%20sauce
8: Stewed tomatoes | https://en.wikipedia.org/wiki/Stewed%20tomatoes
9: Picadillo | https://en.wikipedia.org/wiki/Picadillo
10: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce


In [46]:
k_nearest_normalized = return_k_nearest_articles_normalized("round dish with mozarella, tomato sauce and mushrooms")
for i, index in enumerate(k_nearest_normalized):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
2: List of mushroom dishes | https://en.wikipedia.org/wiki/List%20of%20mushroom%20dishes
3: Chargha | https://en.wikipedia.org/wiki/Chargha
4: Sauce | https://en.wikipedia.org/wiki/Sauce
5: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
6: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments
7: Wine sauce | https://en.wikipedia.org/wiki/Wine%20sauce
8: À la zingara | https://en.wikipedia.org/wiki/%C3%80%20la%20zingara
9: Lists of prepared foods | https://en.wikipedia.org/wiki/Lists%20of%20prepared%20foods
10: Kibbeh nayyeh | https://en.wikipedia.org/wiki/Kibbeh%20nayyeh


In [141]:
k_nearest_svd = return_k_nearest_articles_svd("round dish with mozarella, tomato sauce and mushrooms", 50)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Lists of prepared foods | https://en.wikipedia.org/wiki/Lists%20of%20prepared%20foods
2: Chargha | https://en.wikipedia.org/wiki/Chargha
3: Kibbeh nayyeh | https://en.wikipedia.org/wiki/Kibbeh%20nayyeh
4: List of seafood dishes | https://en.wikipedia.org/wiki/List%20of%20seafood%20dishes
5: Disanxian | https://en.wikipedia.org/wiki/Disanxian
6: Khatti Dal | https://en.wikipedia.org/wiki/Khatti%20Dal
7: Beyti kebab | https://en.wikipedia.org/wiki/Beyti%20kebab
8: Cholera (food) | https://en.wikipedia.org/wiki/Cholera%20%28food%29
9: Goat curry | https://en.wikipedia.org/wiki/Goat%20curry
10: Carpaccio | https://en.wikipedia.org/wiki/Carpaccio


In [173]:
k_nearest_svd = return_k_nearest_articles_svd("round dish with mozarella, tomato sauce and mushrooms", 100)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
2: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
3: Chargha | https://en.wikipedia.org/wiki/Chargha
4: Sauce | https://en.wikipedia.org/wiki/Sauce
5: Wine sauce | https://en.wikipedia.org/wiki/Wine%20sauce
6: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments
7: Lists of prepared foods | https://en.wikipedia.org/wiki/Lists%20of%20prepared%20foods
8: Kibbeh nayyeh | https://en.wikipedia.org/wiki/Kibbeh%20nayyeh
9: List of seafood dishes | https://en.wikipedia.org/wiki/List%20of%20seafood%20dishes
10: Disanxian | https://en.wikipedia.org/wiki/Disanxian


In [143]:
k_nearest_svd = return_k_nearest_articles_svd("round dish with mozarella, tomato sauce and mushrooms", 250)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
2: Sauce | https://en.wikipedia.org/wiki/Sauce
3: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
4: Chargha | https://en.wikipedia.org/wiki/Chargha
5: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments
6: Wine sauce | https://en.wikipedia.org/wiki/Wine%20sauce
7: Kibbeh nayyeh | https://en.wikipedia.org/wiki/Kibbeh%20nayyeh
8: List of seafood dishes | https://en.wikipedia.org/wiki/List%20of%20seafood%20dishes
9: Lists of prepared foods | https://en.wikipedia.org/wiki/Lists%20of%20prepared%20foods
10: Beyti kebab | https://en.wikipedia.org/wiki/Beyti%20kebab


In [144]:
k_nearest_svd = return_k_nearest_articles_svd("round dish with mozarella, tomato sauce and mushrooms", 500)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
2: Chargha | https://en.wikipedia.org/wiki/Chargha
3: Sauce | https://en.wikipedia.org/wiki/Sauce
4: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
5: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments
6: Wine sauce | https://en.wikipedia.org/wiki/Wine%20sauce
7: Kibbeh nayyeh | https://en.wikipedia.org/wiki/Kibbeh%20nayyeh
8: List of seafood dishes | https://en.wikipedia.org/wiki/List%20of%20seafood%20dishes
9: Lists of prepared foods | https://en.wikipedia.org/wiki/Lists%20of%20prepared%20foods
10: Beyti kebab | https://en.wikipedia.org/wiki/Beyti%20kebab


In [145]:
k_nearest_svd = return_k_nearest_articles_svd("round dish with mozarella, tomato sauce and mushrooms", 1000)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
2: Chargha | https://en.wikipedia.org/wiki/Chargha
3: Sauce | https://en.wikipedia.org/wiki/Sauce
4: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
5: List of mushroom dishes | https://en.wikipedia.org/wiki/List%20of%20mushroom%20dishes
6: List of condiments | https://en.wikipedia.org/wiki/List%20of%20condiments
7: Wine sauce | https://en.wikipedia.org/wiki/Wine%20sauce
8: Lists of prepared foods | https://en.wikipedia.org/wiki/Lists%20of%20prepared%20foods
9: À la zingara | https://en.wikipedia.org/wiki/%C3%80%20la%20zingara
10: Kibbeh nayyeh | https://en.wikipedia.org/wiki/Kibbeh%20nayyeh


In [270]:
print("Correct answer was pizza :c")

Correct answer was pizza :c


## Test 2 -> "Neapolitan pizza"

In [112]:
k_nearest = return_k_nearest_articles("Neapolitan pizza", sparse_mat)
for i, index in enumerate(k_nearest):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
2: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
3: Pizza | https://en.wikipedia.org/wiki/Pizza
4: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
5: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
6: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
7: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
8: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
9: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


In [115]:
k_nearest_idf = return_k_nearest_articles("Neapolitan pizza", sparse_mat_idf)
for i, index in enumerate(k_nearest_idf):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
2: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
3: Pizza | https://en.wikipedia.org/wiki/Pizza
4: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
5: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
6: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
7: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
8: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
9: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


In [119]:
k_nearest_normalized = return_k_nearest_articles_normalized("Neapolitan pizza")
for i, index in enumerate(k_nearest_normalized):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
2: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
3: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
4: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
5: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
6: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
7: Pizza | https://en.wikipedia.org/wiki/Pizza
8: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
9: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


In [132]:
k_nearest_svd = return_k_nearest_articles_svd("Neapolitan pizza", 50)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
2: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
3: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
4: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
5: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
6: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
7: Pizza | https://en.wikipedia.org/wiki/Pizza
8: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
9: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


In [175]:
k_nearest_svd = return_k_nearest_articles_svd("Neapolitan pizza", 100)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Geographically indicated foods of the United Kingdom | https://en.wikipedia.org/wiki/Geographically%20indicated%20foods%20of%20the%20United%20Kingdom
2: Cheese spread | https://en.wikipedia.org/wiki/Cheese%20spread
3: Marble cheese | https://en.wikipedia.org/wiki/Marble%20cheese
4: Shropshire Blue | https://en.wikipedia.org/wiki/Shropshire%20Blue
5: Processed cheese | https://en.wikipedia.org/wiki/Processed%20cheese
6: Cheese soup | https://en.wikipedia.org/wiki/Cheese%20soup
7: Cheese bun | https://en.wikipedia.org/wiki/Cheese%20bun
8: Fried cheese | https://en.wikipedia.org/wiki/Fried%20cheese
9: Sirene | https://en.wikipedia.org/wiki/Sirene
10: Kashkaval | https://en.wikipedia.org/wiki/Kashkaval


In [159]:
k_nearest_svd = return_k_nearest_articles_svd("Neapolitan pizza", 250)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
2: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
3: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
4: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
5: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
6: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
7: Pizza | https://en.wikipedia.org/wiki/Pizza
8: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
9: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


In [148]:
k_nearest_svd = return_k_nearest_articles_svd("Neapolitan pizza", 500)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
2: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
3: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
4: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
5: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
6: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
7: Pizza | https://en.wikipedia.org/wiki/Pizza
8: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
9: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


In [149]:
k_nearest_svd = return_k_nearest_articles_svd("Neapolitan pizza", 1000)
for i, index in enumerate(k_nearest_svd):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: National Pizza Month | https://en.wikipedia.org/wiki/National%20Pizza%20Month
2: Iranian pizza | https://en.wikipedia.org/wiki/Iranian%20pizza
3: List of pizza varieties by country | https://en.wikipedia.org/wiki/List%20of%20pizza%20varieties%20by%20country
4: Pizza al taglio | https://en.wikipedia.org/wiki/Pizza%20al%20taglio
5: Chicago-style pizza | https://en.wikipedia.org/wiki/Chicago-style%20pizza
6: History of pizza | https://en.wikipedia.org/wiki/History%20of%20pizza
7: Pizza | https://en.wikipedia.org/wiki/Pizza
8: Sicilian pizza | https://en.wikipedia.org/wiki/Sicilian%20pizza
9: Pizza in the United States | https://en.wikipedia.org/wiki/Pizza%20in%20the%20United%20States
10: Hawaiian pizza | https://en.wikipedia.org/wiki/Hawaiian%20pizza


## Test 3 -> "meat-based sauce in Italian cuisine, typical of the city of Bologna"

In [176]:
k_nearest = return_k_nearest_articles("meat-based sauce in Italian cuisine, typical of the city of Bologna", sparse_mat)
for i, index in enumerate(k_nearest):
    print(f"{i+1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Sauce | https://en.wikipedia.org/wiki/Sauce
2: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
3: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
4: List of Asian cuisines | https://en.wikipedia.org/wiki/List%20of%20Asian%20cuisines
5: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
6: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
7: List of cuisines of the Americas | https://en.wikipedia.org/wiki/List%20of%20cuisines%20of%20the%20Americas
8: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
9: List of African cuisines | https://en.wikipedia.org/wiki/List%20of%20African%20cuisines
10: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent


In [177]:
k_nearest_idf = return_k_nearest_articles("meat-based sauce in Italian cuisine, typical of the city of Bologna", sparse_mat_idf)
for i, index in enumerate(k_nearest_idf):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Sauce | https://en.wikipedia.org/wiki/Sauce
2: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
3: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
4: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
5: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
6: List of Asian cuisines | https://en.wikipedia.org/wiki/List%20of%20Asian%20cuisines
7: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
8: List of cuisines of the Americas | https://en.wikipedia.org/wiki/List%20of%20cuisines%20of%20the%20Americas
9: Haute cuisine | https://en.wikipedia.org/wiki/Haute%20cuisine
10: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent


In [178]:
k_nearest_normalized = return_k_nearest_articles_normalized("meat-based sauce in Italian cuisine, typical of the city of Bologna")
for i, index in enumerate(k_nearest_normalized):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
2: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
3: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
4: Sauce | https://en.wikipedia.org/wiki/Sauce
5: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
6: Dal bhat | https://en.wikipedia.org/wiki/Dal%20bhat
7: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
8: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent
9: Malian cuisine | https://en.wikipedia.org/wiki/Malian%20cuisine
10: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce


In [179]:
k_nearest_svd = return_k_nearest_articles_svd("meat-based sauce in Italian cuisine, typical of the city of Bologna", 50)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
2: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
3: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
4: Dal bhat | https://en.wikipedia.org/wiki/Dal%20bhat
5: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
6: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent
7: Caribbean cuisine | https://en.wikipedia.org/wiki/Caribbean%20cuisine
8: List of Asian cuisines | https://en.wikipedia.org/wiki/List%20of%20Asian%20cuisines
9: Kuurdak | https://en.wikipedia.org/wiki/Kuurdak
10: Khatti Dal | https://en.wikipedia.org/wiki/Khatti%20Dal


In [180]:
k_nearest_svd = return_k_nearest_articles_svd("meat-based sauce in Italian cuisine, typical of the city of Bologna", 100)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
2: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
3: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
4: Dal bhat | https://en.wikipedia.org/wiki/Dal%20bhat
5: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
6: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
7: Sauce | https://en.wikipedia.org/wiki/Sauce
8: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent
9: Caribbean cuisine | https://en.wikipedia.org/wiki/Caribbean%20cuisine
10: List of Asian cuisines | https://en.wikipedia.org/wiki/List%20of%20Asian%20cuisines


In [181]:
k_nearest_svd = return_k_nearest_articles_svd("meat-based sauce in Italian cuisine, typical of the city of Bologna", 250)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
2: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
3: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
4: Dal bhat | https://en.wikipedia.org/wiki/Dal%20bhat
5: Sauce | https://en.wikipedia.org/wiki/Sauce
6: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
7: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
8: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent
9: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce
10: List of Asian cuisines | https://en.wikipedia.org/wiki/List%20of%20Asian%20cuisines


In [182]:
k_nearest_svd = return_k_nearest_articles_svd("meat-based sauce in Italian cuisine, typical of the city of Bologna", 500)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
2: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
3: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
4: Sauce | https://en.wikipedia.org/wiki/Sauce
5: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
6: Dal bhat | https://en.wikipedia.org/wiki/Dal%20bhat
7: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
8: Majjige huli | https://en.wikipedia.org/wiki/Majjige%20huli
9: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent
10: Malian cuisine | https://en.wikipedia.org/wiki/Malian%20cuisine


In [183]:
k_nearest_svd = return_k_nearest_articles_svd("meat-based sauce in Italian cuisine, typical of the city of Bologna", 1000)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Oceanic cuisine | https://en.wikipedia.org/wiki/Oceanic%20cuisine
2: European cuisine | https://en.wikipedia.org/wiki/European%20cuisine
3: List of sauces | https://en.wikipedia.org/wiki/List%20of%20sauces
4: Sauce | https://en.wikipedia.org/wiki/Sauce
5: Balkan cuisine | https://en.wikipedia.org/wiki/Balkan%20cuisine
6: Dal bhat | https://en.wikipedia.org/wiki/Dal%20bhat
7: List of European cuisines | https://en.wikipedia.org/wiki/List%20of%20European%20cuisines
8: Cuisine of the Indian subcontinent | https://en.wikipedia.org/wiki/Cuisine%20of%20the%20Indian%20subcontinent
9: Malian cuisine | https://en.wikipedia.org/wiki/Malian%20cuisine
10: Rainbow sauce | https://en.wikipedia.org/wiki/Rainbow%20sauce


In [185]:
print("Correct answer was Bolognese sauce :c")

Correct answer was Bolognese sauce :c


# Test 4 ->
"Beef steaks are usually grilled, pan-fried, or broiled. The more tender cuts from the loin and rib are cooked quickly, using dry heat, and served whole. Less tender cuts from the chuck or round are cooked with moist heat or are mechanically tenderized (cf. cube steak)."

In [187]:
test_phrase = "Beef steaks are usually grilled, pan-fried, or broiled. The more tender cuts from the loin and rib are cooked quickly, using dry heat, and served whole. Less tender cuts from the chuck or round are cooked with moist heat or are mechanically tenderized (cf. cube steak)."

In [188]:
k_nearest = return_k_nearest_articles(test_phrase, sparse_mat)
for i, index in enumerate(k_nearest):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Beefsteak | https://en.wikipedia.org/wiki/Beefsteak
2: Steak | https://en.wikipedia.org/wiki/Steak
3: Meat tenderness | https://en.wikipedia.org/wiki/Meat%20tenderness
4: Chicken fried steak | https://en.wikipedia.org/wiki/Chicken%20fried%20steak
5: Beef | https://en.wikipedia.org/wiki/Beef
6: Pot roast | https://en.wikipedia.org/wiki/Pot%20roast
7: Omaha Steaks | https://en.wikipedia.org/wiki/Omaha%20Steaks
8: Grilling | https://en.wikipedia.org/wiki/Grilling
9: Pittsburgh rare | https://en.wikipedia.org/wiki/Pittsburgh%20rare
10: Salisbury steak | https://en.wikipedia.org/wiki/Salisbury%20steak


In [189]:
k_nearest_idf = return_k_nearest_articles(test_phrase, sparse_mat_idf)
for i, index in enumerate(k_nearest_idf):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Beefsteak | https://en.wikipedia.org/wiki/Beefsteak
2: Steak | https://en.wikipedia.org/wiki/Steak
3: Chicken fried steak | https://en.wikipedia.org/wiki/Chicken%20fried%20steak
4: Meat tenderness | https://en.wikipedia.org/wiki/Meat%20tenderness
5: Grilling | https://en.wikipedia.org/wiki/Grilling
6: Flattop grill | https://en.wikipedia.org/wiki/Flattop%20grill
7: Pot roast | https://en.wikipedia.org/wiki/Pot%20roast
8: Brisket | https://en.wikipedia.org/wiki/Brisket
9: Thermal cooking | https://en.wikipedia.org/wiki/Thermal%20cooking
10: Beef | https://en.wikipedia.org/wiki/Beef


In [190]:
k_nearest_normalized = return_k_nearest_articles_normalized(test_phrase)
for i, index in enumerate(k_nearest_normalized):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Beefsteak | https://en.wikipedia.org/wiki/Beefsteak
2: Broiler (disambiguation) | https://en.wikipedia.org/wiki/Broiler%20%28disambiguation%29
3: Meat tenderness | https://en.wikipedia.org/wiki/Meat%20tenderness
4: Chicken fried steak | https://en.wikipedia.org/wiki/Chicken%20fried%20steak
5: Steak | https://en.wikipedia.org/wiki/Steak
6: Carryover cooking | https://en.wikipedia.org/wiki/Carryover%20cooking
7: Chinese cooking techniques | https://en.wikipedia.org/wiki/Chinese%20cooking%20techniques
8: Ribs (food) | https://en.wikipedia.org/wiki/Ribs%20%28food%29
9: Pot roast | https://en.wikipedia.org/wiki/Pot%20roast
10: Pork ribs | https://en.wikipedia.org/wiki/Pork%20ribs


In [191]:
k_nearest_svd = return_k_nearest_articles_svd(test_phrase, 50)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Shocking (cooking) | https://en.wikipedia.org/wiki/Shocking%20%28cooking%29
2: Chinese cooking techniques | https://en.wikipedia.org/wiki/Chinese%20cooking%20techniques
3: Broiler (disambiguation) | https://en.wikipedia.org/wiki/Broiler%20%28disambiguation%29
4: Tava | https://en.wikipedia.org/wiki/Tava
5: Low-temperature cooking | https://en.wikipedia.org/wiki/Low-temperature%20cooking
6: Clay pot cooking | https://en.wikipedia.org/wiki/Clay%20pot%20cooking
7: Tess Mallos | https://en.wikipedia.org/wiki/Tess%20Mallos
8: Carryover cooking | https://en.wikipedia.org/wiki/Carryover%20cooking
9: Dum pukht | https://en.wikipedia.org/wiki/Dum%20pukht
10: Cuisine of pre-colonial Philippines | https://en.wikipedia.org/wiki/Cuisine%20of%20pre-colonial%20Philippines


In [192]:
k_nearest_svd = return_k_nearest_articles_svd(test_phrase, 100)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Chinese cooking techniques | https://en.wikipedia.org/wiki/Chinese%20cooking%20techniques
2: Shocking (cooking) | https://en.wikipedia.org/wiki/Shocking%20%28cooking%29
3: Broiler (disambiguation) | https://en.wikipedia.org/wiki/Broiler%20%28disambiguation%29
4: Low-temperature cooking | https://en.wikipedia.org/wiki/Low-temperature%20cooking
5: Clay pot cooking | https://en.wikipedia.org/wiki/Clay%20pot%20cooking
6: Tava | https://en.wikipedia.org/wiki/Tava
7: Geera pork | https://en.wikipedia.org/wiki/Geera%20pork
8: Dum pukht | https://en.wikipedia.org/wiki/Dum%20pukht
9: Carryover cooking | https://en.wikipedia.org/wiki/Carryover%20cooking
10: Thermal cooking | https://en.wikipedia.org/wiki/Thermal%20cooking


In [193]:
k_nearest_svd = return_k_nearest_articles_svd(test_phrase, 250)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Chinese cooking techniques | https://en.wikipedia.org/wiki/Chinese%20cooking%20techniques
2: Broiler (disambiguation) | https://en.wikipedia.org/wiki/Broiler%20%28disambiguation%29
3: Shocking (cooking) | https://en.wikipedia.org/wiki/Shocking%20%28cooking%29
4: Low-temperature cooking | https://en.wikipedia.org/wiki/Low-temperature%20cooking
5: Tava | https://en.wikipedia.org/wiki/Tava
6: Carryover cooking | https://en.wikipedia.org/wiki/Carryover%20cooking
7: Dum pukht | https://en.wikipedia.org/wiki/Dum%20pukht
8: Thermal cooking | https://en.wikipedia.org/wiki/Thermal%20cooking
9: Clay pot cooking | https://en.wikipedia.org/wiki/Clay%20pot%20cooking
10: Kuzu şiş | https://en.wikipedia.org/wiki/Kuzu%20%C5%9Fi%C5%9F


In [194]:
k_nearest_svd = return_k_nearest_articles_svd(test_phrase, 500)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Beefsteak | https://en.wikipedia.org/wiki/Beefsteak
2: Broiler (disambiguation) | https://en.wikipedia.org/wiki/Broiler%20%28disambiguation%29
3: Chinese cooking techniques | https://en.wikipedia.org/wiki/Chinese%20cooking%20techniques
4: Steak | https://en.wikipedia.org/wiki/Steak
5: Chicken fried steak | https://en.wikipedia.org/wiki/Chicken%20fried%20steak
6: Shocking (cooking) | https://en.wikipedia.org/wiki/Shocking%20%28cooking%29
7: Carryover cooking | https://en.wikipedia.org/wiki/Carryover%20cooking
8: Low-temperature cooking | https://en.wikipedia.org/wiki/Low-temperature%20cooking
9: Ribs (food) | https://en.wikipedia.org/wiki/Ribs%20%28food%29
10: Pork ribs | https://en.wikipedia.org/wiki/Pork%20ribs


In [195]:
k_nearest_svd = return_k_nearest_articles_svd(test_phrase, 1000)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Beefsteak | https://en.wikipedia.org/wiki/Beefsteak
2: Broiler (disambiguation) | https://en.wikipedia.org/wiki/Broiler%20%28disambiguation%29
3: Chinese cooking techniques | https://en.wikipedia.org/wiki/Chinese%20cooking%20techniques
4: Carryover cooking | https://en.wikipedia.org/wiki/Carryover%20cooking
5: Steak | https://en.wikipedia.org/wiki/Steak
6: Chicken fried steak | https://en.wikipedia.org/wiki/Chicken%20fried%20steak
7: Dicing | https://en.wikipedia.org/wiki/Dicing
8: Shocking (cooking) | https://en.wikipedia.org/wiki/Shocking%20%28cooking%29
9: Ribs (food) | https://en.wikipedia.org/wiki/Ribs%20%28food%29
10: Pork ribs | https://en.wikipedia.org/wiki/Pork%20ribs


In [197]:
print("It worked! (I copied part of wikipedia page but it worked!) Correct answer was Beefsteak")

Correct answer was Beefsteak
It worked! (I copied part of wikipedia page but it worked!) Correct answer was Beefsteak


# Test 5 ->

In [198]:
k_nearest = return_k_nearest_articles("Fruit that grows on the trees. This fruit is red and very sweet.", sparse_mat)
for i, index in enumerate(k_nearest):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Detarium senegalense | https://en.wikipedia.org/wiki/Detarium%20senegalense
2: Tamarillo | https://en.wikipedia.org/wiki/Tamarillo
3: Sclerocarya birrea | https://en.wikipedia.org/wiki/Sclerocarya%20birrea
4: Syzygium cumini | https://en.wikipedia.org/wiki/Syzygium%20cumini
5: Spondias purpurea | https://en.wikipedia.org/wiki/Spondias%20purpurea
6: Tree | https://en.wikipedia.org/wiki/Tree
7: Tree of life | https://en.wikipedia.org/wiki/Tree%20of%20life
8: Pyrus pashia | https://en.wikipedia.org/wiki/Pyrus%20pashia
9: Myrciaria dubia | https://en.wikipedia.org/wiki/Myrciaria%20dubia
10: Fruit | https://en.wikipedia.org/wiki/Fruit


In [199]:
k_nearest_idf = return_k_nearest_articles("Fruit that grows on the trees. This fruit is red and very sweet.", sparse_mat_idf)
for i, index in enumerate(k_nearest_idf):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Detarium senegalense | https://en.wikipedia.org/wiki/Detarium%20senegalense
2: Spondias purpurea | https://en.wikipedia.org/wiki/Spondias%20purpurea
3: Sclerocarya birrea | https://en.wikipedia.org/wiki/Sclerocarya%20birrea
4: Artocarpus integer | https://en.wikipedia.org/wiki/Artocarpus%20integer
5: Syzygium cumini | https://en.wikipedia.org/wiki/Syzygium%20cumini
6: Fruit | https://en.wikipedia.org/wiki/Fruit
7: Tamarillo | https://en.wikipedia.org/wiki/Tamarillo
8: Pitaya | https://en.wikipedia.org/wiki/Pitaya
9: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
10: Pyrus pashia | https://en.wikipedia.org/wiki/Pyrus%20pashia


In [200]:
k_nearest_normalized = return_k_nearest_articles_normalized("Fruit that grows on the trees. This fruit is red and very sweet.")
for i, index in enumerate(k_nearest_normalized):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Fruit dove | https://en.wikipedia.org/wiki/Fruit%20dove
2: Psychotria dallachiana | https://en.wikipedia.org/wiki/Psychotria%20dallachiana
3: Annona salzmannii | https://en.wikipedia.org/wiki/Annona%20salzmannii
4: Palm nut | https://en.wikipedia.org/wiki/Palm%20nut
5: Couepia bracteosa | https://en.wikipedia.org/wiki/Couepia%20bracteosa
6: Srikaya | https://en.wikipedia.org/wiki/Srikaya
7: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
8: Confiture | https://en.wikipedia.org/wiki/Confiture
9: Khanom sane chan | https://en.wikipedia.org/wiki/Khanom%20sane%20chan
10: Buddy Fruits | https://en.wikipedia.org/wiki/Buddy%20Fruits


In [201]:
k_nearest_svd = return_k_nearest_articles_svd("Fruit that grows on the trees. This fruit is red and very sweet.", 50)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Fruit dove | https://en.wikipedia.org/wiki/Fruit%20dove
2: Annona salzmannii | https://en.wikipedia.org/wiki/Annona%20salzmannii
3: Srikaya | https://en.wikipedia.org/wiki/Srikaya
4: Couepia bracteosa | https://en.wikipedia.org/wiki/Couepia%20bracteosa
5: Palm nut | https://en.wikipedia.org/wiki/Palm%20nut
6: Psychotria dallachiana | https://en.wikipedia.org/wiki/Psychotria%20dallachiana
7: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
8: Confiture | https://en.wikipedia.org/wiki/Confiture
9: Buddy Fruits | https://en.wikipedia.org/wiki/Buddy%20Fruits
10: Khanom sane chan | https://en.wikipedia.org/wiki/Khanom%20sane%20chan


In [202]:
k_nearest_svd = return_k_nearest_articles_svd("Fruit that grows on the trees. This fruit is red and very sweet.", 100)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Fruit dove | https://en.wikipedia.org/wiki/Fruit%20dove
2: Annona salzmannii | https://en.wikipedia.org/wiki/Annona%20salzmannii
3: Srikaya | https://en.wikipedia.org/wiki/Srikaya
4: Couepia bracteosa | https://en.wikipedia.org/wiki/Couepia%20bracteosa
5: Psychotria dallachiana | https://en.wikipedia.org/wiki/Psychotria%20dallachiana
6: Palm nut | https://en.wikipedia.org/wiki/Palm%20nut
7: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
8: Beautiful fruit dove | https://en.wikipedia.org/wiki/Beautiful%20fruit%20dove
9: Confiture | https://en.wikipedia.org/wiki/Confiture
10: Buddy Fruits | https://en.wikipedia.org/wiki/Buddy%20Fruits


In [203]:
k_nearest_svd = return_k_nearest_articles_svd("Fruit that grows on the trees. This fruit is red and very sweet.", 250)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Fruit dove | https://en.wikipedia.org/wiki/Fruit%20dove
2: Psychotria dallachiana | https://en.wikipedia.org/wiki/Psychotria%20dallachiana
3: Annona salzmannii | https://en.wikipedia.org/wiki/Annona%20salzmannii
4: Palm nut | https://en.wikipedia.org/wiki/Palm%20nut
5: Srikaya | https://en.wikipedia.org/wiki/Srikaya
6: Couepia bracteosa | https://en.wikipedia.org/wiki/Couepia%20bracteosa
7: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
8: Confiture | https://en.wikipedia.org/wiki/Confiture
9: Beautiful fruit dove | https://en.wikipedia.org/wiki/Beautiful%20fruit%20dove
10: Khanom sane chan | https://en.wikipedia.org/wiki/Khanom%20sane%20chan


In [204]:
k_nearest_svd = return_k_nearest_articles_svd("Fruit that grows on the trees. This fruit is red and very sweet.", 500)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Fruit dove | https://en.wikipedia.org/wiki/Fruit%20dove
2: Psychotria dallachiana | https://en.wikipedia.org/wiki/Psychotria%20dallachiana
3: Annona salzmannii | https://en.wikipedia.org/wiki/Annona%20salzmannii
4: Palm nut | https://en.wikipedia.org/wiki/Palm%20nut
5: Couepia bracteosa | https://en.wikipedia.org/wiki/Couepia%20bracteosa
6: Srikaya | https://en.wikipedia.org/wiki/Srikaya
7: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
8: Confiture | https://en.wikipedia.org/wiki/Confiture
9: Khanom sane chan | https://en.wikipedia.org/wiki/Khanom%20sane%20chan
10: Beautiful fruit dove | https://en.wikipedia.org/wiki/Beautiful%20fruit%20dove


In [205]:
k_nearest_svd = return_k_nearest_articles_svd("Fruit that grows on the trees. This fruit is red and very sweet.", 1000)
for i, index in enumerate(k_nearest_svd):
    print(f"{i + 1}: {dataset[index]['title']} | {dataset[index]['url']}")

1: Fruit dove | https://en.wikipedia.org/wiki/Fruit%20dove
2: Psychotria dallachiana | https://en.wikipedia.org/wiki/Psychotria%20dallachiana
3: Annona salzmannii | https://en.wikipedia.org/wiki/Annona%20salzmannii
4: Palm nut | https://en.wikipedia.org/wiki/Palm%20nut
5: Couepia bracteosa | https://en.wikipedia.org/wiki/Couepia%20bracteosa
6: Srikaya | https://en.wikipedia.org/wiki/Srikaya
7: Sonneratia caseolaris | https://en.wikipedia.org/wiki/Sonneratia%20caseolaris
8: Confiture | https://en.wikipedia.org/wiki/Confiture
9: Khanom sane chan | https://en.wikipedia.org/wiki/Khanom%20sane%20chan
10: Buddy Fruits | https://en.wikipedia.org/wiki/Buddy%20Fruits


# Conclusions

### In order to my search engine to work, searched phrase has to be very specific (and long). The resoults are often chaotic and are not the correct ones.
### I don't see any improvements in resoults after idf. Results are just more mixed up but I can't tell if they are better.
### I think that normalization helped a little. It made results more strict and specific (usuall in a good way)
### SVD was interesting. For k = 50 results were usually not bad. For k = 100 sometimes results were 100% bad (I don't know why).
### For k = 500 or k = 1000 results were the best, but there were very similar to those after normalization. I did not find SVD usefull
