In [53]:
import random
import requests
import pandas as pd
import numpy as np

import scrape

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

In [60]:
# Work on the first 2000 rows of the stored corpus only.
docs_df = pd.read_parquet("large_set.pq").iloc[:2000]

# Swap every ";" in a document's text for a single space so the vectorizer
# sees whitespace-delimited input (presumably ";" separates tokens -- confirm
# against whatever produced large_set.pq).
texts = [raw_text.replace(";", " ") for raw_text in docs_df["text"].tolist()]

In [61]:
# Shared vectorizer: fitted once on the corpus and reused to embed queries.
# IDF weighting is on, and document-frequency smoothing is explicitly off.
tfidf = TfidfVectorizer(
    use_idf=True,
    smooth_idf=False,
)

In [62]:
# Dense document-term matrix: one row per document (labelled by its URL) and
# one column per vocabulary term. Keyword order matters here: `data` is
# evaluated first, so the vectorizer is fitted before its feature names are read.
dfTFIDF = pd.DataFrame(
    data=tfidf.fit_transform(texts).toarray(),
    index=docs_df["url"],
    columns=tfidf.get_feature_names_out(),
)

In [63]:
def search_query(query: str, df: pd.DataFrame, vectorizer=None) -> str:
    """Return the index label of the document most similar to ``query``.

    The query is embedded with ``vectorizer`` and compared against every row
    of ``df`` by cosine similarity, computed as a single matrix-vector product
    instead of one Python-level distance call per row.

    Parameters
    ----------
    query:
        Free-text query.
    df:
        Document-term matrix, one row per document. Its columns must line up
        with the features produced by ``vectorizer``.
    vectorizer:
        Object whose ``transform([text])`` result has a ``toarray()`` method.
        Defaults to the notebook-level ``tfidf`` vectorizer.

    Returns
    -------
    The index label (a URL when ``df`` is indexed by URL) of the best match.
    Rows whose similarity is undefined (an all-zero row, or a query that maps
    to the zero vector) rank below every defined similarity; if nothing is
    defined, the first row's label is returned.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if vectorizer is None:
        # Fall back to the notebook-level vectorizer used to build the matrix.
        vectorizer = tfidf
    if len(df) == 0:
        raise ValueError("df contains no documents to search")

    query_vec = np.asarray(vectorizer.transform([query]).toarray()[0], dtype=float)
    doc_matrix = df.to_numpy(dtype=float)

    dots = doc_matrix @ query_vec
    # einsum yields the row-wise squared norms without allocating a second
    # array the size of the whole matrix.
    doc_norms = np.sqrt(np.einsum("ij,ij->i", doc_matrix, doc_matrix))
    denom = doc_norms * np.sqrt(query_vec @ query_vec)

    # Undefined similarities (zero denominator) stay at -inf so they can never
    # outrank a real score, and no division warning is emitted.
    similarity = np.full(dots.shape, -np.inf)
    np.divide(dots, denom, out=similarity, where=denom > 0)

    return df.index[int(np.argmax(similarity))]

def search_history(history: list[str], df: pd.DataFrame, top: int, vectorizer=None) -> str:
    """Return the document that best matches a whole list of queries.

    For every query in ``history`` the ``top`` most cosine-similar rows of
    ``df`` are kept; the kept similarities are summed per document (a document
    outside a query's top list contributes 0 for that query) and the label of
    the document with the largest total is returned.

    Parameters
    ----------
    history:
        Sequence of query texts (any iterable of strings is accepted).
    df:
        Document-term matrix, one row per document. Its columns must line up
        with the features produced by ``vectorizer``.
    top:
        Number of best-matching documents retained per query; must be >= 1.
    vectorizer:
        Object whose ``transform(texts)`` result has a ``toarray()`` method.
        Defaults to the notebook-level ``tfidf`` vectorizer.

    Returns
    -------
    The index label of the highest-scoring document. If no similarity is
    defined at all (e.g. every query maps to the zero vector), the first
    row's label is returned.

    Raises
    ------
    ValueError
        If ``history`` is empty, ``top`` is smaller than 1, or ``df`` has no rows.
    """
    if vectorizer is None:
        # Fall back to the notebook-level vectorizer used to build the matrix.
        vectorizer = tfidf
    queries = list(history)
    if not queries:
        raise ValueError("history must contain at least one query")
    if top < 1:
        raise ValueError("top must be a positive integer")
    if len(df) == 0:
        raise ValueError("df contains no documents to search")

    # Embed all queries in one call rather than one transform per query.
    query_matrix = np.asarray(vectorizer.transform(queries).toarray(), dtype=float)
    doc_matrix = df.to_numpy(dtype=float)

    # similarity[i, j] = cosine similarity between document i and query j.
    dots = doc_matrix @ query_matrix.T
    doc_norms = np.sqrt(np.einsum("ij,ij->i", doc_matrix, doc_matrix))
    query_norms = np.sqrt(np.einsum("ij,ij->i", query_matrix, query_matrix))
    denom = np.outer(doc_norms, query_norms)
    # Undefined similarities (zero denominator) stay at -inf so they rank last.
    similarity = np.full(dots.shape, -np.inf)
    np.divide(dots, denom, out=similarity, where=denom > 0)

    # Aggregate by row position, so duplicate index labels cannot break the
    # alignment the way a label-based concat would.
    totals = np.zeros(len(df))
    is_candidate = np.zeros(len(df), dtype=bool)
    for per_query in similarity.T:
        best = np.argsort(-per_query, kind="stable")[:top]
        best = best[np.isfinite(per_query[best])]
        totals[best] += per_query[best]
        is_candidate[best] = True

    # Only documents that made at least one top list may win.
    totals[~is_candidate] = -np.inf
    return df.index[int(np.argmax(totals))]


In [64]:
# Simulate a browsing history by drawing 10 rows from the corpus.
# A dedicated, seeded generator makes the draw identical on every
# Restart & Run All without touching the global `random` state.
HISTORY_SEED = 42
_history_rng = random.Random(HISTORY_SEED)

# NOTE: choices() samples WITH replacement, so the same row can appear twice.
selected = _history_rng.choices(docs_df.values.tolist(), k=10)

# Field 2 of each row is cleaned the same way as the corpus text
# (";" -> " "); presumably it is the `text` column -- confirm the column order.
vectorized = [entry[2].replace(";", " ") for entry in selected]

In [65]:
# Best overall match for the sampled history, keeping 10 candidates per query.
search_history(
    history=vectorized,
    df=dfTFIDF,
    top=10,
)

'https://en.wikipedia.org/wiki/Federa%C3%A7%C3%A3o_Amapaense_de_Futebol'

In [66]:
# List field 1 of every sampled row (the page URL, judging by the printed
# output) so the search result above can be compared with the history.
for row in selected:
    url = row[1]
    print(url)

https://en.wikipedia.org/wiki/%C4%90%E1%BA%A1i_C%C3%A1t_T%C6%B0%E1%BB%9Dng
https://en.wikipedia.org/wiki/Agim_Zeka
https://en.wikipedia.org/wiki/An_Gearanach
https://en.wikipedia.org/wiki/Hyde-St._John_House
https://en.wikipedia.org/wiki/Bento_Gon%C3%A7alves_da_Silva
https://en.wikipedia.org/wiki/St_George's_Quarter
https://en.wikipedia.org/wiki/Jon_Becker
https://en.wikipedia.org/wiki/Elsie_Roy_Elementary_School
https://en.wikipedia.org/wiki/Masthorn
https://en.wikipedia.org/wiki/Federa%C3%A7%C3%A3o_Amapaense_de_Futebol


In [85]:
urls = [
    "https://en.wikipedia.org/wiki/Primary_school",
    "https://en.wikipedia.org/wiki/Education",
    "https://en.wikipedia.org/wiki/University",
    "https://en.wikipedia.org/wiki/Ethiopia"
]

# Seconds to wait for each response; without a timeout a stalled connection
# would block the cell indefinitely.
REQUEST_TIMEOUT_S = 10

documents = []
# One session reuses the underlying connection for all requests to the same host.
with requests.Session() as session:
    for url in urls:
        response = session.get(url, timeout=REQUEST_TIMEOUT_S)
        # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
        response.raise_for_status()
        documents.append(response.text)

# Each parse result is unpacked as a (title, text) pair; zip(*) regroups them
# into one tuple of titles and one tuple of texts.
parsed_titles, parsed_texts = zip(
    *(scrape.parse_wiki_article_from_document(document) for document in documents)
)

In [89]:
# Best overall match for the first two scraped articles used as the history.
search_history(
    history=parsed_texts[:2],
    df=dfTFIDF,
    top=10,
)

'https://en.wikipedia.org/wiki/Adult_education_in_the_United_Kingdom'

In [84]:
# Sanity check on the matrix size: (number of documents, number of vocabulary terms).
dfTFIDF.shape

(2000, 136832)