In [63]:
import requests
import pandas as pd
import numpy as np

import scrape

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

In [64]:
# Load the corpus and keep at most its first 2000 rows.
docs_df = pd.read_parquet("related_large_set.pq")[:2000]
# Swap every ";" in each document's text for a single space.
texts = [text.replace(";", " ") for text in docs_df["text"].tolist()]

In [65]:
# Shared TF-IDF vectorizer: IDF weighting enabled, IDF smoothing disabled.
tfidf = TfidfVectorizer(
    smooth_idf=False,
    use_idf=True,
)

In [66]:
# Fit the vectorizer on the corpus and store the dense weights as a frame:
# one row per document (labelled by URL), one column per vocabulary term.
dfTFIDF = pd.DataFrame(
    data=tfidf.fit_transform(texts).toarray(),
    index=docs_df["url"],
    columns=tfidf.get_feature_names_out(),
)

In [67]:
def search_query(query: str, df: pd.DataFrame) -> str:
    """Return the index label of the document most similar to ``query``.

    The query is vectorized with the module-level ``tfidf`` vectorizer and
    compared with every row of ``df`` by cosine similarity.

    Args:
        query: Free-text query.
        df: Document-term matrix with one row per document. Its columns must
            line up with the feature order produced by ``tfidf``.

    Returns:
        The index label of the row with the highest cosine similarity. Rows
        whose similarity is undefined (zero-norm row or zero-norm query) rank
        last; if every row is undefined the first row's label is returned.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("df must contain at least one document")

    query_vector = np.asarray(tfidf.transform([query]).toarray()[0], dtype=float)
    doc_matrix = df.to_numpy(dtype=float)

    # A single matrix-vector product replaces one Python-level cosine() call
    # per document row.
    norms = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vector)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = (doc_matrix @ query_vector) / norms

    # 0/0 (an all-zero document or query) has no defined cosine: rank it last.
    similarity = np.where(np.isfinite(similarity), similarity, -np.inf)
    return df.index[int(np.argmax(similarity))]

def search_history(history: list[str], df: pd.DataFrame, top: int) -> str:
    """Recommend the document that best matches a whole reading history.

    Every text in ``history`` is vectorized with the module-level ``tfidf``
    vectorizer and scored against every row of ``df`` by cosine similarity.
    For each history text only its ``top`` best-scoring documents are kept;
    the kept scores are summed per document and the document with the largest
    total wins.

    Args:
        history: Texts the user has already read.
        df: Document-term matrix with one row per document. Its columns must
            line up with the feature order produced by ``tfidf``.
        top: How many best matches each history text contributes.

    Returns:
        The index label of the document with the highest summed similarity.

    Raises:
        ValueError: If ``history`` is empty.
    """
    history = list(history)
    if not history:
        raise ValueError("history must contain at least one text")

    doc_matrix = df.to_numpy(dtype=float)
    query_matrix = np.asarray(tfidf.transform(history).toarray(), dtype=float)

    # One matrix product scores every (document, history text) pair, instead
    # of a Python-level cosine() call per pair.
    doc_norms = np.linalg.norm(doc_matrix, axis=1)
    query_norms = np.linalg.norm(query_matrix, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = (doc_matrix @ query_matrix.T) / np.outer(doc_norms, query_norms)
    similarity = pd.DataFrame(similarity, index=df.index)

    # sort_values keeps NaN (undefined cosine for zero-norm vectors) last, so
    # such documents are only picked when fewer than `top` real scores exist.
    top_hits = [
        similarity[column].sort_values(ascending=False).iloc[:top]
        for column in similarity.columns
    ]
    # Documents outside a text's top list contribute 0 for that text.
    scores = pd.concat(top_hits, axis=1).fillna(0).sum(axis=1)
    return scores.sort_values(ascending=False).index[0]


In [68]:
def get_recomendation_from_texts(texts: list[str], df: pd.DataFrame, top: int = 10) -> str:
    """Recommend a document for a history given as raw texts.

    Thin wrapper around ``search_history``.

    Args:
        texts: Texts the user has already read.
        df: Document-term matrix passed through to ``search_history``.
        top: How many best matches each text contributes; defaults to 10,
            the value that used to be hard-coded here.

    Returns:
        Whatever ``search_history`` returns for these arguments.
    """
    return search_history(texts, df, top)

def get_recomendation_from_urls(urls: list[str], df: pd.DataFrame) -> str:
    """Recommend a document for a history given as page URLs.

    Each URL is downloaded, parsed with ``scrape.parse_content_from_bs`` and
    the extracted texts are handed to ``get_recomendation_from_texts``.

    Args:
        urls: Pages the user has already read.
        df: Document-term matrix passed through to
            ``get_recomendation_from_texts``.

    Returns:
        Whatever ``get_recomendation_from_texts`` returns for the page texts.

    Raises:
        ValueError: If ``urls`` is empty.
        requests.RequestException: If a page cannot be fetched, times out, or
            answers with an HTTP error status.
    """
    if not urls:
        raise ValueError("urls must contain at least one URL")

    parsed_texts = []
    for url in urls:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
        # Fail loudly rather than feeding an error page into the recommender.
        response.raise_for_status()
        # An explicit parser keeps results independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(response.text, "html.parser")
        # parse_content_from_bs yields a pair; only the second item (the
        # text) is used here.
        _, text = scrape.parse_content_from_bs(soup)
        parsed_texts.append(text)
    return get_recomendation_from_texts(parsed_texts, df)

In [69]:
# Sample reading history used to exercise the recommender.
urls = [
    "https://en.wikipedia.org/wiki/Primary_school",
    "https://en.wikipedia.org/wiki/Education",
    "https://en.wikipedia.org/wiki/University",
    "https://en.wikipedia.org/wiki/Ethiopia"
]

# A timeout keeps the cell from hanging forever on a stalled connection.
documents = [requests.get(url, timeout=30).text for url in urls]
# An explicit parser keeps the parse independent of which optional parsers
# are installed; parse_content_from_bs yields one pair per page, unzipped
# here into titles and texts.
parsed_titles, parsed_texts = zip(
    *[
        scrape.parse_content_from_bs(BeautifulSoup(document, "html.parser"))
        for document in documents
    ]
)

In [70]:
# Recommend from the sample history; each page contributes its 10 best matches.
search_history(history=parsed_texts, df=dfTFIDF, top=10)

'https://en.wikipedia.org/wiki/Primary_school'

In [71]:
# Peek at the URL stored under key 2 of the corpus.
docs_df["url"][2]

'https://en.wikipedia.org/wiki/German_occupation_of_Denmark'

In [72]:
# End-to-end check: fetch two pages and recommend from their content.
get_recomendation_from_urls(
    urls=[
        "https://en.wikipedia.org/wiki/Ronald_Reagan",
        "https://en.wikipedia.org/wiki/Economy_of_the_United_States",
    ],
    df=dfTFIDF,
)

'https://en.wikipedia.org/wiki/Centre-right'