In [2]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

base_url = 'https://www.ergo-log.com/'

# Create article links DataFrame

## Helper Functions

In [3]:
def limit_archive_links(archive_links: list) -> list:
    """Get a list of both parts of the year to limit duplications"""
    year_a = None
    year_b = None
    years = []
    
    for archive_link in archive_links:
        archive_link = archive_link.get('href')
        if "a.html" in archive_link and year_a == None:
            year_a = archive_link
            years.append(year_a)
        if "b.html" in archive_link and year_b == None:
            year_b = archive_link
            years.append(year_b)

        # Break loop if both are found
        if len(years) == 2:
            break
    
    return years

## Scrape links

In [4]:
all_article_links = []

archive_url_name = 'archives.html'
response = requests.get(base_url + archive_url_name)
soup = BeautifulSoup(response.text, 'html.parser')

archives = soup.find_all(id='bodytekstindex')

# Archive years
for archive in archives:
    # print(archive, '\n')
    archive_links = limit_archive_links(archive.find_all('a'))
    
    # Archive months
    for archive_link in archive_links:
        monthly_archive_link = base_url + archive_link

        response = requests.get(monthly_archive_link)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all(id='bodytekst')
        
        # Archive articles
        for article in articles:
            # print(article, '\n')
            article_links = article.find_all('a')            
        
            # Archive articles links
            for article_link in article_links:                
                article_date = article_link.next_sibling.strip()
                archive_article_link = base_url + article_link.get('href')

                link_data = {'article_date' : article_date,
                             'archive_link' : monthly_archive_link,
                             'article_link' : archive_article_link}
                
                all_article_links.append(link_data)

In [5]:
# Duplicated article links as they can be posted in year part a, b and in other years
df_ergo_logs_links = pd.DataFrame(all_article_links).drop_duplicates(subset=["article_link"])

# Scrape articles

## Helper Functions

In [6]:
def get_article_text(article_link:str) -> str:
    """"""
    response = requests.get(article_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    return soup.get_text()


def format_article_text(article_text:str) -> str:
    """"""
    try:
        # Remove everything below 'More:\n'
        article_text = article_text.split('More:\n')[0]   
    
        # Remove everything above the date
        pattern = r'\d{2}\.\d{2}\.\d{4}'
        article_text = re.split(pattern, article_text)[1]
        
        # Reduce the \n to single \n    
        article_text = re.sub('\n+', '\n', article_text)
    except: 
        pass

    return article_text

## Scrape, format and to database

In [8]:
df_ergo_logs = df_ergo_logs_links

In [9]:
import os
ergologs_data_folder = 'ergologs_data'

# Create data folder
try:
    os.mkdir(ergologs_data_folder)
except:
    print(f'{ergologs_data_folder} folder already exists')

# Get and store article text
df_ergo_logs['article_text'] = df_ergo_logs.article_link.apply(lambda x: format_article_text(get_article_text(x)))

# Store
# df_ergo_logs.to_csv(f"./{ergologs_data_folder}/ergologs.csv", index=False)

ergologs_data folder already exists


## Post formatting
- Removing when article not found: '\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\n'
- Specific issue: '  503 Backend fetch failed   Error 503 Backend fetch failed Backend fetch failed Guru Meditation: XID: 266283067  Varnish cache server '

In [10]:
# Removing non-articles
df_ergo_logs = df_ergo_logs[df_ergo_logs.article_text != '\n\n404 Not Found\n\nNot Found\nThe requested URL was not found on this server.\n\n']
df_ergo_logs = df_ergo_logs[df_ergo_logs.article_text != '  503 Backend fetch failed   Error 503 Backend fetch failed Backend fetch failed Guru Meditation: XID: 266283067  Varnish cache server ']

# Remove \n at the beginning
df_ergo_logs.article_text = df_ergo_logs.article_text.str[1:]

In [11]:
# Create article fields
df_ergo_logs['article_title'] = df_ergo_logs.article_text.apply(lambda x: x.split('\n')[1])
df_ergo_logs['article_date'] = df_ergo_logs['article_date'].fillna('')

In [25]:
import os

In [28]:
df_ergo_logs.to_parquet(f"ergologs.parquet", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'ergologs.parquet'

# To vector database: Faiss

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# Custom embeddings
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# OpenAI embeddings
embedding_function = OpenAIEmbeddings()

In [None]:
df_ergo_logs

In [None]:
loader = DataFrameLoader(df_ergo_logs, page_content_column="article_text")
docs = loader.load()

db = FAISS.from_documents(documents=docs, embedding=embedding_function)

In [None]:
# Save db
db.save_local("./ergologs_data/db_faiss_OAI")