In [100]:
import os
from datetime import datetime
from pathlib import Path
from time import strptime, mktime

import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree

from functions.feed_updaters import get_feed_tree_from_url

In [105]:
"""
We want to compare two feeds. We want to check:
- The number of items in each feed
- Which items are possibly cross posts
- Compare their titles and content
- Compare meta-data?
"""

# Template of the spoken introduction; the {item_*} placeholders are left unformatted here.
# get_feed_dataframe subtracts len(intro_str) from each description size, so that
# correction is approximate (it counts the placeholder text, not the substituted values).
# NOTE(review): presumably the feed generator prepends this text to every item description - confirm.
intro_str = """Welcome to The Nonlinear Library, where we use Text-to-Speech software to convert the best writing from the Rationalist and EA communities into audio.
This is: {item_title}, published by {item_author} on {item_date} on {item_web_long}."""
# HTML closing paragraph; its length is also subtracted from each description size.
outro_str = """<p>Thanks for listening. To help us out with The Nonlinear Library or to learn more, please visit nonlinear.org. </p>"""


def get_feed_tree_from_local_storage(filename):
    """Parse the XML feed stored at ``filename`` and return its root element."""
    return etree.parse(filename).getroot()


def get_feed_dataframe(feed_filename):
    """Summarise the items of a locally stored RSS feed as a DataFrame.

    Every ``channel/item`` element becomes one row. Fields are read per item, so
    an item that lacks a ``title``, ``description``, ``content`` or ``pubDate``
    child gets an empty string (``NaT`` for the date) in its own row instead of
    shifting the values of the following items.

    Args:
        feed_filename: Path of the feed XML file to parse.

    Returns:
        pandas.DataFrame with the columns ``title``, ``title_size``,
        ``description``, ``description_size``, ``n_description_paragraphs``,
        ``content``, ``content_size``, ``n_content_paragraphs`` and ``pub_date``.
        ``description_size`` has ``len(intro_str) + len(outro_str)`` subtracted.

    Raises:
        ValueError: If a ``pubDate`` text does not match
            ``'%a, %d %b %Y %H:%M:%S %Z'``.
    """
    pub_date_format = '%a, %d %b %Y %H:%M:%S %Z'

    def child_text(item, tag):
        """Return the stripped text of the first ``tag`` child, or '' if absent or empty."""
        element = item.find(tag)
        # Test against None explicitly: an element's truth value reflects whether it
        # has child elements, so a text-only element would be treated as missing.
        if element is None or element.text is None:
            return ''
        return element.text.strip()

    def count_paragraphs(html):
        """Count the <p> tags in an HTML fragment."""
        return len(BeautifulSoup(html, 'html.parser').find_all('p'))

    def date_str_to_datetime(date_str):
        # Parse straight to a naive datetime. Round-tripping through
        # time.mktime/fromtimestamp would make the result depend on the local
        # timezone and DST rules of the machine running the notebook.
        return datetime.strptime(date_str, pub_date_format)

    feed = get_feed_tree_from_local_storage(feed_filename)
    items = feed.findall('channel/item')

    titles = [child_text(item, 'title') for item in items]
    descriptions = [child_text(item, 'description') for item in items]
    contents = [child_text(item, 'content') for item in items]
    pub_date_texts = [child_text(item, 'pubDate') for item in items]
    pub_date = [date_str_to_datetime(text) if text else pd.NaT for text in pub_date_texts]

    data = {
        'title': titles,
        'title_size': list(map(len, titles)),
        'description': descriptions,
        'description_size': list(map(len, descriptions)),
        'n_description_paragraphs': list(map(count_paragraphs, descriptions)),
        'content': contents,
        'content_size': list(map(len, contents)),
        'n_content_paragraphs': list(map(count_paragraphs, contents)),
        'pub_date': pub_date
    }
    df = pd.DataFrame(data)
    # Remove the boilerplate intro/outro length. intro_str still holds its
    # unformatted placeholders, so this is an approximation of the real overhead.
    df['description_size'] -= len(intro_str) + len(outro_str)
    return df


def download_feed(destination_directory, feed_url, destination_filename) -> etree.Element:
    """Fetch the feed at ``feed_url`` and save it as XML in ``destination_directory``.

    The directory (including missing parents) is created when it does not exist.
    The file is written pretty-printed, with an XML declaration, encoded as UTF-8.

    Returns:
        The feed object returned by ``get_feed_tree_from_url``.
    """
    target_dir = Path(destination_directory)
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)

    feed_root = get_feed_tree_from_url(feed_url)

    output_file = os.path.join(target_dir, destination_filename)
    etree.ElementTree(feed_root).write(
        output_file,
        pretty_print=True,
        xml_declaration=True,
        encoding='utf-8',
    )

    return feed_root


In [98]:
# Download every feed from both the production and the dev bucket.

feed_file_names = [
    'nonlinear-library-AF.xml',
    'nonlinear-library-EA.xml',
    'nonlinear-library-LW.xml'
]

production_feed_url_base = 'https://storage.googleapis.com/rssfile/'
dev_feed_url_base = 'https://storage.googleapis.com/newcode/'

# Remote URLs and local target paths, one entry per feed file.
production_feed_urls = [production_feed_url_base + name for name in feed_file_names]
production_feed_destination_filenames = ['test_files/production/' + name for name in feed_file_names]

dev_feed_urls = [dev_feed_url_base + name for name in feed_file_names]
dev_feed_destination_filenames = ['test_files/dev/' + name for name in feed_file_names]

# Fetch and store all production feeds first, then all dev feeds.
production_feeds = [
    download_feed('test_files/production', url, name)
    for url, name in zip(production_feed_urls, feed_file_names)
]
dev_feeds = [
    download_feed('test_files/dev', url, name)
    for url, name in zip(dev_feed_urls, feed_file_names)
]

In [106]:
# Feed file to compare; the dev and production copies share this file name.
feed_to_compare = 'nonlinear-library-AF.xml'

# Load both locally stored copies into DataFrames with one row per feed item.
dev_feed = get_feed_dataframe('test_files/dev/' + feed_to_compare)
prod_feed = get_feed_dataframe('test_files/production/' + feed_to_compare)

In [108]:
# Row count, column dtypes and non-null counts of the dev feed.
dev_feed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   title                     7 non-null      object        
 1   title_size                7 non-null      int64         
 2   description               7 non-null      object        
 3   description_size          7 non-null      int64         
 4   n_description_paragraphs  7 non-null      int64         
 5   content                   7 non-null      object        
 6   content_size              7 non-null      int64         
 7   n_content_paragraphs      7 non-null      int64         
 8   pub_date                  7 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 632.0+ bytes


In [109]:
# Row count, column dtypes and non-null counts of the production feed.
prod_feed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   title                     6 non-null      object        
 1   title_size                6 non-null      int64         
 2   description               6 non-null      object        
 3   description_size          6 non-null      int64         
 4   n_description_paragraphs  6 non-null      int64         
 5   content                   6 non-null      object        
 6   content_size              6 non-null      int64         
 7   n_content_paragraphs      6 non-null      int64         
 8   pub_date                  6 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 560.0+ bytes


In [110]:
# Preview the first rows of the dev feed.
dev_feed.head()

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date
0,AF - Behavioural statistics for a maze-solving...,68,"Welcome to The Nonlinear Library, where we use...",58162,77,,0,0,2023-04-20 23:26:08
1,AF - The Learning-Theoretic Agenda: Status 202...,64,"Welcome to The Nonlinear Library, where we use...",437582,300,,0,0,2023-04-19 06:21:29
2,AF - Capabilities and alignment of LLM cogniti...,75,"Welcome to The Nonlinear Library, where we use...",54474,93,,0,0,2023-04-18 17:29:29
3,AF - Possibilizing vs. actualizing by Tsvi Ben...,56,"Welcome to The Nonlinear Library, where we use...",12354,14,,0,0,2023-04-16 16:55:40
4,AF - Concave Utility Question by Scott Garrabrant,49,"Welcome to The Nonlinear Library, where we use...",31322,40,,0,0,2023-04-15 01:15:01


In [111]:
# Summary statistics of the production feed's numeric and date columns.
prod_feed.describe()

Unnamed: 0,title_size,description_size,n_description_paragraphs,content_size,n_content_paragraphs,pub_date
count,6.0,6.0,6.0,6.0,6.0,6
mean,58.666667,95253.166667,90.5,0.0,0.0,2023-04-16 10:21:21.500000256
min,48.0,12354.0,14.0,0.0,0.0,2023-04-14 06:56:18
25%,50.75,16214.75,31.0,0.0,0.0,2023-04-14 16:11:24.249999872
50%,58.0,26286.5,54.0,0.0,0.0,2023-04-15 21:05:20.500000
75%,63.0,48686.0,86.75,0.0,0.0,2023-04-18 05:21:01.750000128
max,75.0,437582.0,300.0,0.0,0.0,2023-04-19 06:21:29
std,10.112698,168406.067668,106.500235,0.0,0.0,


In [113]:
# Summary statistics of the dev feed's numeric and date columns.
dev_feed.describe()

Unnamed: 0,title_size,description_size,n_description_paragraphs,content_size,n_content_paragraphs,pub_date
count,7.0,7.0,7.0,7.0,7.0,7
mean,60.0,89954.428571,88.571429,0.0,0.0,2023-04-17 01:56:19.571428864
min,48.0,12354.0,14.0,0.0,0.0,2023-04-14 06:56:18
25%,52.5,17893.5,34.0,0.0,0.0,2023-04-14 19:12:36.500000
50%,60.0,31322.0,68.0,0.0,0.0,2023-04-16 16:55:40
75%,66.0,56318.0,85.0,0.0,0.0,2023-04-18 23:55:29
max,75.0,437582.0,300.0,0.0,0.0,2023-04-20 23:26:08
std,9.882645,154370.893416,97.354776,0.0,0.0,


In [114]:
# Rows whose title appears in the dev feed but not in the production feed.
dev_feed.loc[~dev_feed['title'].isin(prod_feed['title'])]

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date
0,AF - Behavioural statistics for a maze-solving...,68,"Welcome to The Nonlinear Library, where we use...",58162,77,,0,0,2023-04-20 23:26:08


In [115]:
# Rows whose title appears in the production feed but not in the dev feed.
prod_feed.loc[~prod_feed['title'].isin(dev_feed['title'])]

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date
