In [1]:
import os
from datetime import datetime
from pathlib import Path
from time import strptime, mktime

import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree

from functions.feed_updaters import get_feed_tree_from_url

In [2]:
# Goal of this notebook: compare two versions of the same feed. We want to check:
# - the number of items in each feed, their titles and their contents
#   (description and content tags);
# - which items are possibly cross posts, by detecting the post with minimum content;
# - which posts are missing from one feed, in order to investigate why they're missing.

# Boilerplate template found at the start of item descriptions. The placeholders
# are left unformatted here; only the template's length is used (see
# get_feed_dataframe).
intro_str = (
    "Welcome to The Nonlinear Library, where we use Text-to-Speech software to convert "
    "the best writing from the Rationalist and EA communities into audio.\n"
    "This is: {item_title}, published by {item_author} on {item_date} on {item_web_long}."
)

# Closing boilerplate paragraph; its length is likewise subtracted from description sizes.
outro_str = (
    "<p>Thanks for listening. To help us out with The Nonlinear Library or to learn more, "
    "please visit nonlinear.org. </p>"
)


def get_feed_tree_from_local_storage(filename):
    """Parse the XML file at ``filename`` and return the root element of its tree."""
    return etree.parse(filename).getroot()


def get_feed_dataframe(feed_filename):
    """Load a locally stored feed and summarise its items in a DataFrame.

    Each ``channel/item`` element of the feed becomes one row. Fields are read
    per item, so an item that lacks a tag (or has an empty one) gets an empty
    value instead of shifting the values of the other items.

    Args:
        feed_filename: Path of the feed XML file to parse.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``title_size``,
        ``description``, ``description_size``, ``n_description_paragraphs``,
        ``content``, ``content_size``, ``n_content_paragraphs`` and
        ``pub_date``. The ``*_size`` columns are character counts,
        ``description_size`` is reduced by the combined length of
        ``intro_str`` and ``outro_str``, and the paragraph columns count
        ``<p>`` tags.
    """
    feed = get_feed_tree_from_local_storage(feed_filename)

    def child_text(item, tag):
        """Return the stripped text of the first ``tag`` child, or '' if absent/empty."""
        child = item.find(tag)
        # Compare against None explicitly: an lxml element is falsy whenever it
        # has no child elements, even when it carries text, so ``not child``
        # would wrongly treat a text-only tag as missing.
        if child is None or child.text is None:
            return ''
        return child.text.strip()

    def count_paragraphs(html):
        """Count the ``<p>`` tags found in an HTML fragment."""
        return len(BeautifulSoup(html, 'html.parser').find_all('p'))

    def date_str_to_datetime(date_str):
        # NOTE(review): mktime interprets the parsed struct_time as local time,
        # so the result depends on the machine's timezone — confirm this is
        # acceptable for the feed's pubDate values.
        return datetime.fromtimestamp(mktime(strptime(date_str, '%a, %d %b %Y %H:%M:%S %Z')))

    items = feed.findall('channel/item')

    titles = [child_text(item, 'title') for item in items]
    descriptions = [child_text(item, 'description') for item in items]
    contents = [child_text(item, 'content') for item in items]

    # Items without a pubDate get NaT so the column stays aligned with the rest.
    pub_date = []
    for item in items:
        date_text = child_text(item, 'pubDate')
        pub_date.append(date_str_to_datetime(date_text) if date_text else pd.NaT)

    data = {
        'title': titles,
        'title_size': list(map(len, titles)),
        'description': descriptions,
        'description_size': list(map(len, descriptions)),
        'n_description_paragraphs': [count_paragraphs(description) for description in descriptions],
        'content': contents,
        'content_size': list(map(len, contents)),
        'n_content_paragraphs': [count_paragraphs(content) for content in contents],
        'pub_date': pub_date
    }
    df = pd.DataFrame(data)
    # Remove the boilerplate from the size. The intro template's placeholders
    # are counted as literal text, so this is an approximation.
    df['description_size'] -= len(intro_str) + len(outro_str)
    return df


def download_feed(destination_directory, feed_url, destination_filename) -> etree.Element:
    """Fetch the feed at ``feed_url`` and save it as an XML file.

    Args:
        destination_directory: Directory to write into; created (with parents)
            when it does not exist yet.
        feed_url: URL handed to ``get_feed_tree_from_url``.
        destination_filename: Name of the file created inside
            ``destination_directory``.

    Returns:
        The object returned by ``get_feed_tree_from_url`` (presumably the
        feed's root element, since it is wrapped in an ``ElementTree`` — verify
        against that helper).
    """
    target_dir = Path(destination_directory)
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)

    feed = get_feed_tree_from_url(feed_url)

    # Serialise the downloaded feed as pretty-printed UTF-8 XML with a declaration.
    output_file = os.path.join(target_dir, destination_filename)
    etree.ElementTree(feed).write(
        output_file,
        pretty_print=True,
        xml_declaration=True,
        encoding='utf-8',
    )

    return feed


In [13]:
# Download every feed from both the production and the dev bucket into test_files/.

feed_file_names = [
    'nonlinear-library-AF.xml',
    'nonlinear-library-EA.xml',
    'nonlinear-library-LW.xml',
]

production_feed_url_base = 'https://storage.googleapis.com/rssfile/'
dev_feed_url_base = 'https://storage.googleapis.com/newcode/'

# Remote URL and local destination path of each feed, per environment.
production_feed_urls = [f'{production_feed_url_base}{name}' for name in feed_file_names]
production_feed_destination_filenames = ['test_files/production/' + name for name in feed_file_names]

dev_feed_urls = [f'{dev_feed_url_base}{name}' for name in feed_file_names]
dev_feed_destination_filenames = ['test_files/dev/' + name for name in feed_file_names]

# Production is fetched first, then dev; download_feed also writes each feed to disk.
production_feeds = [
    download_feed('test_files/production', url, name)
    for url, name in zip(production_feed_urls, feed_file_names)
]
dev_feeds = [
    download_feed('test_files/dev', url, name)
    for url, name in zip(dev_feed_urls, feed_file_names)
]

In [15]:
# Feed whose dev and production copies are compared in the cells below.
feed_to_compare = 'nonlinear-library-EA.xml'

dev_feed_path = 'test_files/dev/' + feed_to_compare
prod_feed_path = 'test_files/production/' + feed_to_compare

dev_feed = get_feed_dataframe(dev_feed_path)
prod_feed = get_feed_dataframe(prod_feed_path)

In [16]:
# Column dtypes and non-null counts of the dev feed.
dev_feed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   title                     8 non-null      object        
 1   title_size                8 non-null      int64         
 2   description               8 non-null      object        
 3   description_size          8 non-null      int64         
 4   n_description_paragraphs  8 non-null      int64         
 5   content                   8 non-null      object        
 6   content_size              8 non-null      int64         
 7   n_content_paragraphs      8 non-null      int64         
 8   pub_date                  8 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 704.0+ bytes


In [17]:
# Column dtypes and non-null counts of the production feed.
prod_feed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   title                     8 non-null      object        
 1   title_size                8 non-null      int64         
 2   description               8 non-null      object        
 3   description_size          8 non-null      int64         
 4   n_description_paragraphs  8 non-null      int64         
 5   content                   8 non-null      object        
 6   content_size              8 non-null      int64         
 7   n_content_paragraphs      8 non-null      int64         
 8   pub_date                  8 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 704.0+ bytes


In [18]:
# Summary statistics of the production feed's size, paragraph-count and date columns.
prod_feed.describe()

Unnamed: 0,title_size,description_size,n_description_paragraphs,content_size,n_content_paragraphs,pub_date
count,8.0,8.0,8.0,8.0,8.0,8
mean,94.75,26926.375,51.0,0.0,0.0,2023-04-21 19:21:07.500000
min,54.0,1594.0,4.0,0.0,0.0,2023-04-20 23:21:14
25%,78.0,1946.0,5.75,0.0,0.0,2023-04-21 10:08:51
50%,100.0,3712.0,11.5,0.0,0.0,2023-04-21 16:18:48.500000
75%,104.25,6213.25,18.0,0.0,0.0,2023-04-21 22:49:58.750000128
max,139.0,189526.0,332.0,0.0,0.0,2023-04-23 10:39:28
std,26.553181,65733.768334,113.696337,0.0,0.0,


In [19]:
# Summary statistics of the dev feed's size, paragraph-count and date columns.
dev_feed.describe()

Unnamed: 0,title_size,description_size,n_description_paragraphs,content_size,n_content_paragraphs,pub_date
count,8.0,8.0,8.0,8.0,8.0,8
mean,94.75,26926.375,51.0,0.0,0.0,2023-04-21 19:21:07.500000
min,54.0,1594.0,4.0,0.0,0.0,2023-04-20 23:21:14
25%,78.0,1946.0,5.75,0.0,0.0,2023-04-21 10:08:51
50%,100.0,3712.0,11.5,0.0,0.0,2023-04-21 16:18:48.500000
75%,104.25,6213.25,18.0,0.0,0.0,2023-04-21 22:49:58.750000128
max,139.0,189526.0,332.0,0.0,0.0,2023-04-23 10:39:28
std,26.553181,65733.768334,113.696337,0.0,0.0,


In [20]:
# Rows of the dev feed whose title does not appear in the production feed.
prod_titles = prod_feed['title'].tolist()
dev_feed[~dev_feed['title'].isin(prod_titles)]

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date


In [21]:
# Rows of the production feed whose title does not appear in the dev feed.
dev_titles = dev_feed['title'].tolist()
prod_feed[~prod_feed['title'].isin(dev_titles)]

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date


In [22]:
# Possible cross posts in dev: the item(s) with the fewest description paragraphs.
dev_min_paragraphs = dev_feed['n_description_paragraphs'].min()
dev_feed[dev_feed['n_description_paragraphs'] == dev_min_paragraphs]

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date
6,EA - High schoolers can apply to the Atlas Fel...,101,"Welcome to The Nonlinear Library, where we use...",1956,4,,0,0,2023-04-21 04:41:39


In [23]:
# Possible cross posts in production: the item(s) with the fewest description paragraphs.
prod_min_paragraphs = prod_feed['n_description_paragraphs'].min()
prod_feed[prod_feed['n_description_paragraphs'] == prod_min_paragraphs]

Unnamed: 0,title,title_size,description,description_size,n_description_paragraphs,content,content_size,n_content_paragraphs,pub_date
6,EA - High schoolers can apply to the Atlas Fel...,101,"Welcome to The Nonlinear Library, where we use...",1956,4,,0,0,2023-04-21 04:41:39
