Skip to content

Commit

Permalink
feat(NewTab): Boost syndicated article to position 1
Browse files Browse the repository at this point in the history
  • Loading branch information
mmiermans committed Jul 19, 2023
1 parent 1264a55 commit 46ab4a1
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 7 deletions.
4 changes: 2 additions & 2 deletions app/data_providers/corpus/corpus_api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ async def fetch(
query = """
query ScheduledSurface($scheduledSurfaceId: ID!, $date_today: Date!, $date_yesterday: Date!) {
scheduledSurface(id: $scheduledSurfaceId) {
items_today: items(date: $date_today) { corpusItem { id topic publisher } scheduledDate }
items_yesterday: items(date: $date_yesterday) { corpusItem { id topic publisher } scheduledDate }
items_today: items(date: $date_today) { corpusItem { id topic publisher url } scheduledDate }
items_yesterday: items(date: $date_yesterday) { corpusItem { id topic publisher url } scheduledDate }
}
}
"""
Expand Down
5 changes: 4 additions & 1 deletion app/data_providers/slate_providers/new_tab_slate_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from app.models.corpus_recommendation_model import CorpusRecommendationModel
from app.models.corpus_slate_lineup_model import RecommendationSurfaceId
from app.models.localemodel import LocaleModel
from app.rankers.algorithms import thompson_sampling, spread_publishers
from app.rankers.algorithms import thompson_sampling, spread_publishers, boost_syndicated

# Maximum tileId that Firefox can support. Firefox uses JavaScript to store this value. The max value of a JavaScript
# number can be found using `Number.MAX_SAFE_INTEGER`, which is 2^53 - 1 because it uses a 64-bit IEEE 754 float.
Expand Down Expand Up @@ -109,4 +109,7 @@ async def rank_corpus_items(self, items: List[CorpusItemModel], *args, **kwargs)
# for duplicate publishers.
items = spread_publishers(items, spread_distance=PUBLISHER_SPREAD_DISTANCE)

# Final step is to boost a syndicated article (if one exists).
items = boost_syndicated(items, metrics)

return items
10 changes: 10 additions & 0 deletions app/models/corpus_item_model.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from datetime import datetime
from typing import Optional

Expand All @@ -8,10 +9,19 @@ class CorpusItemModel(BaseModel):
id: str
topic: str = None
publisher: str = None
url: str = None

ranked_with_engagement_updated_at: Optional[datetime] = Field(
default=None,
description='If this recommendation was ranked based on engagement data, this timestamp (in UNIX time, seconds)'
' indicates the recency of the engagement data, enabling us to monitor the delay in our engagement'
' feedback loop. If engagement data was not utilized for ranking, this value will be null.'
)

@property
def is_syndicated(self) -> Optional[bool]:
    """
    Classify this item as syndicated or not, based solely on its `url`.

    A url counts as syndicated when it starts (optionally after `http://` or `https://` and `www.`) with
    `pocket.com/explore/item` or `getpocket.com/explore/item`.

    :return: True if item is syndicated, False if not syndicated, or None if unknown because url is not available.
    """
    if self.url is None:
        # Nothing to inspect, so the answer is "unknown" rather than False.
        return None

    syndicated_prefix = re.search(r'^(https?://)?(www\.)?(get)?pocket\.com/explore/item', self.url)
    return syndicated_prefix is not None
29 changes: 29 additions & 0 deletions app/rankers/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,32 @@ def unique_domains_first(recs: List) -> List:
else:
duplicates.append(r)
return uniques + duplicates


def boost_syndicated(
        recs: 'CorpusItemListType',
        metrics: Dict[str, 'CorpusItemEngagementModel'],
        impression_cap: int = 3000000,
        boostable_slot: int = 1,
) -> 'CorpusItemListType':
    """
    Boost a syndicated article with fewer than `impression_cap` impressions into `boostable_slot`.
    Requirements and experiment results: https://docs.google.com/document/d/1Vgq63DZQF-pz7R3kvcNXgkUd1I829FZqkIUlpIVY_g4
    :param recs: List of CorpusItem. `url` attribute is used to determine whether a CorpusItem is syndicated.
    :param metrics: Engagement keyed on CorpusItem.id. An item without an entry is treated as being under the cap.
    :param impression_cap: Syndicated articles need to have fewer than this many impressions to qualify. Defaults to 3M.
                           See above Google Doc for more details on this threshold.
    :param boostable_slot: 0-based slot to boost an item into. Defaults to slot 1, which is the second recommendation.
    :return: `recs` itself when no item qualifies. Otherwise a new list in which the first qualifying item found
             below `boostable_slot` has been moved into `boostable_slot`; the input list is left untouched.
    """
    def _is_boostable(rec) -> bool:
        # Only syndicated items qualify, and only while they are under the impression cap.
        if not rec.is_syndicated:
            return False
        engagement = metrics.get(rec.id)
        return engagement is None or engagement.trailing_1_day_impressions < impression_cap

    # Only look strictly below the target slot: items already at or above it are never moved ("boost upwards only").
    boost_index = next(
        (index for index, rec in enumerate(recs) if index > boostable_slot and _is_boostable(rec)),
        None,
    )
    if boost_index is None:
        return recs

    reordered = list(recs)  # Don't change the input
    # Move by position rather than with list.remove(), which would drop the first *equal* item and could therefore
    # take out an earlier duplicate instead of the item that was actually selected.
    reordered.insert(boostable_slot, reordered.pop(boost_index))

    return reordered
30 changes: 30 additions & 0 deletions tests/assets/engagement_metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import datetime
from typing import Dict, List

from app.models.corpus_item_model import CorpusItemModel
from app.models.metrics.corpus_item_engagement_model import CorpusItemEngagementModel
from app.models.metrics.firefox_new_tab_metrics_model import FirefoxNewTabMetricsModel
from app.models.metrics.metrics_model import MetricsModel

Expand Down Expand Up @@ -109,3 +112,30 @@ def generate_firefox_metrics(recommendation_ids: List[str]) -> Dict[str, 'Firefo
metrics.update(_get_firefox_new_tab_metrics_model_dict(**kwargs))

return metrics


def generate_corpus_engagement(recommendations: List[CorpusItemModel]) -> Dict[str, 'CorpusItemEngagementModel']:
    """
    Build one engagement fixture per recommendation.

    :param recommendations: Recommendations to generate engagement for; only their `id` is read.
    :return: Dictionary where keys are recommendation ids, and values are CorpusItemEngagementModel, with
        - trailing_1_day_opens equal to 33 * (i + 1), for the i'th recommendation
        - trailing_1_day_impressions being equal to 999
        - every 7/14/21/28 day counter set to 0
    """
    engagement_by_id = {}

    # Counting from 1 gives opens of 33, 66, 99, etc.
    for position, rec in enumerate(recommendations, start=1):
        engagement_by_id[rec.id] = CorpusItemEngagementModel(
            key=f'NEW_TAB_EN_US/edc5571f-7adb-537a-afd8-5612155d54da/{rec.id}',
            recommendation_surface_id='NEW_TAB_EN_US',
            corpus_slate_configuration_id='edc5571f-7adb-537a-afd8-5612155d54da',
            corpus_item_id=rec.id,
            trailing_1_day_opens=33 * position,
            trailing_1_day_impressions=999,
            trailing_7_day_opens=0,
            trailing_7_day_impressions=0,
            trailing_14_day_opens=0,
            trailing_14_day_impressions=0,
            trailing_21_day_opens=0,
            trailing_21_day_impressions=0,
            trailing_28_day_opens=0,
            trailing_28_day_impressions=0,
            updated_at=datetime.datetime.now(),
        )

    return engagement_by_id
18 changes: 16 additions & 2 deletions tests/unit/data_providers/test_new_tab_slate_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
from app.data_providers.slate_providers.new_tab_slate_provider import NewTabSlateProvider, PUBLISHER_SPREAD_DISTANCE
from app.models.corpus_item_model import CorpusItemModel
from app.models.corpus_slate_lineup_model import RecommendationSurfaceId
from app.models.localemodel import LocaleModel
from tests.mocks.corpus_clients import CORPUS_API_CLIENT_FIXTURE_ITEM_COUNT
from tests.assets.topics import all_topic_fixtures
from tests.assets.topics import all_topic_fixtures, business_topic


@pytest.fixture
Expand Down Expand Up @@ -139,6 +138,21 @@ async def test_rank_corpus_items_with_engagement_failure(
assert len(corpus_items_10) == len(ranked_items)
assert any(r.levelname == 'ERROR' for r in caplog.records)

async def test_boost_syndicated_article(self, new_tab_slate_provider, corpus_items_10, aiocache_functions_fixture):
    """
    A syndicated article appended to the candidates ends up in one of the first two slots after ranking.
    """
    # Append a syndicated article
    syndicated_article = CorpusItemModel(
        id='syndicated-rec',
        topic=business_topic.corpus_topic_id,
        publisher='The Original Publisher',
        url='https://getpocket.com/explore/item/this-is-a-syndicated-article',
    )
    corpus_items_10.append(syndicated_article)

    ranked_items = await new_tab_slate_provider.rank_corpus_items(items=corpus_items_10)

    assert len(corpus_items_10) == len(ranked_items)
    # boost_syndicated only moves an item upwards into slot 1; an article that the earlier ranking steps already
    # placed in slot 0 stays there. Accept both slots so the test does not depend on where the ranking put the
    # article before the boost. NOTE(review): assumes the earlier ranking steps can place it in slot 0 - confirm.
    assert syndicated_article in ranked_items[:2]

@pytest.mark.parametrize(
('recommendation_surface_id', 'expected_utm_source'),
[
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/models/test_corpus_item_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pytest

from app.models.corpus_item_model import CorpusItemModel
from tests.assets.topics import business_topic


@pytest.mark.parametrize(
    ('url', 'is_syndicated'),
    [
        # Syndicated articles: scheme and `www.` are optional, query strings are ignored.
        ('https://getpocket.com/explore/item/8-natural-ways-to-repel-insects-without-bug-spray', True),
        ('http://getpocket.com/explore/item/8-natural-ways-to-repel-insects-without-bug-spray', True),
        ('https://www.getpocket.com/explore/item/8-natural-ways-to-repel-insects-without-bug-spray', True),
        # pocket.com redirect
        ('https://pocket.com/explore/item/8-natural-ways-to-repel-insects-without-bug-spray', True),
        ('https://getpocket.com/explore/item/the-secrets-of-real-life-wedding-crashers?utm_source=pocket-newtab', True),
        # collection
        ('https://getpocket.com/collections/the-unexpected-flavor-combos-too-delicious-not-to-try', False),
        # topic page
        ('https://getpocket.com/explore/entertainment', False),
        ('https://www.harpersbazaar.com/beauty/hair/a44284121/hair-braiders-harlem-injuries-protections/', False),
        # A syndicated url that only appears inside a query parameter does not count.
        ('https://example.com/?utm_content=https://www.getpocket.com/explore/item/example', False),
        # Without a url the result is unknown.
        (None, None),
    ],
)
def test_corpus_item_model_is_syndicated(url, is_syndicated):
    """CorpusItemModel.is_syndicated matches the expected value for each kind of url."""
    model_fields = {
        'id': 'rec-123',
        'topic': business_topic.corpus_topic_id,
        'publisher': 'The Original Publisher',
        'url': url,
    }
    corpus_item = CorpusItemModel(**model_fields)

    actual = corpus_item.is_syndicated

    assert actual == is_syndicated
57 changes: 55 additions & 2 deletions tests/unit/rankers/test_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import pytest
from app.models.corpus_item_model import CorpusItemModel

from tests.assets.engagement_metrics import generate_metrics, generate_firefox_metrics, generate_metrics_model_dict
from tests.assets.engagement_metrics import generate_metrics, generate_firefox_metrics, generate_metrics_model_dict, \
generate_corpus_engagement
from tests.assets.topics import *
from tests.unit.utils import generate_recommendations, generate_curated_configs, generate_nontopic_configs, generate_lineup_configs
from app.config import ROOT_DIR
from app.rankers.algorithms import spread_publishers, top5, top15, top30, thompson_sampling, rank_topics, \
thompson_sampling_1day, thompson_sampling_7day, thompson_sampling_14day, blocklist, top1_topics, top3_topics, \
firefox_thompson_sampling_1day, rank_by_impression_caps, rank_by_preferred_topics
firefox_thompson_sampling_1day, rank_by_impression_caps, rank_by_preferred_topics, boost_syndicated
from app.models.personalized_topic_list import PersonalizedTopicList, PersonalizedTopicElement
from operator import itemgetter

Expand All @@ -35,6 +36,15 @@ def get_recs():

return recs

@staticmethod
def get_syndicated_rec():
    """Return a new CorpusItemModel whose url is a getpocket.com `explore/item` page."""
    syndicated_fields = {
        'id': 'syndicated-rec',
        'topic': business_topic.corpus_topic_id,
        'publisher': 'The Original Publisher',
        'url': 'https://getpocket.com/explore/item/this-is-a-syndicated-article',
    }
    return CorpusItemModel(**syndicated_fields)


@pytest.mark.parametrize("user_prefs", [
([t for i, t in enumerate(MockCorpusItems.get_topics()) if i % 2 == 0]),
Expand Down Expand Up @@ -376,3 +386,46 @@ def test_no_topic_slates(self):

for topic_ranker in [top1_topics, top3_topics, rank_topics]:
self.assertRaises(ValueError, topic_ranker, input_configs, full_topic_profile)


class TestAlgorithmsBoostSyndicated(unittest.TestCase):
    """Unit tests for `boost_syndicated`, using mock recs plus a single syndicated rec."""

    @staticmethod
    def _recs_and_metrics(syndicated_index=None):
        """
        Return mock recs with generated engagement for them.

        :param syndicated_index: Where to place the syndicated rec: -1 appends it, any other integer inserts it at
                                 that index, and None leaves it out.
        """
        candidates = MockCorpusItems.get_recs()
        if syndicated_index == -1:
            candidates.append(MockCorpusItems.get_syndicated_rec())
        elif syndicated_index is not None:
            candidates.insert(syndicated_index, MockCorpusItems.get_syndicated_rec())
        return candidates, generate_corpus_engagement(candidates)

    def test_boost_syndicated_article(self):
        recs, metrics = self._recs_and_metrics(syndicated_index=-1)

        reordered = boost_syndicated(recs, metrics)

        self.assertEqual(len(recs), len(reordered))
        # Last item in recs (-1) is moved to index 1 in reordered.
        self.assertEqual(recs[-1], reordered[1])
        # All other items keep their relative order.
        self.assertEqual(recs[:-1], [reordered[0]] + reordered[2:])

    def test_no_urls(self):
        # get_recs does not set url to syndicated article
        recs, metrics = self._recs_and_metrics()

        reordered = boost_syndicated(recs, metrics)

        self.assertEqual(recs, reordered)

    def test_disqualify_by_impression_cap(self):
        recs, metrics = self._recs_and_metrics(syndicated_index=-1)
        syndicated_rec = recs[-1]
        # Impression cap is 3 million
        metrics[syndicated_rec.id].trailing_1_day_impressions = 10_000_000

        reordered = boost_syndicated(recs, metrics)

        self.assertEqual(recs, reordered)

    def test_only_boost_upwards(self):
        # A syndicated rec that already sits in slot 0 must not be moved down.
        recs, metrics = self._recs_and_metrics(syndicated_index=0)

        reordered = boost_syndicated(recs, metrics)

        self.assertEqual(recs, reordered)

0 comments on commit 46ab4a1

Please sign in to comment.