In [378]:
## CONFIGURE THIS ##
# Secrets and the search endpoint are read from environment variables when
# set, so they never have to be typed into (and saved with) the notebook.
# When a variable is unset the value falls back to the empty string.
import os

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN", "")
# Sent as the `apollographql-client-name` header on graph requests.
app_name = 'Corpus Search Services'
# corpus search services consumer key
consumer_key = os.environ.get("POCKET_CONSUMER_KEY", "")
pocket_graph_endpoint = 'https://getpocket.com/graphql'
# URL handed to the Elasticsearch client.
corpus_search_endpoint = os.environ.get("CORPUS_SEARCH_ENDPOINT", "")

# Path to a JSON export of:
# select url, externalId, language, source from ApprovedItem where isSyndicated is true;
syndication_json_dump = ''  # filepath

In [254]:
import pandas as pd
import numpy as np
import json
from datetime import datetime as dt
import itertools
import requests
from tqdm import tqdm
import time
import random

In [302]:
# select url, externalId, language, source from ApprovedItem where isSyndicated is true;

# Load the syndicated-item export, index it by externalId, and replace
# non-breaking spaces (\xa0) with plain spaces in string values.
synd = (
    pd.read_json(syndication_json_dump)
    .set_index('externalId')
    .replace('\xa0', ' ', regex=True)
)

# All entries must be getpocket.com URLs. The match is literal
# (regex=False): as a regex the "." would match any character and let
# look-alike hosts through. Missing URLs count as failures (na=False).
_is_pocket_url = synd['url'].str.contains("getpocket.com", regex=False, na=False)
assert _is_pocket_url.all(), f"{(~_is_pocket_url).sum()} url(s) are not on getpocket.com"

In [303]:
# Sample syndicated URL: the first row of `synd`.
# NOTE(review): no later cell visible here reads this global, and the `url`
# parameter of get_original_id shadows it — presumably kept for ad-hoc
# testing of a single lookup; confirm before removing.
url = synd['url'].iloc[0]
def get_original_id(url, timeout=30):
    """Query the Pocket graph for the original item behind a syndicated URL.

    Sends the `SyndicatedCorpusItemId` GraphQL operation, which asks for the
    syndicated article's publisherUrl and the original item's givenUrl and
    corpusItem id.

    Args:
        url: Syndicated article URL passed as the `$url` query variable.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, which can stall a long
            crawl on a single dead connection.

    Returns:
        The raw `requests.Response`; the caller checks the status code and
        decodes the JSON body.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            when the timeout expires.
    """
    body =  {'query': """
          query SyndicatedCorpusItemId($url: String!)
            {
              itemByUrl(url: $url) {
                syndicatedArticle {
                  publisherUrl
                  originalItem {
                    givenUrl
                    corpusItem {
                      id
                    }
                  }
                }
              }
            }""",
        'operationName': 'SyndicatedCorpusItemId',
        'variables': { 'url': url }}
    return requests.post(
        pocket_graph_endpoint,
        # Let requests build (and URL-encode) the query string.
        params={'consumer_key': consumer_key},
        json=body,
        headers={'apollographql-client-name': app_name, 'Content-Type': 'application/json'},
        timeout=timeout,
    )


def get_info(urls, results, errors, request_delay=0.1):
    """Look up every URL via get_original_id, collecting results and failures.

    Args:
        urls: Iterable of syndicated URLs to resolve.
        results: List that receives one `{'url': ..., 'result': ...}` dict per
            successful lookup, where `result` is the decoded JSON response.
            Mutated in place.
        errors: List that receives each URL whose lookup failed (non-OK
            status, network error, or undecodable body). Mutated in place.
        request_delay: Base pause in seconds between requests; up to 0.1s of
            random jitter is added on top.

    Returns:
        The same `(results, errors)` lists that were passed in.
    """
    # Iterate the URLs directly so any iterable works, not only inputs that
    # support positional `urls[i]` indexing.
    for url in tqdm(urls):
        try:
            r = get_original_id(url)
            if r.status_code != requests.codes.ok:
                errors.append(url)
            else:
                results.append({'url': url, 'result': r.json()})
        except (requests.RequestException, ValueError):
            # A single network failure or non-JSON body must not abort the
            # whole crawl; record the URL so it can be retried later.
            errors.append(url)
        time.sleep(request_delay + random.random() / 10.)
    return (results, errors)


In [304]:
# Resolve every syndicated URL. get_info fills `results` and `errors` in
# place and hands the same two lists back as the tuple `res`.
urls = synd['url'].tolist()
results, errors = [], []
res = get_info(urls, results, errors)

100%|█████████████████████████████████████| 9424/9424 [1:57:47<00:00,  1.33it/s]


In [315]:
# Persist the successful lookups (first element of `res`) as JSON.
with open('./results-v4.json', 'w') as out_file:
    out_file.write(json.dumps(res[0]))

In [316]:
def _dig(node, *keys):
    """Follow `keys` through nested dicts; None as soon as a level is not a dict."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def transform_to_row(r):
    """Flatten one lookup result into a row for the results DataFrame.

    Args:
        r: Dict with `url` (the syndicated URL) and `result` (the decoded
            GraphQL response).

    Returns:
        Dict with `syndicatedUrl`, `publisherUrl` and `corpusId`. The last two
        are None when any level of the response is null or absent, including
        an error response whose `data` is null or missing.
    """
    article = _dig(r['result'], 'data', 'itemByUrl', 'syndicatedArticle')
    return {
        'syndicatedUrl': r['url'],
        'publisherUrl': _dig(article, 'publisherUrl'),
        'corpusId': _dig(article, 'originalItem', 'corpusItem', 'id'),
    }

In [317]:
# One row per successful lookup: syndicatedUrl, publisherUrl, corpusId.
df = pd.DataFrame([transform_to_row(lookup) for lookup in res[0]])

In [385]:
# Rows with no corpusId have no original corpus item; every other row is
# reported as a duplicate.
unresolved = int(df['corpusId'].isna().sum())
total = len(df)
resolved = total - unresolved

print(f'# duplicated: {resolved} out of {total} ({resolved / total * 100}%)')

# duplicated: 1849 out of 9446 (19.5744230362058%)


In [329]:
# Align the column name with `df` so the two frames can be joined on it.
# After this cell `synd` no longer has a 'url' column, so earlier cells that
# read synd['url'] need `synd` reloaded before they can be re-run.
synd = synd.rename(columns={'url': 'syndicatedUrl'})

In [346]:
# Keep only lookups that resolved to an original corpus item, then attach
# the syndicated item's id (the index of `synd`), language and source by
# joining on the shared syndicatedUrl.
dedupes = (
    df[df['corpusId'].notna()]
    .rename(columns={'corpusId': 'originalCorpusId'})
    .merge(
        synd.rename_axis('syndicatedCorpusId').reset_index(),
        on='syndicatedUrl',
        how='inner',
    )
)

In [347]:
# Show which languages occur among the duplicates, then preview the frame.
print(pd.unique(dedupes['language']))
dedupes.head(5)

['EN']


Unnamed: 0,syndicatedUrl,publisherUrl,originalCorpusId,syndicatedCorpusId,language,source
0,https://getpocket.com/explore/item/the-science...,https://www.brainpickings.org/2015/07/20/esthe...,28c90471-bda3-45fc-a310-297d32346c56,8118dc5e-ac0b-4eb8-911a-9661384d8c84,EN,BACKFILL
1,https://getpocket.com/explore/item/3-ways-to-b...,https://hbr.org/2016/11/3-ways-to-better-under...,ebc46868-c1e5-4e05-8e7d-e95ac4f50ef0,8b99dcc6-0a65-4303-8dac-2348da34f068,EN,BACKFILL
2,https://getpocket.com/explore/item/what-great-...,https://hbr.org/2016/07/what-great-listeners-a...,daf109f3-909c-4200-87d5-ba859a1de46a,0cc330bd-ef67-4fcf-8120-2e87283439db,EN,BACKFILL
3,https://getpocket.com/explore/item/how-to-beat...,https://hbr.org/2016/07/how-to-beat-procrastin...,e7b4f3e6-fbba-4786-8357-1f26058848c2,410cb299-2861-4b3b-8bb7-5ee985daac07,EN,BACKFILL
4,https://getpocket.com/explore/item/how-i-rewir...,http://nautil.us/issue/40/learning/how-i-rewir...,d71ff0bc-cc87-4d58-ad64-75848f6eb8db,b0595294-305f-4460-91e8-3d3058e4c195,EN,BACKFILL


In [368]:
# just en so do corpus_en_luc
commands = dedupes['originalCorpusId'].apply(lambda x: {'_index': 'corpus_en_luc', '_op_type': 'delete', '_id': x})#{'delete': {'_index': 'corpus_en_luc', '_id': x}})
commands.head()

0    {'_index': 'corpus_en_luc', '_op_type': 'delet...
1    {'_index': 'corpus_en_luc', '_op_type': 'delet...
2    {'_index': 'corpus_en_luc', '_op_type': 'delet...
3    {'_index': 'corpus_en_luc', '_op_type': 'delet...
4    {'_index': 'corpus_en_luc', '_op_type': 'delet...
Name: originalCorpusId, dtype: object

In [369]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
import tqdm

In [380]:
# Elasticsearch client for the corpus search endpoint: 30 second timeout,
# retrying timed-out requests up to 10 times.
_es_options = dict(timeout=30, max_retries=10, retry_on_timeout=True)
client = Elasticsearch(corpus_search_endpoint, **_es_options)

In [381]:
# Stream the delete actions to Elasticsearch, counting successes and keeping
# the response of every action that failed.
# NOTE: `tqdm` is the module here (rebound by the `import tqdm` cell above),
# hence `tqdm.tqdm`. `errors` is also rebound: it no longer holds the URLs
# that failed during the crawl.
print("deleting documents...")
successes = 0
errors = []
# The context manager closes the bar so no stale progress line is left behind.
with tqdm.tqdm(unit="docs", total=len(commands)) as progress:
    for ok, action in streaming_bulk(
        client=client, actions=list(commands),
        raise_on_error=False
    ):
        progress.update(1)
        successes += ok
        if not ok:
            errors.append(json.dumps(action))

print(f"deleted {successes} documents")
# Surface failures instead of silently dropping them; details stay in `errors`.
if errors:
    print(f"failed to delete {len(errors)} documents; first failure: {errors[0]}")


deleting documents...


100%|█████████████████████████████████████| 1849/1849 [04:56<00:00,  6.24docs/s]
 81%|█████████████████████████████▏      | 1501/1849 [00:04<00:00, 621.90docs/s]

deleted 1843 documents
