In [None]:
import csv
import datetime
import dateutil.relativedelta
import json
import urllib.parse

import boto3
import requests

# Structure

In [None]:
class TranslatedText:
    """A text body paired with its translation and the language of each."""

    def __init__(self, original, original_lang, translated, translated_lang):
        """Record the untranslated text, the translated text and both languages."""
        self._original, self._original_lang = original, original_lang
        self._translated, self._translated_lang = translated, translated_lang

    def get_original(self):
        """Return the text as it was before translation."""
        return self._original

    def get_original_lang(self):
        """Return the language given for the untranslated text."""
        return self._original_lang

    def get_translated(self):
        """Return the text after translation."""
        return self._translated

    def get_translated_lang(self):
        """Return the language given for the translated text."""
        return self._translated_lang

    def to_dict(self):
        """Return a nested dict with 'original' and 'translated' entries.

        Each entry holds a 'language' and a 'body' key, in that order.
        """
        original_part = {
            'language': self.get_original_lang(),
            'body': self.get_original(),
        }
        translated_part = {
            'language': self.get_translated_lang(),
            'body': self.get_translated(),
        }
        return {'original': original_part, 'translated': translated_part}

In [None]:
class Locale:
    """A country / language pair."""

    def __init__(self, country, language):
        """Store the country and language exactly as given."""
        self._country, self._language = country, language

    def get_country(self):
        """Return the country this locale was built with."""
        return self._country

    def get_language(self):
        """Return the language this locale was built with."""
        return self._language

    def to_dict(self):
        """Return a dict with 'country' and 'language' keys."""
        serialized = {}
        serialized['country'] = self.get_country()
        serialized['language'] = self.get_language()
        return serialized

In [None]:
class Source:
    """Description of a news source: name, URL, categories, language, country."""

    def __init__(self, name, url, categories, language, country):
        """Store the five descriptive fields exactly as given."""
        self._name, self._url = name, url
        self._categories = categories
        self._language, self._country = language, country

    def get_name(self):
        """Return the source's name."""
        return self._name

    def get_url(self):
        """Return the source's URL value."""
        return self._url

    def get_categories(self):
        """Return the categories recorded for the source."""
        return self._categories

    def get_language(self):
        """Return the language recorded for the source."""
        return self._language

    def get_country(self):
        """Return the country recorded for the source."""
        return self._country

    def to_dict(self):
        """Return the source as a flat dict, one key per field."""
        fields = (
            ('name', self.get_name),
            ('url', self.get_url),
            ('categories', self.get_categories),
            ('language', self.get_language),
            ('country', self.get_country),
        )
        return {key: getter() for key, getter in fields}

In [None]:
class Article:
    """A single news article plus the metadata attached to it.

    `title`, `content` and `locale` must expose a `to_dict()` method, since
    `Article.to_dict` calls it on each of them.
    """

    def __init__(
        self,
        url,
        title,
        keywords,
        creator,
        content,
        publish_datetime,
        category,
        locale,
    ):
        """Store every field exactly as given."""
        self._url, self._title = url, title
        self._keywords, self._creator = keywords, creator
        self._content = content
        self._publish_datetime = publish_datetime
        self._category, self._locale = category, locale

    def get_url(self):
        """Return the article's URL."""
        return self._url

    def get_title(self):
        """Return the title object."""
        return self._title

    def get_keywords(self):
        """Return the keywords recorded for the article."""
        return self._keywords

    def get_creator(self):
        """Return the creator value recorded for the article."""
        return self._creator

    def get_content(self):
        """Return the content object."""
        return self._content

    def get_publish_datetime(self):
        """Return the publication date/time value as it was given."""
        return self._publish_datetime

    def get_category(self):
        """Return the category value recorded for the article."""
        return self._category

    def get_locale(self):
        """Return the locale object."""
        return self._locale

    def to_dict(self):
        """Return the article as a dict, expanding title, content and locale."""
        serialized = {'url': self.get_url()}
        serialized['title'] = self.get_title().to_dict()
        serialized['content'] = self.get_content().to_dict()
        serialized['keywords'] = self.get_keywords()
        serialized['creator'] = self.get_creator()
        serialized['published'] = self.get_publish_datetime()
        serialized['category'] = self.get_category()
        serialized['locale'] = self.get_locale().to_dict()
        return serialized

# Utility

In [None]:
# Build the language-name -> language-code lookup table used by
# lookup_language_code. Both sides are lower-cased and stripped so lookups are
# insensitive to case and surrounding whitespace.
# newline='' is what the csv module asks for when handed a file object;
# without it, newlines embedded in quoted fields are not interpreted correctly.
with open('docs_languages.csv', newline='') as f:
    records = csv.DictReader(f)
    records_sanitized = (
        (row['Language'].lower().strip(), row['Language Code'].lower().strip())
        for row in records
    )
    # Consume the generator while the file is still open.
    indexed_languages = dict(records_sanitized)

In [None]:
def lookup_language_code(full_name):
    """Return the language code stored for a language's full name.

    The name is lower-cased and stripped before the lookup in the module-level
    `indexed_languages` table; a name missing from that table raises KeyError.
    """
    return indexed_languages[full_name.lower().strip()]

In [None]:
class TranslateChunker:
    """Incrementally split text into space-delimited chunks of bounded length.

    Usage: feed characters one at a time with `process`, call `finish` exactly
    once, then read the result with `get_chunks`. Only the space character
    (' ') is treated as a word boundary. A single word longer than `max_size`
    is truncated to `max_size` characters.
    """

    def __init__(self, max_size=950):
        """Create an empty chunker.

        Args:
            max_size: Maximum length, in characters, of any returned chunk.
                The default of 950 is presumably chosen to stay under a
                translation request limit -- TODO confirm.
        """
        self._current_word = ''
        self._current_chunk = ''
        self._chunks = []
        self._max_size = max_size
        self._finished = False

    def process(self, char):
        """Consume one character; a space closes the word being built."""
        assert not self._finished

        if char == ' ':
            self._accept_current_word()
        else:
            self._current_word += char

    def finish(self):
        """Flush the pending word and chunk; no further input is accepted."""
        assert not self._finished
        self._accept_current_word()
        self._accept_current_chunk()
        self._finished = True

    def get_chunks(self):
        """Return an iterator over the non-empty chunks, in input order.

        May only be called after `finish`.
        """
        assert self._finished

        def is_ok_size(target):
            # The upper bound is inclusive: a chunk holding one word of exactly
            # max_size characters (or a longer word truncated to max_size) is
            # valid and must not be dropped.
            return 0 < len(target) <= self._max_size

        return filter(is_ok_size, self._chunks)

    def _accept_current_chunk(self):
        """Store the chunk being built (stripped) and start a new one."""
        self._chunks.append(self._current_chunk.strip())
        self._current_chunk = ''

    def _accept_current_word(self):
        """Append the pending word to the current chunk, rolling over if full."""
        if len(self._current_word) > self._max_size:
            self._current_word = self._current_word[:self._max_size]

        # +1 accounts for the separating space added below.
        possible_size = len(self._current_chunk) + len(self._current_word) + 1
        if possible_size > self._max_size:
            self._accept_current_chunk()

        self._current_chunk += ' ' + self._current_word
        self._current_word = ''

In [None]:
class QueryFacade:
    """Facade combining newsdata.io HTTP queries with AWS Translate.

    Provides sampling of news sources and archived articles for a country and
    translation of text through a boto3 'translate' client.
    """

    def __init__(self, news_data_key, aws_key, news_data_endpoint=None, aws_region=None):
        """Store credentials and build the translate client.

        Args:
            news_data_key: API key sent as the 'apikey' request parameter.
            aws_key: Mapping with 'access' and 'secret' entries used as the
                AWS access key id and secret access key.
            news_data_endpoint: Base URL of the news API; defaults to
                'https://newsdata.io/api/1/'.
            aws_region: AWS region for the translate client; defaults to
                'us-east-2'.
        """
        self._translate_cache = {}

        self._news_data_key = news_data_key
        self._aws_key = aws_key

        if news_data_endpoint is None:
            news_data_endpoint = 'https://newsdata.io/api/1/'

        self._news_data_endpoint = news_data_endpoint

        if aws_region is None:
            aws_region = 'us-east-2'

        self._aws_region = aws_region

        self._translate_client = self._build_translate_client()

    def sample_sources(self, country):
        """Return a list of distinct `Source` objects for a country.

        Queries the 'news' endpoint with progressively looser filters (top
        priority domains in the 'top' category first, then medium priority,
        then the same two without the category filter) and stops issuing
        requests once at least five distinct sources have been collected.
        Sources are de-duplicated by name across all requests.

        Raises:
            RuntimeError: If a request returns a non-200 status code.
            KeyError: If a record's language name is not in the lookup table.
        """
        min_sources = 5

        def parse_record(record):
            name = record['source_id']
            # Only the host part of the article link identifies the source.
            url = urllib.parse.urlparse(record['link']).netloc
            categories = record['category']
            language = lookup_language_code(record['language'])
            return Source(name, url, categories, language, country)

        def get_for_priority(priority, top=True):
            params = {
                'prioritydomain': priority,
                'country': country,
                # Use the key handed to the constructor, not a notebook global.
                'apikey': self._news_data_key
            }

            if top:
                params['category'] = 'top'

            response = requests.get(
                self._news_data_endpoint + 'news',
                params=params
            )

            if response.status_code != 200:
                raise RuntimeError('Error (%d): %s' % (response.status_code, response.text))

            return [parse_record(record) for record in response.json()['results']]

        attempts = [
            ('top', True),
            ('medium', True),
            ('top', False),
            ('medium', False)
        ]

        seen_names = set()
        sources = []
        for priority, top in attempts:
            if len(sources) >= min_sources:
                break

            # Later, looser queries overlap with earlier ones, so names are
            # tracked across requests to keep each source only once.
            for candidate in get_for_priority(priority, top=top):
                if candidate.get_name() not in seen_names:
                    seen_names.add(candidate.get_name())
                    sources.append(candidate)

        return sources

    def sample_articles(self, country='gb', language='en', year=2023, month=6,
        query='Food', domain=None):
        """Return an iterator of `Article` objects from the 'archive' endpoint.

        The query string is translated from English into `language` (cached)
        before the request is made. The date range covers the whole calendar
        month given by `year` and `month`. Each article's title and combined
        content/description are translated from `language` to English.

        The result is a lazy `map`: records are parsed, and their text
        translated, only as the iterator is consumed, and it can be consumed
        only once.

        Args:
            country: Country code sent as the 'country' parameter.
            language: Language code sent as the 'language' parameter.
            year: Year of the month to sample.
            month: Month (1-12) to sample.
            query: English search text.
            domain: Optional value for the 'domainurl' parameter.

        Raises:
            RuntimeError: If the request returns a non-200 status code.
        """
        from_date = datetime.date(year, month, 1)
        to_date_exclusive = from_date + dateutil.relativedelta.relativedelta(months=1)
        # Last day of the month: first day of the next month minus one day.
        to_date = to_date_exclusive + dateutil.relativedelta.relativedelta(days=-1)
        query_translated = self.translate(query, to=language, cache=True)

        params = {
            'country': country,
            'language': language,
            'from_date': from_date,
            'to_date': to_date,
            'q': query_translated.get_translated(),
            # Use the key handed to the constructor, not a notebook global.
            'apikey': self._news_data_key
        }

        if domain:
            params['domainurl'] = domain

        response = requests.get(
            self._news_data_endpoint + 'archive',
            params=params
        )

        if response.status_code != 200:
            raise RuntimeError('Error (%d): %s' % (response.status_code, response.text))

        results = response.json()['results']

        locale = Locale(country, language)

        def parse_result(record):
            url = record['link']
            keywords = record['keywords']
            creator = record['creator']
            publish_datetime = record['pubDate']
            category = record['category']

            title_untranslated = record['title']

            # Either text field may be None; join whichever are present.
            content_pieces = [record['content'], record['description']]
            content_untranslated = ' '.join(
                piece for piece in content_pieces if piece is not None
            )

            title = self.translate(title_untranslated, source=language)
            content = self.translate(content_untranslated, source=language)

            return Article(
                url,
                title,
                keywords,
                creator,
                content,
                publish_datetime,
                category,
                locale
            )

        return map(parse_result, results)

    def translate(self, target, source='en', to='en', cache=False):
        """Translate `target` from `source` to `to` and return a `TranslatedText`.

        When `source` equals `to`, the text is returned untranslated and no
        service call is made. With `cache=True`, results are memoised per
        (text, source, to) and caching is only permitted when one of the two
        languages is 'en'.

        Raises:
            AssertionError: If `cache` is true and neither language is 'en'.
        """
        if source == to:
            return TranslatedText(target, source, target, to)

        if not cache:
            return self._translate_force(target, source=source, to=to)

        assert 'en' in [source, to]

        # The source language is part of the key so that identical text coming
        # from two different languages never shares a cached translation.
        cache_key = (target, source, to)
        if cache_key not in self._translate_cache:
            self._translate_cache[cache_key] = self._translate_force(
                target,
                source=source,
                to=to
            )

        return self._translate_cache[cache_key]

    def _build_translate_client(self):
        """Create the boto3 'translate' client from the stored key and region."""
        return boto3.client(
            service_name='translate',
            region_name=self._aws_region,
            use_ssl=True,
            aws_access_key_id=self._aws_key['access'],
            aws_secret_access_key=self._aws_key['secret']
        )

    def _translate_force(self, target, source='en', to='en'):
        """Translate `target` through the translate client, bypassing the cache.

        The text is split into chunks with `TranslateChunker`, each chunk is
        sent in its own `translate_text` call, and the translated chunks are
        re-joined with single spaces.
        """
        chunker = TranslateChunker()

        for char in target:
            chunker.process(char)

        chunker.finish()

        pieces_translated = []
        for piece in chunker.get_chunks():
            response = self._translate_client.translate_text(
                Text=piece,
                SourceLanguageCode=source,
                TargetLanguageCode=to,
            )
            pieces_translated.append(response['TranslatedText'])

        return TranslatedText(
            target,
            source,
            ' '.join(pieces_translated),
            to
        )

In [None]:
# Load the news API key from a local text file (surrounding whitespace removed)
# so the key itself never appears in the notebook.
with open('newsdata_key.txt') as f:
    newsdata_key = f.read().strip()

In [None]:
# Load the AWS credentials from a local JSON file. QueryFacade reads the
# 'access' and 'secret' entries of this mapping when building its client.
with open('aws_key.json') as f:
    aws_key = json.load(f)

In [None]:
# Build the shared facade with the default endpoint and AWS region; the
# constructor also creates the boto3 translate client.
facade = QueryFacade(newsdata_key, aws_key)

# Test translation

In [None]:
# Smoke test of the public translate path: source defaults to 'en', target
# language is 'es', and caching is off. The cell displays the translated text.
target = 'I want to do some research.'
facade.translate(target, to='es').get_translated()

In [None]:
# Smoke test of the private helper directly: this bypasses both the
# same-language shortcut and the cache in translate().
target = 'Food'
facade._translate_force(target, to='es').get_translated()

# Sources

In [None]:
# Sample news sources for country code 'bd' (presumably Bangladesh -- confirm
# against the news API's country list).
response = facade.sample_sources('bd')

In [None]:
# Serialise first, then open the file: opening with 'w' truncates it
# immediately, so a failure during serialisation would otherwise leave an
# empty or partial sample_bd.json behind.
response_json = [x.to_dict() for x in response]
with open('sample_bd.json', 'w') as f:
    json.dump({'entries': response_json}, f, indent=2)
print('Wrote %d sources.' % len(response_json))

In [None]:
# Sample news sources for country code 'us'. Note this rebinds `response`,
# replacing the result of the previous sources query.
response = facade.sample_sources('us')

In [None]:
# Serialise first, then open the file: opening with 'w' truncates it
# immediately, so a failure during serialisation would otherwise leave an
# empty or partial sample_us.json behind.
response_json = [x.to_dict() for x in response]
with open('sample_us.json', 'w') as f:
    json.dump({'entries': response_json}, f, indent=2)
print('Wrote %d sources.' % len(response_json))

# English

In [None]:
# Sample articles with all defaults: country 'gb', language 'en', June 2023,
# query 'Food'. The result is a lazy iterator; records are parsed only when
# it is consumed.
response = facade.sample_articles()

In [None]:
# `response` is a one-shot lazy iterator: consuming it here is what parses
# (and, for non-English text, translates) each article, and re-running this
# cell without re-running the query cell would write zero articles.
# Consume it BEFORE opening the file: opening with 'w' truncates immediately,
# so a failure part-way through would otherwise leave an empty sample_en.json.
response_json = [x.to_dict() for x in response]
with open('sample_en.json', 'w') as f:
    json.dump({'entries': response_json}, f, indent=2)
print('Wrote %d articles.' % len(response_json))

# Spanish

In [None]:
# Sample articles for country 'mx' in language 'es'; the default query 'Food'
# is translated to 'es' before the request. The result is a lazy iterator, so
# article translation happens only when it is consumed.
response = facade.sample_articles(country='mx', language='es')

In [None]:
# `response` is a one-shot lazy iterator: consuming it here triggers the
# per-article translation calls, and re-running this cell without re-running
# the query cell would write zero articles.
# Consume it BEFORE opening the file: opening with 'w' truncates immediately,
# so a failed translation call would otherwise leave an empty sample_es.json.
response_json = [x.to_dict() for x in response]
with open('sample_es.json', 'w') as f:
    json.dump({'entries': response_json}, f, indent=2)
print('Wrote %d articles.' % len(response_json))