In [None]:
from pymongo import MongoClient
from datetime import datetime
from collections import defaultdict

# --- Notebook configuration -------------------------------------------------
# Name of the MongoDB database that holds both collections used below.
mongo_db_name = "kiev"
# Collection read into `articles_kiev` (the per-article records).
mongo_collection_article_kiev = "articles"
# Collection read into `articles_onu` (records providing 'death' / 'injuried').
mongo_collection_article_onu = "onuData"
# MongoDB connection string handed to MongoClient.
# NOTE(review): only the scheme is present here — presumably the host and
# credentials were stripped before sharing; fill in before running.
mongo_url = "mongodb://"
# Maximum number of tags kept per date after sorting by count.
number_of_tags = 10


In [None]:
from pymongo import MongoClient

def connect_mongo(mongo_url, mongo_db_name):
    """Return a handle to the database ``mongo_db_name`` on the server at ``mongo_url``.

    Args:
        mongo_url: Connection string passed unchanged to ``MongoClient``.
        mongo_db_name: Name of the database to select on that client.

    Returns:
        The object obtained by indexing the client with ``mongo_db_name``.
    """
    return MongoClient(mongo_url)[mongo_db_name]

def get_collection(mongo_db, collection_name):
    """Look up ``collection_name`` on ``mongo_db`` using subscript access.

    Args:
        mongo_db: Any object supporting ``mongo_db[name]``.
        collection_name: Key identifying the collection.

    Returns:
        Whatever ``mongo_db[collection_name]`` evaluates to.
    """
    collection = mongo_db[collection_name]
    return collection

def get_all_from_collection(collection):
    """Fetch every document of ``collection`` into a list.

    Calls ``collection.find({})`` (an empty filter) and materializes the
    returned iterable in full, so the whole result set is held in memory.

    Args:
        collection: Object exposing a ``find(filter)`` method that returns
            an iterable of documents.

    Returns:
        A list containing every item yielded by ``find({})``.
    """
    cursor = collection.find({})
    return [document for document in cursor]

def append_tags(old_tags, new_tags):
    """Merge tag titles into a running list of ``{'title', 'count'}`` entries.

    For each title in ``new_tags``:

    * if no entry in ``old_tags`` has that title, a new entry with
      ``count`` 1 is appended;
    * otherwise the first matching entry is removed and a copy with
      ``count`` incremented by one is appended at the end of the list
      (so a bumped tag always moves to the tail).

    The matching entry is removed by position rather than by rebuilding an
    equal dict, so entries that carry additional keys are handled (and keep
    those keys) instead of raising ``ValueError``.

    Args:
        old_tags: List of tag dicts, each with at least ``'title'`` and
            ``'count'``. Modified in place.
        new_tags: Iterable of tag titles to add; ``None`` is treated as
            "nothing to add".

    Returns:
        The same ``old_tags`` list object, after the merge.
    """
    if new_tags is None:
        return old_tags

    for title in new_tags:
        for position, entry in enumerate(old_tags):
            if entry['title'] == title:
                # Copy instead of mutating so dicts referenced elsewhere
                # are left untouched, as in the remove-and-append original.
                bumped = dict(entry)
                bumped['count'] = entry['count'] + 1
                del old_tags[position]
                old_tags.append(bumped)
                break
        else:
            # No entry with this title yet.
            old_tags.append({'title': title, 'count': 1})

    return old_tags

def clean_tags(tags, number_of_tags):
    """Return the ``number_of_tags`` entries of ``tags`` with the highest count.

    The input is not modified. Ordering is by ``'count'`` descending; the
    sort is stable, so entries with equal counts keep their input order.

    Args:
        tags: Iterable of dicts that each have a ``'count'`` key.
        number_of_tags: Upper bound of the slice applied to the ordered list.

    Returns:
        A new list holding the leading ``number_of_tags`` entries.
    """
    ranked = list(tags)
    ranked.sort(key=lambda tag: tag['count'], reverse=True)
    top = ranked[:number_of_tags]
    return top

def normalize_date(dt):
    """Return ``dt`` with its time-of-day fields reset to zero.

    Args:
        dt: Object with a ``replace`` method accepting ``hour``, ``minute``,
            ``second`` and ``microsecond`` keyword arguments.

    Returns:
        The result of ``dt.replace(...)`` with those four fields set to 0;
        all other fields are left as ``replace`` leaves them.
    """
    midnight_fields = {'hour': 0, 'minute': 0, 'second': 0, 'microsecond': 0}
    return dt.replace(**midnight_fields)

def should_count_death(article):
    """Tell whether ``article`` has a truthy ``'deaths'`` or ``'deathsV2'`` value.

    Args:
        article: Mapping-like object with a ``get`` method.

    Returns:
        ``True`` if either field is present and truthy, ``False`` otherwise.
    """
    death_fields = ('deaths', 'deathsV2')
    return any(article.get(field) for field in death_fields)

def process_article_tags(existing_tags, article):
    """Fold the tags of one article into ``existing_tags``.

    Reads the article's ``'customTagSMV2'`` field. When it is missing or
    empty, ``existing_tags`` is returned untouched; otherwise the result of
    ``append_tags(existing_tags, <that field>)`` is returned.

    Args:
        existing_tags: Running list of tag entries.
        article: Mapping-like object with a ``get`` method.

    Returns:
        The tag list after accounting for this article.
    """
    article_tags = article.get('customTagSMV2')
    if not article_tags:
        return existing_tags
    return append_tags(existing_tags, article_tags)

def get_article_kiev_aggregated(articles_kiev, number_of_tags):
    """Summarize articles per calendar day.

    Articles are bucketed by ``normalize_date(article['date'])``; articles
    whose ``'date'`` is missing or falsy are skipped. For every bucket one
    summary dict is produced with:

    * ``'articlesCount'`` - number of articles in the bucket,
    * ``'tags'`` - tags accumulated via ``process_article_tags`` and, when
      non-empty, trimmed with ``clean_tags(..., number_of_tags)``,
    * ``'date'`` - the normalized bucket date,
    * ``'deathCounts'`` - number of articles for which
      ``should_count_death`` is true.

    Args:
        articles_kiev: Iterable of mapping-like article records.
        number_of_tags: Maximum number of tags kept per day.

    Returns:
        A list of summary dicts, in the order the days were first seen.
    """
    by_day = defaultdict(list)
    for record in articles_kiev:
        raw_date = record.get('date')
        if not raw_date:
            continue
        by_day[normalize_date(raw_date)].append(record)

    summaries = []
    for day, day_articles in by_day.items():
        day_tags = []
        deaths = 0

        for record in day_articles:
            day_tags = process_article_tags(day_tags, record)
            if should_count_death(record):
                deaths += 1

        # Only trim when something was collected.
        if day_tags:
            day_tags = clean_tags(day_tags, number_of_tags)

        summaries.append({
            'articlesCount': len(day_articles),
            'tags': day_tags,
            'date': day,
            'deathCounts': deaths,
        })

    return summaries

def aggregate_data_with_onu_and_sort(aggregated_list, articles_onu):
    """Attach ONU figures to the daily summaries and sort them by date.

    Each summary is shallow-copied. When an ONU record has a ``'date'``
    equal to the summary's ``'date'``, the copy receives that record's
    ``'death'`` and ``'injuried'`` values (``None`` when the record lacks
    the field). If several ONU records share a date, the last one wins.
    Summaries with no matching ONU record get neither key.

    ONU records are indexed by date once up front, so the join is a single
    pass over each input rather than a rescan of ``articles_onu`` for every
    summary.

    NOTE(review): dates are matched by plain equality and ONU dates are not
    normalized here - an ONU date carrying a time-of-day would never match a
    midnight-normalized summary date. Confirm how ``onuData`` stores dates.

    Args:
        aggregated_list: Iterable of summary dicts, each with a ``'date'``.
        articles_onu: Iterable of mapping-like ONU records.

    Returns:
        A new list of summary copies sorted ascending by ``'date'``.
    """
    # Later records overwrite earlier ones, i.e. the last match wins.
    onu_by_date = {}
    for onu_article in articles_onu:
        onu_by_date[onu_article.get('date')] = onu_article

    no_match = object()  # sentinel: distinguishes "no record" from any stored value
    updated_list = []

    for agg in aggregated_list:
        new_agg = agg.copy()
        onu_article = onu_by_date.get(agg['date'], no_match)

        if onu_article is not no_match:
            new_agg['death'] = onu_article.get('death')
            new_agg['injuried'] = onu_article.get('injuried')

        updated_list.append(new_agg)

    return sorted(updated_list, key=lambda item: item['date'])

In [None]:
# Open the database and grab handles to the two collections named in the
# configuration cell.
mongo_db = connect_mongo(mongo_url, mongo_db_name)
collection_article_kiev = get_collection(mongo_db, mongo_collection_article_kiev)
collection_article_onu = get_collection(mongo_db, mongo_collection_article_onu)

# Load both collections fully into memory as lists of documents.
articles_kiev = get_all_from_collection(collection_article_kiev)
articles_onu = get_all_from_collection(collection_article_onu)

# Build per-day summaries from the articles, then attach the ONU
# 'death' / 'injuried' values for matching dates and sort by date.
list_aggregated = get_article_kiev_aggregated(articles_kiev, number_of_tags)
list_aggregated_with_onu_sorted = aggregate_data_with_onu_and_sort(list_aggregated, articles_onu)


In [None]:
# Display the final, date-sorted list of per-day summaries as the cell output.
list_aggregated_with_onu_sorted

[{'articlesCount': 7,
  'tags': [{'title': 'share', 'count': 7},
   {'title': 'twitter', 'count': 7},
   {'title': 'facebook', 'count': 7},
   {'title': 'linkedin', 'count': 7},
   {'title': 'emailthe', 'count': 5},
   {'title': 'company', 'count': 2},
   {'title': 'ukrainian', 'count': 2},
   {'title': 'russian', 'count': 2},
   {'title': 'responsible', 'count': 1},
   {'title': 'buying', 'count': 1}],
  'date': datetime.datetime(2021, 11, 14, 0, 0),
  'deathCounts': 0},
 {'articlesCount': 7,
  'tags': [{'title': 'share', 'count': 7},
   {'title': 'twitter', 'count': 7},
   {'title': 'facebook', 'count': 7},
   {'title': 'linkedin', 'count': 7},
   {'title': 'ukrainian', 'count': 3},
   {'title': 'ukraine', 'count': 3},
   {'title': 'state', 'count': 2},
   {'title': 'own', 'count': 2},
   {'title': 'russia', 'count': 2},
   {'title': 'general', 'count': 2}],
  'date': datetime.datetime(2021, 11, 15, 0, 0),
  'deathCounts': 0},
 {'articlesCount': 9,
  'tags': [{'title': 'share', 'coun