In [None]:
import requests
import re
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import calendar
import urllib.request
import PyPDF2
import os
from datetime import datetime

# Search-results URL on the UN Ukraine site; get_onu_url() appends the page number to it.
data_onu_url = "https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page="
# Site root, prepended to the hrefs scraped from the result and article pages.
main_domain = "https://ukraine.un.org"
# Phrase that the lower-cased article title must contain for the article to be kept.
article_valid = "civilian casualties"

# MongoDB connection string (only the scheme is given here), database name and
# name of the collection in which the scraped articles are stored.
mongo_url = "mongodb://"
mongo_db = "kiev"
mongo_collection_article = "onuData"
# Keywords searched for in the PDF text; each keyword (stripped of whitespace)
# becomes a key in the extracted values.
# NOTE(review): the leading space in " men" presumably avoids matching inside "women" - confirm.
list_of_values_txt = ["killed"," men","women","girls","boys","children","adults","injured"]


In [None]:


def get_onu_url(page):
    """Return the search-results URL for the given page number."""
    return "{}{}".format(data_onu_url, page)

def find_last_page(onu_text):
    """Return the zero-based index of the last search-results page.

    The page is expected to contain exactly one summary of the form
    ``Showing 1 to <per_page> of <total> ...``. The three integers are read
    from the text that follows the marker, up to the next ``<``.

    Args:
        onu_text: raw HTML of the first search-results page.

    Returns:
        The index of the last page (pages are numbered from 0), or ``None``
        when the summary is missing, ambiguous or cannot be parsed.
    """
    marker = 'Showing 1 to'
    # Anything other than exactly one summary means the layout is not the one
    # this parser understands.
    if onu_text.count(marker) != 1:
        return None

    position = onu_text.find(marker)
    summary = onu_text[position:position + 100].split('<')[0].strip()

    numbers = []
    for token in summary.split(' '):
        try:
            numbers.append(int(token))
        except ValueError:
            # Words such as "Showing", "to", "of" are simply skipped.
            continue

    if len(numbers) != 3:
        return None

    _first_shown, per_page, total = numbers
    if per_page <= 0:
        # Guards the division below; a non-positive page size is unparseable.
        return None

    # Ceiling division minus one: 85 results at 10 per page -> last index 8,
    # 80 results at 10 per page -> last index 7 (not 8, which would be an
    # empty page).
    return max(0, (total - 1) // per_page)

def create_articles(title, url):
    """Build an article record: a dict holding ``title`` and ``url``."""
    return dict(title=title, url=url)

def get_articles(soup):
    """Collect the casualty-report articles linked from one results page.

    An anchor with class ``text-3xl`` is kept when its lower-cased text
    contains ``article_valid`` and at least one month name from the
    module-level ``my_calendar_list``.

    Args:
        soup: parsed results page (BeautifulSoup document).

    Returns:
        A list of article dicts as produced by ``create_articles``; the URL
        is ``main_domain`` followed by the anchor's ``href``.
    """
    found = []
    for link in soup.find_all('a', {"class": "text-3xl"}):
        title = link.text
        lowered = title.lower()
        if article_valid not in lowered:
            continue
        if not any(month in lowered for month in my_calendar_list):
            continue

        href = link.get("href")
        if href is None:
            # An anchor without href cannot be turned into a URL; skipping it
            # avoids a TypeError on the string concatenation below.
            continue
        found.append(create_articles(title, main_domain + href))
    return found

def append_article_to_list(article_list, articles):
    """Add every item of ``articles`` to ``article_list`` in place and return it."""
    article_list.extend(articles)
    return article_list

def get_all_articles():
    """Walk every search-results page and gather the matching articles.

    The first page is fetched once and reused both to find the last page
    index and as page 0 of the crawl. The URL of every later page is printed
    before it is requested, and the crawl pauses 5 seconds after each page.

    Returns:
        The list of article dicts found on all pages; empty when the last
        page index cannot be determined from the first page.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    request_timeout = 30  # seconds

    def fetch(page_url):
        return requests.get(page_url, timeout=request_timeout).text

    first_page_text = fetch(get_onu_url(0))
    last_page = find_last_page(first_page_text)

    article_list = []
    if last_page is None:
        return article_list

    for page in range(last_page + 1):
        if page == 0:
            page_text = first_page_text
        else:
            page_url = get_onu_url(page)
            print(page_url)
            page_text = fetch(page_url)

        articles = get_articles(BeautifulSoup(page_text, 'html5lib'))
        if len(articles) > 0:
            article_list = append_article_to_list(article_list, articles)

        # Pause between pages so the site is not hit in a tight loop.
        time.sleep(5)

    return article_list

def clean_article_list(article_list):
    """Drop articles whose ``url`` was already seen, keeping the first one.

    Args:
        article_list: list of article dicts, each with a ``url`` key.

    Returns:
        A new list with the first occurrence of every URL, in input order.
    """
    seen_urls = set()
    unique_articles = []
    for art in article_list:
        url = art['url']
        # Set membership replaces a rescan of the result list per article.
        if url in seen_urls:
            continue
        seen_urls.add(url)
        unique_articles.append(art)
    return unique_articles

def connect_mongo(mongo_url, mongo_db):
    """Create a MongoClient for ``mongo_url`` and return its ``mongo_db`` database."""
    return MongoClient(mongo_url)[mongo_db]

def get_collection(mongo_db_instance, collection_to_get):
    """Look up ``collection_to_get`` on the database object by subscript."""
    return mongo_db_instance[collection_to_get]

def insert_in_collection(list_to_add, collection):
    """Insert all documents of ``list_to_add`` into ``collection``.

    Args:
        list_to_add: list of documents (dicts) to store.
        collection: object exposing ``insert_many`` (a pymongo collection).

    An empty list is a no-op: pymongo's ``insert_many`` rejects an empty
    document list, and "nothing new to store" is the normal outcome when
    every article is already in the database.
    """
    if not list_to_add:
        return
    collection.insert_many(list_to_add)

def get_calendar_list():
    """Return the lower-cased month names from ``calendar.month_name``.

    ``calendar.month_name`` starts with an empty placeholder entry, which is
    filtered out.
    """
    return [name.lower() for name in calendar.month_name if len(name) > 0]

def get_download_link_from_url(url):
    """Find the single English download link on an article page.

    For every ``<i class="fa fa-download">`` icon, the element three levels
    up is inspected; if its text contains ``English`` and it holds an anchor
    with an ``href``, that link (prefixed with ``main_domain``) is a candidate.

    Args:
        url: address of the article page.

    Returns:
        The download URL when exactly one candidate is found, else ``None``.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html5lib')

    candidates = []
    for icon in soup.find_all('i', {"class": "fa fa-download"}):
        # Climb three ancestors, stopping early if the tree is shallower.
        container = icon
        for _ in range(3):
            container = container.find_parent()
            if container is None:
                break
        if container is None:
            continue

        if 'English' not in container.text:
            continue

        anchor = container.find('a')
        if anchor is None:
            continue

        href = anchor.get('href')
        if href is not None:
            candidates.append(main_domain + href)

    # Zero or several candidates are both treated as "no usable download".
    if len(candidates) == 1:
        return candidates[0]
    return None

def add_download_to_articles(all_articles):
    """Attach a ``download`` link to each article that has one.

    Each article is copied, its page is queried for a download link, and the
    copy is kept only when a link was found; otherwise a message is printed.
    The loop sleeps 5 seconds after every article.
    """
    enriched = []
    for entry in all_articles:
        entry_copy = entry.copy()
        page_url = entry['url']
        link = get_download_link_from_url(page_url)

        if link is None:
            print("Cannot download this article " + page_url)
        else:
            entry_copy['download'] = link
            enriched.append(entry_copy)
        time.sleep(5)
    return enriched

def remove_all_article_present(article_list):
    """Keep only the articles whose title is not yet in ``collection_article``.

    Progress is printed: first the number of articles checked, then one line
    per article title.
    """
    print("searching for duplicates : " + str(len(article_list)))
    not_stored = []
    for entry in article_list:
        print("searching for : " + entry['title'])
        cursor = collection_article.find({'title': entry['title']})

        # Pulling at most one document is enough to know whether a match exists.
        already_stored = any(True for _ in cursor)
        if not already_stored:
            not_stored.append(entry)
    return not_stored

def download_file(download_url, filename):
    """Download ``download_url`` to ``<filename>.pdf`` and return its date.

    Args:
        download_url: address of the file to fetch.
        filename: destination path without the ``.pdf`` extension.

    Returns:
        The ``Last-Modified`` response header parsed as a ``datetime``, or
        ``None`` when the header is absent or not in the expected
        ``'%a, %d %b %Y %H:%M:%S GMT'`` format.
    """
    # Context managers close the response and the file even if a read or
    # write fails part-way.
    with urllib.request.urlopen(download_url) as response:
        payload = response.read()
        last_modified = response.info().get('Last-Modified')

    with open(filename + ".pdf", 'wb') as pdf_file:
        pdf_file.write(payload)

    if last_modified is None:
        return None
    try:
        return datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S GMT')
    except ValueError:
        # Header present but in an unexpected format: the date is optional.
        return None

def search_death_regex(synonym, text):
    """Find numbers that appear shortly after or shortly before ``synonym``.

    The case-insensitive pattern has two alternatives, each with one capture
    group:
      1. ``synonym`` at a word boundary, up to 20 non-digits, then a number;
      2. a number, one character other than ``.``/``,``, up to 20 non-digits,
         then ``synonym``.
    A number is a digit followed by digits or commas, and must be followed by
    a character that is neither ``.`` nor ``,``.

    Args:
        synonym: keyword inserted verbatim into the pattern (not escaped).
        text: text to scan.

    Returns:
        ``re.findall`` output: a list of 2-tuples, one slot filled per match.
    """
    # Every fragment is a raw string so that \b and \D reach the regex engine
    # untouched instead of being treated as (invalid) string escapes.
    number = r"([0-9][0-9,]*)[^.,]"
    expression = (
        r"(?i)(?:\b" + synonym + r"\D{0,20})" + number
        + "|" + number + r"(?:\D{0,20}" + synonym + ")"
    )
    return re.findall(expression, text)

def search_death(article_body, to_find):
    """Return the non-empty captures found for ``to_find`` in ``article_body``.

    The tuples returned by ``search_death_regex`` are flattened into one list
    of strings, with empty captures dropped.
    """
    captured_numbers = []
    for groups in search_death_regex(to_find, article_body):
        if not groups:
            continue
        for captured in groups:
            if captured:
                captured_numbers.append(captured)
    return captured_numbers

def extract_text_from_last_file(filename):
    """Extract the first page of ``<filename>.pdf`` and its "total" rows.

    Newlines are removed from the extracted text, which is then split on
    ``-``; every chunk containing ``total`` is kept, cut at its first ``)``.

    Args:
        filename: path of the PDF without the ``.pdf`` extension.

    Returns:
        A tuple ``(rows, text)``: the selected chunks and the newline-free
        text of the first page.
    """
    # NOTE(review): PdfFileReader/getPage/extractText are the legacy PyPDF2
    # names; newer releases use PdfReader/pages/extract_text - confirm the
    # installed version before upgrading.
    # The with-block closes the file even if PyPDF2 raises while parsing.
    with open(filename + '.pdf', 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
        pdf_extracted = pdf_reader.getPage(0).extractText()

    pdf_extracted = pdf_extracted.replace('\n', '')
    rows_with_total = [
        chunk.split(')')[0]
        for chunk in pdf_extracted.split('-')
        if 'total' in chunk
    ]
    return rows_with_total, pdf_extracted

def get_dot_values(new_list_of_dot):
    """Turn each text row into a dict of keyword -> list of integers.

    Every keyword of ``list_of_values_txt`` is searched in the row; the
    captured numbers have their commas removed and are converted to ``int``.
    The dict key is the keyword with surrounding whitespace stripped.
    """
    parsed_rows = []
    for row in new_list_of_dot:
        per_keyword = {}
        for keyword in list_of_values_txt:
            raw_numbers = search_death(row, keyword)
            if raw_numbers is None:
                continue
            per_keyword[keyword.strip()] = [
                int(raw.replace(',', '')) for raw in raw_numbers
            ]
        if per_keyword:
            parsed_rows.append(per_keyword)
    return parsed_rows

def extract_from_object(my_object):
    """Return an 8-tuple of values for the fixed casualty keys.

    Order: killed, men, women, girls, boys, children, adults, injured.
    For each key: a one-element list yields its element, any other list is
    returned as is, and everything else (missing key, non-list value, lookup
    failure) yields ``None``.
    """
    ordered_fields = (
        'killed', 'men', 'women', 'girls',
        'boys', 'children', 'adults', 'injured',
    )

    picked = []
    for field in ordered_fields:
        chosen = None
        try:
            raw = my_object.get(field)
            if isinstance(raw, list):
                if len(raw) == 1:
                    chosen = raw[0]
                else:
                    chosen = raw
        except Exception:
            chosen = None
        picked.append(chosen)
    return tuple(picked)

def get_article_downloaded_value(new_article_list):
    """Download each article's PDF and store the parsed figures on it.

    For every article the PDF is saved as ``last.pdf``, read and deleted.
    If the first parsing pass yields no rows, a second pass based on
    ``extract_dots_v2`` is attempted. Exactly two rows are stored as
    ``death`` / ``injuried``; any other result is stored as ``deathValues``.
    ``createDate`` and ``fullDocument`` are always set. The article dicts are
    modified in place, and the loop sleeps 5 seconds after each one.
    """
    processed = []
    for article in new_article_list:
        pdf_stem = "last"
        created_at = download_file(article['download'], pdf_stem)
        rows, full_text = extract_text_from_last_file(pdf_stem)
        delete_file(pdf_stem)
        parsed = get_dot_values(rows)

        # Fallback parser, used only when the first pass found nothing.
        if len(parsed) == 0:
            parsed = get_dot_values(extract_dots_v2(full_text))

        if len(parsed) == 2:
            death_values, injuried_values = extract_values_from_dot_lists(parsed)
            article['death'] = death_values
            article['injuried'] = injuried_values
        else:
            article['deathValues'] = parsed

        article['createDate'] = created_at
        article['fullDocument'] = full_text
        processed.append(article)

        time.sleep(5)
    return processed


def extract_values_from_dot_lists(dot_lists):
    """Build the death and injury summaries from a two-row ``dot_lists``.

    Row 0 supplies the deaths (total taken from ``killed``), row 1 the
    injuries (total taken from ``injured``). Each summary is a dict with the
    keys total, men, women, girls, boys, children, adults, or ``None`` when
    all seven of its values are ``None``.

    Returns:
        The tuple ``(death_values, injuried_values)``.
    """
    breakdown_names = ('men', 'women', 'girls', 'boys', 'children', 'adults')

    def summarise(total, breakdown):
        if total is None and all(value is None for value in breakdown):
            return None
        summary = {'total': total}
        summary.update(zip(breakdown_names, breakdown))
        return summary

    killed_row = dot_lists[0]
    injured_row = dot_lists[1]
    # extract_from_object yields (killed, men, ..., adults, injured).
    killed_fields = extract_from_object(killed_row)
    injured_fields = extract_from_object(injured_row)

    death_values = summarise(killed_fields[0], killed_fields[1:7])
    injuried_values = summarise(injured_fields[7], injured_fields[1:7])
    return death_values, injuried_values
def extract_dots_v2(text):
    """Fallback row extraction based on sentences rather than dashes.

    The text is split on ``.``; sentences containing both ``killed`` and
    `` men`` are selected. When exactly one sentence qualifies it is split on
    ``)`` and only the pieces mentioning ``killed`` or ``injured`` are
    returned; otherwise the selected sentences are returned unchanged.
    """
    sentences = []
    for sentence in text.split('.'):
        if 'killed' in sentence and ' men' in sentence:
            sentences.append(sentence)

    if len(sentences) != 1:
        return sentences

    pieces = sentences[0].split(')')
    return [piece for piece in pieces if 'killed' in piece or 'injured' in piece]


def delete_file(filename):
    """Remove ``<filename>.pdf`` from disk."""
    pdf_path = filename + ".pdf"
    os.remove(pdf_path)


def update_date(updated_article_list):
    """Add a ``date`` parsed from the title to a copy of every article.

    The title is split on spaces; tokens that are integers or full month
    names are collected in order. When exactly three values are found they
    are read as day, month, year.

    Args:
        updated_article_list: list of article dicts with a ``title`` key.

    Returns:
        A new list of copied articles, each with ``date`` set to a
        ``datetime`` or ``None``. When the input is empty a message is
        printed and the input list itself is returned.
    """
    new_article_list = []
    for article in updated_article_list:
        new_article = article.copy()

        date_parts = []
        for token in article['title'].split(' '):
            try:
                date_parts.append(int(token))
                continue
            except ValueError:
                pass
            # strptime("%B") accepts full month names only, so no separate
            # month list (and no global defined in another cell) is needed.
            try:
                date_parts.append(datetime.strptime(token, "%B").month)
            except ValueError:
                pass

        article_date = None
        if len(date_parts) == 3:
            day, month, year = date_parts
            try:
                article_date = datetime(year, month, day)
            except (ValueError, OverflowError):
                # Out-of-range values, e.g. "31 February", yield no date.
                article_date = None

        new_article['date'] = article_date
        new_article_list.append(new_article)

    if new_article_list:
        return new_article_list

    print("CANNOT UPDATE DATE OR TITLE")
    return updated_article_list


def update_title(article_list):
    """Return copies of the articles with non-breaking spaces in the title
    replaced by ordinary spaces; the input dicts are left untouched."""
    def _with_plain_spaces(entry):
        clone = entry.copy()
        clone['title'] = entry['title'].replace('\xa0', ' ')
        return clone

    return [_with_plain_spaces(entry) for entry in article_list]

In [None]:
# Connect using the configured database name. The handle gets its own name so
# that the `mongo_db` setting is not overwritten and this cell can be re-run.
mongo_db_instance = connect_mongo(mongo_url, mongo_db)
# Read as a global by remove_all_article_present().
collection_article = get_collection(mongo_db_instance, mongo_collection_article)
# Read as a global by get_articles().
my_calendar_list = get_calendar_list()

# Crawl the search results, de-duplicate by URL, normalise titles and drop
# the articles that are already stored.
all_articles = get_all_articles()
all_articles = clean_article_list(all_articles)
new_article_list = update_title(all_articles)
new_article_list = remove_all_article_present(new_article_list)
print("articles to add : " + str(len(new_article_list)))
# Resolve each article's PDF link, parse the figures, derive the date, store.
new_article_list = add_download_to_articles(new_article_list)
updated_article_list = get_article_downloaded_value(new_article_list)
updated_article_list_v2 = update_date(updated_article_list)
insert_in_collection(updated_article_list_v2, collection_article)

https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=1
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=2
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=3
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=4
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=5
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=6
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=7
https://ukraine.un.org/en/search?key=Ukraine%20civilian%20casualties&page=8
searching for duplicates : 31
searching for : Ukraine: Civilian casualties as of 18 March 2022
searching for : Ukraine: Civilian casualties as of 15 March 2022
searching for : Ukraine: Civilian casualties as of 16 March 2022
searching for : Ukraine: Civilian casualties as of 21 March 2022
searching for : Ukraine: Civilian casualties as of 23 March 2022
searching for : Ukraine: Civilian cas