In [35]:
import pandas as pd
import geopandas as gpd
import re
import time
import calendar

Import new data and already geolocated data

If new data is not in older dataset, needs to be geocoded

Sorts new data requiring geocoding

Outputs into two datasets

Load datasets

In [36]:
source_df = pd.read_csv("../poisk_in_ua/poisk_in_ua_messages.csv")
source_df.set_index("id")
with open("../to_geocode_datasets/1686573190-to_reparse.csv", encoding="utf-8-sig") as contents:
    previous_update = gpd.read_file(contents, driver="GeoJSON")


  source_df = pd.read_csv("../poisk_in_ua/poisk_in_ua_messages.csv")


Sort new data and old data

In [37]:
merged_df = source_df.merge(previous_update.drop_duplicates(), on=['id','id'], how='left', indicator=True)
merged_df = merged_df[merged_df["message_x"].str.contains("погиб", case = False, na = False)]
merged_df = merged_df[~merged_df["message_x"].str.contains("Не идентифицирован", case = False, na = False)]
new_locations = merged_df
new_locations = new_locations[new_locations['_merge'] == 'left_only']
new_locations.rename(columns={"message_x":"message"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_locations.rename(columns={"message_x":"message"}, inplace = True)


In [38]:
new_locations = new_locations[["id", "date_x", "message"]]
new_locations = new_locations.rename(columns={"date_x" : "date"})
new_locations = new_locations.dropna(subset=['message'])

Sort geolocatable data based on information in the messages

In [39]:
def cleanup(msg):
    replacement_list = [ "Респ.", "респ.", "н.п", "аул ","Пос.","пгт.", "сл.", "д.","п.","ст.", "с.", "р.п.",  "г.", 'ЧВК "Вагнер"', "ул.", "х.", "c.", "р.", "сь.", "Пгт." ,"Хут.", "по ","им."]
    for entry in replacement_list:
        if entry.lower() in msg:
            msg = msg.lower().replace(entry.lower(), "")

    rephrasings = ( ["ской ", "ская "], ["области", "область"], ["ской области", "ская область"], ["цкой области", "цкая область"], ["р-н", "район"], ["ского района", "ский район"] )
    for rephrasing in rephrasings:
        if rephrasing[0].lower() in msg.lower():
            msg = msg.lower().replace(rephrasing[0].lower(), rephrasing[1].lower())
    return msg

def contains_village(msg:str, return_label=False):

    data:list = msg.split("\n")
    return_value = False
    for line in data:
        if "с." in line.lower() or "д." in line.lower() or "п." in line.lower() or "ст." in line.lower() and len(line.lower()) < 250:
            return_value = True
            if return_label:
                return_value = (True, cleanup(line))
            break
    return return_value

def contains_city(msg:str, return_label=False):
    data:list = msg.split("\n")
    return_value = False       
    for line in data:
        if "г." in line.lower() and "г.р." not in line.lower() and len(line.lower()) < 250 and re.search("\d{2}\.\d{2}\.\d{4}", line.lower()) == None:
            return_value = True
            if return_label:
                return_value = (True, cleanup(line))
                break
    return return_value

def contains_subject(msg, return_label=False):
    data:list = msg.split("\n")
    subject_list=["край","республик","респ.","округ","област","обл.", "район","р-н"]
    for line in data:
        for subject in subject_list:
            if subject in line.lower() and len(line.lower()) < 250:
                if return_label == False:
                    return True
                else:
                    return (True, cleanup(line))
            
    return False

def contains_wagner(msg, return_label=False):
    data:list = msg.split("\n")
    filter_str = [ 'ЧВК "Вагнер"', 'ЧВК']
    for line in data:
        for f_str in filter_str:
            if f_str.lower() in line.lower() and len(line.lower()) < 250:
                if return_label == False:
                    return True
                else:
                    return (True, msg)

Launch sorting

In [40]:
def launch_geolocation_sorting(input_dataset, message_key="message"):
    
    areas_to_geolocate = []
    not_geolocated = []

    for entry in input_dataset.T.to_dict().values():
        msg = entry[message_key]

        if contains_village(msg):
            areas_to_geolocate.append(
                    {
                        "id" : entry['id'],
                        "location" : contains_village(msg, return_label=True)[1],
                        "message" : msg,
                        "tag" : "village",
                        "date": entry["date"]
                    }
                )
            continue

        elif contains_city(msg):
            areas_to_geolocate.append(
                    {          
                        "id" : entry['id'],
                        "location" : contains_city(msg, return_label=True)[1],
                        "message" : msg,
                        "tag" : "city",
                        "date": entry["date"]
                    }
                )
            continue

        elif contains_subject(msg):
            areas_to_geolocate.append(
                    {
                        "id" : entry['id'],
                        "location" : contains_subject(msg, return_label=True)[1],
                        "message" : msg,
                        "tag" : "county",
                        "date": entry["date"]
                    }
                )
            continue

        else:
            not_geolocated.append({
                        "id" : entry['id'],
                        "message" : msg,
                        "date": entry["date"]
                    })

    return areas_to_geolocate, not_geolocated


Output sorted data

In [45]:
to_geolocate, failures = launch_geolocation_sorting(new_locations)


In [46]:
to_geolocate_df = pd.DataFrame(to_geolocate)
failures_df = pd.DataFrame(failures)
timestamp = calendar.timegm(time.gmtime())


In [47]:
to_geolocate_df.to_csv(f"../to_geocode_datasets/{timestamp}-@poisk_in_ua-to_geocode.csv", encoding="utf-8-sig")