In [None]:
# Librairies & paramètres
import pandas as pd
import geopandas as gpd
import os
import plotly.express as px
from shapely.geometry import Point

# Directory two levels above the current working directory; the data paths
# below are built relative to it.
chemin_actuel = os.getcwd()
dossier_parent = os.path.abspath(
    os.path.join(chemin_actuel, os.pardir, os.pardir)
)

# Central configuration read by the cells of this notebook.
param = {
    # Paths to the SRA files (input / output) and name of the countries dataset
    "GeoSRA DIR IN": f"{dossier_parent}/data/processed/geosra_coordprec.parquet.gzip",
    "GeoSRA DIR OUT": f"{dossier_parent}/data/processed/geosra_eval.parquet.gzip",
    "Country DIR": "naturalearth_lowres",

    # Minimum precision threshold for the coordinates
    "Seuil LonLatAc": 2,

    # Figure size
    "Width Figure": 800,
    "Height Figure": 600,

    # Colour maps
    "Georef Color": {
        "(?)": "lightgrey",
        "Has LonLat": "lightgreen",
        "Has Country": "yellow",
        "NR": "lightcoral",
    },
    "Binary Color": {
        "(?)": "lightgrey",
        "True": "lightcoral",
        "False": "lightgreen",
    },
}

In [None]:
# Load the input files
# Country polygons: resolve the dataset named in the configuration, then read it.
# NOTE(review): `gpd.datasets` was deprecated in GeoPandas 0.14 and removed in
# 1.0 — confirm the environment's GeoPandas version still provides it.
file = gpd.datasets.get_path(param["Country DIR"])
countries = gpd.read_file(file)

# SRA table produced upstream, read from the configured input path.
geosra_pd = pd.read_parquet(param["GeoSRA DIR IN"])

# Preview of the first rows as the cell output.
geosra_pd.head()

In [None]:
# Country conflicts per year: compare each record's `rg_country` value with the
# `name` of the country polygon that contains the record's coordinates.

# Take an explicit copy of the rows that have coordinates, so that adding the
# geometry column does not write into a slice of `geosra_pd`
# (avoids pandas' SettingWithCopyWarning).
df_latlon = geosra_pd[geosra_pd['has_latlon'] == True].copy()
df_latlon['geometry'] = [
    Point(xy) for xy in zip(df_latlon['longitude'], df_latlon['latitude'])
]
gdf_latlon = gpd.GeoDataFrame(df_latlon, geometry='geometry')
gdf_latlon.crs = 'EPSG:4326'

# Left spatial join keeps every point; a point that intersects several polygons
# yields several rows, so only the first match per accession is kept.
join = gdf_latlon.sjoin(countries, how='left', predicate='intersects')
join = join.drop_duplicates(subset='acc')

# Translation of `rg_country` spellings into the spellings used by the
# `name` column of the countries layer.
countries_dic = {
    'United States': 'United States of America',
    'Korea, Republic of': 'South Korea',
    'Hong Kong': 'China',
    'Russian Federation': 'Russia',
    # Fixed: this entry used to map to the continent 'Asia', which can never
    # equal a country name and flagged every Iranian record as a conflict.
    'Iran, Islamic Republic of': 'Iran',
}

# Only rewrite rows for which the spatial join actually found a country.
for country, nom_couche_pays in countries_dic.items():
    condition = (join["rg_country"] == country) & (join["name"].notnull())
    join.loc[condition, "rg_country"] = nom_couche_pays

# The flag is stored as the strings 'True' / 'False' because they are the keys
# of param["Binary Color"]. Rows without a matched polygon (`name` is null)
# also compare unequal and are therefore flagged 'True'.
join["conflict_country"] = 'False'
join.loc[join["rg_country"] != join["name"], "conflict_country"] = 'True'

fig = px.histogram(
    join,
    x="release_year",
    color="conflict_country",
    color_discrete_map=param["Binary Color"],
)
fig.update_layout(
    width=param["Width Figure"],
    height=param["Height Figure"],
)
fig.show()

In [None]:
# Treemap of the data evaluation

# Bring the per-accession conflict flag computed from the spatial join into the
# main table (indexed by 'acc' from here on).
to_join = join[["acc", "conflict_country"]]
geosra_pd = geosra_pd.set_index('acc').join(to_join.set_index('acc'))

# Precision threshold comes from the configuration instead of a literal.
seuil_precision = param["Seuil LonLatAc"]
sans_latlon = geosra_pd["has_latlon"].isnull()

# NR: no coordinates and no country information at all.
condition_NR = (
    sans_latlon
    & geosra_pd["geo_loc_name_country_calc"].isnull()
    & (geosra_pd["rg_country"] == '')
)
# INV: no coordinates, but not in the NR group. The previous expression was
# true for every row without coordinates, so it also covered (and relabelled)
# all NR rows; NR is now excluded explicitly.
condition_INV = sans_latlon & ~condition_NR
# IMP: coordinates present but latitude or longitude precision at/below the threshold.
condition_IMP = geosra_pd["has_latlon"].notnull() & (
    (geosra_pd["latitude_precision"] <= seuil_precision)
    | (geosra_pd["longitude_precision"] <= seuil_precision)
)
# CON: the conflict flag is stored as the strings 'True' / 'False', so it must
# be compared with the string (comparing with the boolean True never matches).
condition_CON = geosra_pd["conflict_country"] == 'True'

# Later assignments override earlier ones, so a row that is both imprecise and
# in conflict ends up labelled 'Conflit Pays'.
geosra_pd["eval_data"] = 'Utilisable'
geosra_pd.loc[condition_NR, "eval_data"] = 'Aucun Renseignement'
geosra_pd.loc[condition_INV, "eval_data"] = 'Inverifiable'
geosra_pd.loc[condition_IMP, "eval_data"] = 'Imprecis'
geosra_pd.loc[condition_CON, "eval_data"] = 'Conflit Pays'

# Count the final labels rather than the raw masks: each row is counted exactly
# once, so the five tiles always add up to the total number of rows.
comptes = geosra_pd["eval_data"].value_counts()
nb_NR = int(comptes.get('Aucun Renseignement', 0))
nb_INV = int(comptes.get('Inverifiable', 0))
nb_IMP = int(comptes.get('Imprecis', 0))
nb_CON = int(comptes.get('Conflit Pays', 0))
good = int(comptes.get('Utilisable', 0))

# One entry per tile: (display label, size, colour). Labels and colour keys are
# derived from the same list so they cannot drift apart.
racine = f"all\n {len(geosra_pd)}"
tuiles = [
    (f"Aucun Renseignement\n {nb_NR}", nb_NR, 'dimgray'),
    (f"Invérifiables\n {nb_INV}", nb_INV, 'lightcoral'),
    (f"Imprécis\n {nb_IMP}", nb_IMP, 'lemonchiffon'),
    (f"Conflit de Pays\n {nb_CON}", nb_CON, 'violet'),
    (f"Utilisables\n {good}", good, 'lightgreen'),
]
labels = [etiquette for etiquette, _, _ in tuiles]
sizes = [taille for _, taille, _ in tuiles]

treemap_pd = pd.DataFrame(dict(labels=labels, sizes=sizes))
treemap_pd["all"] = racine

fig = px.treemap(
    treemap_pd,
    path=['all', 'labels'],
    values='sizes',
    color='labels',
    color_discrete_map={
        racine: 'lightgrey',
        **{etiquette: couleur for etiquette, _, couleur in tuiles},
    },
)
fig.update_traces(root_color="black")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.update_layout(
    width=param["Width Figure"],
    height=param["Height Figure"],
)
fig.show()

In [None]:
# Save the potentially usable records
# Keep every row whose evaluation label is not 'Aucun Renseignement'.
condition_clean = (geosra_pd["eval_data"] != 'Aucun Renseignement')

# reset_index() moves the current index back into a regular column before writing.
geosra_propre = geosra_pd.loc[condition_clean].reset_index()

# NOTE(review): the output path ends in ".gzip" but no `compression` argument
# is passed here — confirm the codec actually written is the intended one.
geosra_propre.to_parquet(param["GeoSRA DIR OUT"])