In [1]:
import pandas as pd
from graphique import *
import numpy as np
import datetime
from morse3 import Morse as m
import string, random
import json
from collections import defaultdict
import csv
from shapely.geometry import Point, Polygon

In [2]:
truth_file_url = "https://drive.usercontent.google.com/download?id=1KE4dJ_ArA7jhIUYmzITIYO7Yh60rQ0-K&export=download&authuser=2&confirm=t&uuid=5b074238-7709-408b-ac93-86cf839cdb07&at=APZUnTXJHY4NiV5TeGbmEtE-F6Ip:1699105244012"

# Charger le fichier de vérité dans un dataframe sans le télécharger
df = pd.read_csv("../truth_ground.csv", delimiter= '\t', header=None)    
df.columns = ["id","date", "long", "lat"]
print(df.head(2))
# Ajout de colonnes pour aider au traitement
df['isPoi'] = [False]*df.shape[0]
df['isTransitCity'] = [False]*df.shape[0]
df['night'] = [False]*df.shape[0]
df['work'] = [False]*df.shape[0]
df['weekend'] = [False]*df.shape[0]
df['semaine'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S").dt.isocalendar().week

columns_types = {'id' : np.int16, 'date': str, 'long': np.float32, 'lat': np.float32, 
                 'isPoi' : bool, 'night': bool, 'work': bool, 'weekend': bool, 'semaine': np.int16}
df = df.astype(columns_types)

# Bucketisation en tuiles représentant des quartiers (arrondi 0.01)
df['long'] = df['long'].apply(lambda x : round(x, 2))
df['lat'] = df['lat'].apply(lambda x : round(x, 2))
print(df.shape)
print(df.head(5))


   id                 date      long        lat
0   1  2015-03-04 00:35:16  4.870147  45.772140
1   1  2015-03-04 00:35:48  4.870218  45.772095
(34551849, 10)
   id                 date  long    lat  isPoi  isTransitCity  night   work  \
0   1  2015-03-04 00:35:16  4.87  45.77  False          False  False  False   
1   1  2015-03-04 00:35:48  4.87  45.77  False          False  False  False   
2   1  2015-03-04 00:35:49  4.87  45.77  False          False  False  False   
3   1  2015-03-04 00:35:50  4.87  45.77  False          False  False  False   
4   1  2015-03-04 00:35:52  4.87  45.77  False          False  False  False   

   weekend  semaine  
0    False       10  
1    False       10  
2    False       10  
3    False       10  
4    False       10  


In [3]:
#ids_to_treat = range(1, 113)
#df_to_treat = df[(df['id'].isin(ids_to_treat))]
df_to_treat = df

In [4]:
df_to_treat.loc[4570486]

id                                 7
date             2015-04-01 04:48:55
long                            4.93
lat                            45.78
isPoi                          False
isTransitCity                  False
night                          False
work                           False
weekend                        False
semaine                           14
Name: 4570486, dtype: object

In [5]:
# Découper en tuiles plus petites que les tuiles finales pour pouvoir supprimer des lignes de POI plus tard
# Déterminer les poi pour les garder intègres au maximum
# Décaler les heures de 6h tout en gardant la même durée totale passée dans ces poi -- Distribution normale...
from collections import defaultdict
import csv

WORK_START = datetime.time(9, 0)
WORK_END = datetime.time(16, 0)
WEEKEND_START = datetime.time(10, 0)
WEEKEND_END = datetime.time(18, 0)
NIGHT_START = datetime.time(22, 0)
NIGHT_END = datetime.time(6, 0)

def computePOIs(df_: pd.DataFrame, file_name):
    def timedelta_def(): return datetime.timedelta()
    def defaultdicttimedalta(): return defaultdict(timedelta_def)
    def defaultdictseption(): return defaultdict(defaultdicttimedalta)
    def returnnone(): return None
    maxdict = lambda dict: max(dict, key=lambda key: dict[key])
    def diff_time(key, time1, last_date_tab):
        if last_date_tab[key] is None:
            last_date_tab[key] = time1
            return datetime.timedelta()
        else:
            difference = time1 - last_date_tab[key]
            last_date_tab[key] = time1
            return difference
    def getMaxElement(theDict):
        result = defaultdict(timedelta_def)
        for _ in range(3):
            if len(theDict)==0:
                break
            key = maxdict(theDict)
            result[key] = theDict[key]
            del theDict[key]
        return result
    def track_deplacements(row, deplacements_par_horaire, last_date_original_tab):
        key = row[0]
        gps = (row[2], row[3])
        date_time = datetime.datetime.fromisoformat(row[1][:19])

        if date_time.weekday() < 5:
            if NIGHT_START < date_time.time() or date_time.time() < NIGHT_END:
                deplacements_par_horaire[key]['night'][gps] += diff_time(key, date_time, last_date_original_tab)
            elif WORK_START < date_time.time() < WORK_END:
                deplacements_par_horaire[key]['work'][gps] += diff_time(key, date_time, last_date_original_tab)
        else:
            if WEEKEND_START < date_time.time() < WEEKEND_END:
                deplacements_par_horaire[key]['weekend'][gps] += diff_time(key, date_time, last_date_original_tab) 
    
    #--------------------------------------------------------#
    # Détermination de l'horaire de chaque entrée du df
    #--------------------------------------------------------#
    tmp = pd.to_datetime(df_['date'])
    conditions = [
        (tmp.dt.weekday < 5) & ((tmp.dt.time > NIGHT_START) | (tmp.dt.time < NIGHT_END)),
        (tmp.dt.weekday < 5) & ((WORK_START < tmp.dt.time) & (tmp.dt.time< WORK_END)),
        (tmp.dt.weekday >= 5) & ((WEEKEND_START < tmp.dt.time) & (tmp.dt.time< WEEKEND_END))
    ]
    horaires = ['night', 'work', 'weekend']
    for condition, horaire in zip(conditions, horaires):
        df_.loc[condition, horaire] = True
    
    #--------------------------------------------------------#
    # Calcul des durées passées dans à position gps pour chaque id
    #--------------------------------------------------------#
    deplacements_par_horaire = defaultdict(defaultdictseption)
    last_date_original_tab = defaultdict(returnnone)
    
    fd_original = open(file_name, newline='')
    original_reader = csv.reader(fd_original, delimiter="\t")
    
    for row in original_reader:
        track_deplacements(row, deplacements_par_horaire, last_date_original_tab)
    
    #--------------------------------------------------------#
    # Calcul des positions POI de niveau 1
    #--------------------------------------------------------#
    final_tab = defaultdict(defaultdictseption)
    for id in deplacements_par_horaire:
        for type in deplacements_par_horaire[id]:
            final_tab[id][type] = getMaxElement(deplacements_par_horaire[id][type])

    poi = []
    for id in final_tab:
        vartmp = {'id': id, 'night': None, 'work': None, 'weekend': None, 'duree_night': None, 'duree_work': None, 'duree_weekend': None}
        for champ in final_tab[id]:
            vartmp[champ] = max(final_tab[id][champ], key=lambda x: final_tab[id][champ][x])
            delta_duree = final_tab[id][champ][vartmp[champ]]
            vartmp['duree_'+str(champ)] =  (delta_duree.days * 24 * 3600) + delta_duree.seconds
            isPoiCondition = (df_[champ]) & \
                (df_['id']==np.int16(id)) & \
                (df_['long']==float(vartmp[champ][0])) & \
                (df_['lat']==float(vartmp[champ][1]))
                
            df_.loc[isPoiCondition, 'isPoi'] = True  
        poi.append(vartmp)
    return poi

In [7]:
#df_to_treat = df
allPoi = computePOIs(df_to_treat, "../truth_ground.csv")
print(allPoi)
print(df_to_treat.head(5))
print(df_to_treat[df_to_treat['isPoi']==True].shape[0], df_to_treat.shape[0])

[{'id': '1', 'night': ('4.86997166666667', '45.7702066666667'), 'work': ('1.486815', '43.572505'), 'weekend': ('4.87544833333333', '45.786505'), 'duree_night': 112332, 'duree_work': 186802, 'duree_weekend': 151256}, {'id': '2', 'night': ('4.87871833333333', '45.78388'), 'work': ('4.83265166666667', '45.7634633333333'), 'weekend': ('4.87998833333333', '45.782225'), 'duree_night': 134513, 'duree_work': 168742, 'duree_weekend': 127995}, {'id': '107', 'night': ('4.871635', '45.7850416666667'), 'work': ('4.879445', '45.7867733333333'), 'weekend': ('4.872685', '45.78467'), 'duree_night': 140161, 'duree_work': 116865, 'duree_weekend': 94924}, {'id': '98', 'night': ('4.871635', '45.7850416666667'), 'work': ('2.37536666666667', '48.843915'), 'weekend': ('4.85847', '45.760075'), 'duree_night': 133355, 'duree_work': 102744, 'duree_weekend': 133696}, {'id': '21', 'night': ('-0.563235', '44.8343133333333'), 'work': ('4.87940333333333', '45.7866383333333'), 'weekend': ('-0.570691666666667', '44.8497

In [9]:
print(df_to_treat[df_to_treat['isPoi']==True].shape[0])

0


In [None]:
def computeTransitCities(df, border_data_file, border_precision=0.01, duration_threeshold=60):
    # Nouvelle fonction pour tenir compte de la métrique mobilité des Lyonnais
    # Comme dans la fonction de la métrique, on calcule la frontière de la métropole de Lyon
    
    def check_time_spent(group):
        group['date'] = pd.to_datetime(group['date'], format="%Y-%m-%d %H:%M:%S")
        group = group.sort_values(by="date")
        time_spent = (group['date'].iloc[group.shape[0]-1] - group['date'].iloc[0]).seconds
        df.loc[group.index, 'isTransitCity'] = False if time_spent >= duration_threeshold else True
        return group
    
    with open(border_data_file, 'r') as fichier_json :
        data = json.load(fichier_json)
        lyon_shape = Polygon(data)
        del data
        # Simplifier le polygone
        lyon_shape = lyon_shape.simplify(border_precision)
        # Grouper par coordonnée et id et vérifier le temps passé à la coordonnée
        df.groupby(by=["id", "long", "lat"], group_keys=True, sort=False).apply(check_time_spent)            
    return

computeTransitCities(df, "../partieMetriques/limitesMetropole.json", border_precision=0.01, duration_threeshold=60)

In [None]:
print(df[df['isTransitCity'] == True].size)

In [None]:
def permuteData(df: pd.DataFrame, to_poi_entries:bool | None =False, to_transit_entries:bool | None =True):
    groups_indexes, groups_idx = zip(*[(group_indexes, idx) for idx, group_indexes in df.groupby(['id', 'semaine']).groups.items()])
    groups_indexes = list(groups_indexes)
    groups_idx_listified = list(groups_idx)
    groups_idx = set(groups_idx)
    groups_traites_idx = set()

    def pickAnotherGroupInTmp(current_group: Tuple):
        remaining_groups = list(groups_idx - groups_traites_idx)
        remaining_groups.remove(current_group)
        if len(remaining_groups)!=0:
            priority_groups = list(set([a for a in remaining_groups if a[0] != current_group[0]]))
            if len(priority_groups) > 0: other_group = random.choice(priority_groups)
            else: other_group = random.choice(remaining_groups)
            group_position = groups_idx_listified.index(other_group)
            return groups_indexes[group_position]
    
    for idx, group in df.groupby(['id', 'semaine']).groups.items():        
        if to_poi_entries is not None:
            poi_condition = (df.loc[group, 'isPoi'] == to_poi_entries)
            group = group[poi_condition]
        if to_transit_entries is not None:
            transit_condition = (df.loc[group, 'isTransitCity'] == to_transit_entries)
            group = group[transit_condition]
        #print(group)
        df_group = df.loc[group]
        if idx not in groups_traites_idx:
            other_group_indexes = pickAnotherGroupInTmp(idx)
            if other_group_indexes is not None :
                if to_poi_entries is not None:
                    poi_condition = (df.loc[other_group_indexes, 'isPoi'] == to_poi_entries)
                    other_group_indexes = other_group_indexes[poi_condition]
                if to_transit_entries is not None:
                    transit_condition = (df.loc[other_group_indexes, 'isTransitCity'] == to_transit_entries)
                    other_group_indexes = other_group_indexes[transit_condition]
                    
                other_group = df.loc[other_group_indexes]
                if not other_group.empty :
                    a = df_group[['long', 'lat']].copy()
                    b = other_group[['long', 'lat']].copy()                
                    if a.shape[0] < b.shape[0]:
                        df.loc[group, ['long', 'lat']] = b.iloc[:a.shape[0], :].values
                        df.loc[other_group_indexes, ['long', 'lat']].iloc[:a.shape[0], :] = a.values                    
                    else:
                        df.loc[group, ['long', 'lat']].iloc[ : b.shape[0], :] = b.values
                        df.loc[other_group_indexes, ['long', 'lat']] = a.iloc[:b.shape[0], :].values

                    groups_traites_idx.add(tuple(idx))
                    groups_traites_idx.add(tuple((other_group['id'].iloc[0], other_group['semaine'].iloc[0])))        
    return df
df_to_treat = permuteData(df_to_treat, to_poi_entries=False, to_transit_entries=True)

In [None]:
print(df[(df['long'] != df_to_treat['long']) | (df['lat']!=df_to_treat['lat'])].shape, df_to_treat.shape)

In [None]:
def shiftHour(entry) ->str:
    originalDate = datetime.datetime.fromisoformat(entry)
    shiftValue = -18 if originalDate.time()>=datetime.time(18,00) else 6
    shiftedDate = originalDate + datetime.timedelta(hours=shiftValue)
    entry = shiftedDate.strftime("%Y-%m-%d %H:%M:%S")
    return entry

In [None]:
def deleteData(df: pd.DataFrame, to_poi_entries:bool|None =False, proportion=1/3):
    def suppressionAleatoire(group):
        tmp = group[group['isPoi']==to_poi_entries] if to_poi_entries is not None else group
        tailleSup = int(len(tmp)*proportion)
        indices_to_remove = np.random.choice(tmp.index, size=tailleSup, replace=False)
        group.loc[indices_to_remove, 'id'] = 'DEL'
        return group
    df['id']= df.groupby(['id', 'semaine'], group_keys=True, sort=False).apply(suppressionAleatoire)['id'].values
    return

deleteData(df_to_treat, to_poi_entries=False, proportion=0)
print(df_to_treat.head(5))

In [None]:
filterr = ((df_to_treat['id'] != 'DEL') &(df_to_treat['id'] != df['id'])) | (df_to_treat['semaine'] != df['semaine'])
print(df_to_treat[filterr].shape, df_to_treat[filterr])

In [None]:
print(df_to_treat[df_to_treat['id']=='DEL'].shape[0], df_to_treat.shape[0])

In [None]:
def noisifyGps(df_part: pd.DataFrame):
    noise = np.random.choice([-0.02, 0.02], size=(len(df_part), 2))
    df_part[['long', 'lat']] += noise
    return df_part

In [None]:
#filter = (~df_to_treat['isPoi']) & (df_to_treat['id'] != 'DEL')
#filter = (~df_to_treat['isPoi'])
#df_to_treat[filter] = noisifyGps(df_to_treat[filter])
df_to_treat = noisifyGps(df_to_treat)
print(df_to_treat.head(5))

In [None]:
CHARACTERS_POOL = string.ascii_letters + string.digits
CORRESPONDANCES_FILE =  'correspondances3.json'

def generatePseudoIds(df_ : pd.DataFrame):
    def generator(group, nb_characters : int):
        pseudo_id_str = ''.join(random.choice(CHARACTERS_POOL) for _ in range(nb_characters))
        pseudo_id_final = m(pseudo_id_str).stringToMorse().replace(" ", "")
        group.loc[group!='DEL'] = pseudo_id_final
        #print(group)
        return group

    def saveCorrespondances(group:pd.DataFrame, corresp_struc: Dict):
        tmp = group.loc[group['id'] != 'DEL' ,['id', 'id_x']]
        id_original = int(tmp['id'].iloc[0]) if not tmp.empty else 'DEL'
        if id_original == 'DEL':
            return group
        pseudo_id_final = tmp['id_x'].iloc[0]

        if id_original not in corresp_struc:
            corresp_struc[id_original] = {}
        semaine = f"2015-{group['semaine'].iloc[0]}"
        if  semaine not in corresp_struc[id_original]:
            corresp_struc[id_original][semaine] = []
        corresp_struc[id_original][semaine].append(pseudo_id_final)
        return group

    data = {}
    df_['id_x'] = df_['id']   
    df_['id_x'] = df_.groupby(['id', 'semaine'])['id_x'].transform(lambda group: generator(group, nb_characters=5))
    df_.groupby(['id', 'semaine'], group_keys=True).apply(lambda group : saveCorrespondances(group, corresp_struc=data))
    
    with open(CORRESPONDANCES_FILE, 'w') as f:
        json.dump(data, f, indent=4, separators=(',', ':'))
    return df_

b = generatePseudoIds(df_to_treat.copy())
print(b.head(5))
b[['id_x', 'date', 'long', 'lat']].to_csv("anonym3.csv", sep="\t", index=False, header=False)

In [None]:
from metrics import pointsOfInterest, hour, meet, date, distance, tuile, nationalMobility
def calculScore(originial_f, anonymise_f):
    metrics = []
    for metric in [pointsOfInterest, hour, meet, date, distance, tuile, nationalMobility]:
        metrics.append(metric.main(originial_f, anonymise_f))
    return metrics
calculatedMetrics = calculScore("file3.csv", "anonym3.csv")
print(calculatedMetrics)

In [None]:
import numpy as np
score = np.mean(calculatedMetrics)
print(score)

In [None]:
import json
def calculateReidentificationScore(json_correct, json_soumis):
    score = 0
    nb_pseudo = 0
    with open(json_correct, 'r') as f:
        data_correct = json.load(f)

    with open(json_soumis, 'r') as f:
        data_soumis = json.load(f)

    for id, semaines_correctes in data_correct.items():
            # Comparer les pseudo-identifiants pour chaque semaine
            for semaine, pseudo_id_correct in semaines_correctes.items():
                nb_pseudo += 1
                if id in data_soumis:
                    pseudo_id_soumis = data_soumis[id].get(semaine, None)
                    # Vérifier les correspondances
                    if pseudo_id_soumis and pseudo_id_correct[0] == pseudo_id_soumis[0]:
                        score += 1
    print(f"Score={score}/{nb_pseudo}")
    return score/nb_pseudo
calculateReidentificationScore('correspondances3.json', '../partieAttaque/autoAttaques/sub2.json')