In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import codecs, json
import unicodedata
# pip install Unidecode  <OR> conda install Unidecode
import unidecode

In [2]:
# Load every semicolon-separated export; the first line of each file is the header.
(
    domarar,
    einstaklingar,
    forsvarsmenn,
    lid,
    lidimoti,
    lidsmenn,
    lidsstjorar,
    thjalfarar,
    mot,
) = (
    pd.read_csv(csv_path, sep=';', header=0)
    for csv_path in (
        'csv/blak-domarar.csv',
        'csv/blak-einstaklingar.csv',
        'csv/blak-forsvarsmenn.csv',
        'csv/blak-lid.csv',
        'csv/blak-lidimoti.csv',
        'csv/blak-lidsmenn.csv',
        'csv/blak-lidsstjorar.csv',
        'csv/blak-thjalfarar.csv',
        'csv/blak-mot.csv',
    )
)

# Virtual teams are not wanted: keep only the teams without a SyndarlidID.
# The ID column is used instead of the SyndarLid flag because the flag is not
# trusted to have been filled in consistently with [0,1].
# Both virtual-team columns are then dropped.
lid = lid.loc[lid['SyndarlidID'].isna()].drop(columns=['SyndarLid', 'SyndarlidID'])

# People whose name, birthday and gender all match another row
# (keep=False keeps every member of each duplicate group).
duplicated_einstaklingar = einstaklingar.loc[
    einstaklingar.duplicated(subset=['Nafn', 'Fdagur', 'Kyn'], keep=False)
]
# Looser match: people who share birthday and gender with another row, regardless of name.
duplicated_fdagur_kyn_einstaklingar = einstaklingar.loc[
    einstaklingar.duplicated(subset=['Fdagur', 'Kyn'], keep=False)
]


In [3]:
# Group every person that shares a birthday+gender with someone else into
# first_name -> birthday -> [row values, ...].
# The first name is lower-cased and transliterated to ASCII so that spelling
# variants with and without Icelandic letters end up under the same key.
duplicate_dict = defaultdict(dict)
for index, row in duplicated_fdagur_kyn_einstaklingar.iterrows():
    full_name = row['Nafn']
    birthday_raw = row['Fdagur']
    # A missing cell is read by pandas as a float NaN, which has no .split();
    # such rows cannot be keyed by name/birthday, so skip them instead of crashing.
    if not isinstance(full_name, str) or not isinstance(birthday_raw, str):
        continue
    name_parts = full_name.split()
    birthday_parts = birthday_raw.split()
    # Whitespace-only values split into an empty list; skip those as well.
    if not name_parts or not birthday_parts:
        continue

    # only the first part of the full name, lower-cased, Icelandic letters -> English
    first_name_to_english = unidecode.unidecode(name_parts[0].lower())
    # keep the date part of the birthday and ignore the time part (hour, min, sec)
    Fdagur_date = birthday_parts[0]

    # defaultdict creates the per-name dict on demand; setdefault creates the
    # per-birthday list on demand, so every case is handled by one statement.
    duplicate_dict[first_name_to_english].setdefault(Fdagur_date, []).append(row.values)
        

In [4]:
# Keep only the (first name, birthday) groups that contain two or more people;
# a group with a single entry is not a duplicate candidate.
# Result has the same shape as duplicate_dict: first_name -> birthday -> [row values, ...].
dict_removed_single_entries = defaultdict(dict)

for key, values in duplicate_dict.items():
    # key = first name ('ludvik')
    for birthday, arrays in values.items():
        # Each (key, birthday) pair is visited exactly once because dict keys are
        # unique, so a plain assignment is sufficient. The previous "append to an
        # existing entry" branch could never run, and would have nested the whole
        # list inside the list if it had.
        if len(arrays) > 1:
            dict_removed_single_entries[key][birthday] = arrays

In [5]:
# Flatten dict_removed_single_entries into one list holding the first field of
# every stored row (presumably the person ID; it is compared against EinstID later).
duplicate_ids_kept = [
    person_row[0]
    for birthdays in dict_removed_single_entries.values()
    for person_rows in birthdays.values()
    for person_row in person_rows
]

In [6]:
#duplicate_dict['aldis']

In [7]:
#dict_removed_single_entries.values()
#duplicate_row_keept

In [8]:
# Checking if two names are the same person:
# collect, per duplicated person ID, every lidsmenn (team member) row that
# references that ID, so the team histories of the candidates can be compared.
dict_duplicate_compare_team_members = defaultdict(dict)
# Set lookup is O(1) per row; testing membership in the list directly would
# rescan the whole ID list for every lidsmenn row.
duplicate_id_lookup = set(duplicate_ids_kept)
for index, row in lidsmenn.iterrows():
    ids = row["EinstID"]
    # only look at ids that belong to duplicated people
    if ids in duplicate_id_lookup:
        # setdefault starts the list on first sight of an ID, then appends
        dict_duplicate_compare_team_members.setdefault(ids, []).append(row.values)


In [20]:
#dict_duplicate_compare_team_members

In [17]:
# Map "<first name>-<birthday>" to the list holding the first field of every
# row stored for that group (presumably person IDs - the same field collected
# into duplicate_ids_kept).
dict_name_entries = {}
for first_name, birthdays in dict_removed_single_entries.items():
    for birthday, person_rows in birthdays.items():
        group_key = "-".join((first_name, birthday))
        for person_row in person_rows:
            # start the list the first time a group key is seen, then append
            dict_name_entries.setdefault(group_key, []).append(person_row[0])
#dict_name_entries

In [23]:
# Display the whole mapping of person ID -> list of lidsmenn rows collected for
# the duplicated people. NOTE(review): this dumps the entire dict as cell output.
dict_duplicate_compare_team_members

defaultdict(dict,
            {483: [array([1, 18, 483, '2000-04-30 10:04:24.983'], dtype=object),
              array([91, 1830, 483, '2006-04-25 15:50:39.420'], dtype=object)],
             570: [array([1, 18, 570, '2000-04-01 08:56:25.153'], dtype=object),
              array([2, 18, 570, '2001-04-06 10:31:00.870'], dtype=object),
              array([8, 18, 570, '2001-12-28 16:16:45.827'], dtype=object),
              array([15, 18, 570, '2002-05-01 21:24:27.903'], dtype=object),
              array([68, 18, 570, '2005-03-30 11:34:24.217'], dtype=object),
              array([189, 762, 570, '2011-02-19 14:16:56.450'], dtype=object)],
             614: [array([1, 18, 614, '2000-04-03 10:56:57.543'], dtype=object),
              array([2, 18, 614, '2001-04-06 10:31:00.870'], dtype=object),
              array([8, 18, 614, '2001-12-28 16:16:45.827'], dtype=object),
              array([15, 18, 614, '2002-05-01 21:22:25.310'], dtype=object),
              array([68, 18, 614, '2005-03-3

In [29]:
# For every "<first name>-<birthday>" group, print the team-member rows that
# were collected for each person ID in the group.
for group_key, person_ids in dict_name_entries.items():
    for person_id in person_ids:
        if person_id in dict_duplicate_compare_team_members:
            print(
                "<key>",
                group_key,
                " <values> ",
                dict_duplicate_compare_team_members[person_id],
                sep="",
            )
        # the separator line is printed for every ID, whether or not it had team rows
        print("xxxxxxxxxxxx")

<key>adalsteinn-1981-03-03 <values> [array([187, 2840, 1964, '2011-01-20 21:19:21.280'], dtype=object), array([232, 1022, 1964, '2013-09-03 20:55:14.783'], dtype=object), array([234, 1022, 1964, '2013-09-09 15:35:13.947'], dtype=object), array([249, 2840, 1964, '2014-04-21 21:12:11.077'], dtype=object)]
<key>adalsteinn-1981-03-03 <values> [array([45, 1023, 1437, '2004-07-26 12:25:47.497'], dtype=object), array([46, 1023, 1437, '2003-10-11 19:12:00.843'], dtype=object), array([59, 1023, 1437, '2004-09-17 11:59:19.497'], dtype=object), array([64, 1023, 1437, '2004-09-30 17:47:04.420'], dtype=object), array([66, 1015, 1437, '2004-11-15 23:01:41.150'], dtype=object), array([70, 2085, 1437, '2005-02-23 18:40:23.027'], dtype=object), array([84, 1023, 1437, '2005-10-01 20:06:49.467'], dtype=object), array([176, 1022, 1437, '2010-09-27 10:28:20.920'], dtype=object), array([191, 1022, 1437, '2011-08-29 20:53:45.547'], dtype=object), array([199, 2840, 1437, '2012-03-04 14:10:07.770'], dtype=obje

<key>solveig-1944-04-17 <values> [array([2, 350, 836, '2001-04-15 15:13:53.187'], dtype=object), array([15, 350, 836, '2002-04-15 22:39:04.840'], dtype=object), array([68, 1688, 836, '2005-04-06 08:38:21.123'], dtype=object)]
<key>solveig-1944-04-17 <values> [array([34, 1088, 1328, '2003-03-26 13:45:23.060'], dtype=object), array([54, 1088, 1328, '2004-03-30 22:33:37.077'], dtype=object)]
<key>solveig-1954-07-27 <values> [array([149, 673, 3075, '2010-04-26 22:31:36.140'], dtype=object)]
<key>stefan-1953-02-05 <values> [array([109, 132, 2305, '2007-04-27 16:04:30.497'], dtype=object), array([119, 132, 2305, '2008-01-09 14:42:59.810'], dtype=object), array([121, 132, 2305, '2008-04-01 13:43:52.873'], dtype=object)]
<key>stefan-1953-02-05 <values> [array([15, 670, 1214, '2002-05-09 14:48:05.577'], dtype=object), array([34, 1176, 1214, '2003-03-26 20:36:24.903'], dtype=object), array([36, 671, 1214, '2003-04-08 03:13:24.763'], dtype=object), array([44, 671, 1214, '2003-11-13 01:20:35.717']

In [None]:
#dict(duplicated_fdagur_kyn_einstaklingar)

# all duplicates
#duplicate_dict

# all duplicates for lúðvík
#duplicate_dict["ludvik"]

#duplicate_dict["ludvik"]['1969-03-31'][0]
#duplicate_dict["ludvik"]['1969-03-31'][1]

#duplicate_dict
#dict_removed_single_entries

In [None]:
# Copy the top level of duplicate_dict (a defaultdict) into a plain dict.
# This is a shallow copy: the nested per-birthday dicts are shared, not duplicated.
reverted_back_to_dict = dict(duplicate_dict)
#reverted_back_to_dict

In [None]:
file_path = "json/einstaklingar_map.txt"  # output path for the export attempts in this cell (all currently commented out)
#duplicate_dict_json = json.dump(duplicate_dict, codecs.open(file_path, 'w', encoding='utf-8'), separators=(';', ':'), sort_keys=True, indent=4) ### this saves the array in .json format
#json_obj = json.dumps(duplicate_dict, indent = 4)
#dumped = json.dumps(duplicate_dict, cls=NumpyEncoder)
#dumped
#pd.DataFrame(reverted_back_to_dict).to_csv(file_path, encoding='utf-8-sig')
#duplicate_dict_json = json.dump(reverted_back_to_dict, codecs.open(file_path, 'w', encoding='utf-8-sig'))

#json = json.dumps(reverted_back_to_dict)
#f = open(file_path,"w")
#f.write(str(reverted_back_to_dict))
#f.close()

=====================================================================================
=

In [None]:
#FINAL STEP (run after everything is done):

# Duplicated people are put into their own csv to be browsed later.
duplicated_einstaklingar.to_csv("csv/new/duplicated-einstaklingar.csv", encoding='utf-8-sig')
# NOTE(review): to_csv writes CSV text, so this file is not valid JSON despite
# its .json name; path and format are kept as-is so existing readers are not
# broken - confirm what the consumer of this file expects.
pd.DataFrame(duplicate_dict).to_csv("json/duplicate-map.json", encoding='utf-8-sig')


# Save every table as a new csv inside csv/new.
# Output path -> frame. The forsvarsmenn path previously ended in ".csv.csv";
# it now follows the same naming as the other tables.
new_csv_tables = {
    "csv/new/blak-domarar.csv": domarar,
    "csv/new/blak-einstaklingar.csv": einstaklingar,
    "csv/new/blak-forsvarsmenn.csv": forsvarsmenn,
    "csv/new/blak-lid.csv": lid,
    "csv/new/blak-lidimoti.csv": lidimoti,
    "csv/new/blak-lidsmenn.csv": lidsmenn,
    "csv/new/blak-lidsstjorar.csv": lidsstjorar,
    "csv/new/blak-mot.csv": mot,
    "csv/new/blak-thjalfarar.csv": thjalfarar,
}
for new_csv_path, table in new_csv_tables.items():
    # utf-8-sig writes a byte-order mark at the start of the file
    table.to_csv(new_csv_path, encoding='utf-8-sig')