<a href="https://colab.research.google.com/github/SemanticComputing/LetterSampo-timeline/blob/main/CKCC_Timeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations and Imports

In [None]:
!pip install matplotlib rdflib networkx

In [None]:
!pip install SPARQLWrapper

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import re

from collections import defaultdict
from datetime import datetime
from itertools import product, combinations
from rdflib.namespace import XSD
from SPARQLWrapper import SPARQLWrapper, JSON

# Functions

In [None]:
def checkDate(v : str):
  '''Parse an xsd:dateTime string into a ``datetime.date``.

  The value is first parsed strictly as ``YYYY-MM-DDTHH:MM:SS``. If that
  fails (for example because of a timezone suffix or an impossible day such
  as February 30th), the leading ``YYYY-MM-DD`` part is used instead.

  v: the lexical value of the literal.

  Returns the corresponding ``datetime.date``. When the day of month does
  not exist in the given month, the 28th of that month is returned.

  Raises ValueError if ``v`` does not start with ``YYYY-MM-DD`` or if the
  year/month themselves are out of range.
  '''
  try:
    return datetime.strptime(v, '%Y-%m-%dT%H:%M:%S').date()
  except ValueError:
    pass

  m = re.match(r'(\d{4})-(\d{2})-(\d{2})', v)
  if m is None:
    raise ValueError(f'Unrecognised date value: {v!r}')

  year, month, day = (int(g) for g in m.groups())
  try:
    # keep the real day whenever it is a valid calendar day
    return datetime(year, month, day).date()
  except ValueError:
    # impossible day of month: fall back to the 28th, which exists in every month
    return datetime(year, month, 28).date()


# Maps an XSD datatype IRI (as a string) to the callable that converts a
# literal's string value into a Python object.
DATATYPECONVERTERS : dict = {
      str(XSD.integer):  int,
      str(XSD.decimal):  float,
      # plain dates carry no time part
      str(XSD.date):     lambda v: datetime.strptime(v, '%Y-%m-%d').date(),
      # dateTime values go through checkDate, which has a fallback for
      # values that the strict datetime format rejects
      str(XSD.dateTime): checkDate
  }


def convertDatatype(obj):
  '''Convert a single SPARQL JSON binding into a Python value.

  The converter is looked up in DATATYPECONVERTERS by the binding's
  'datatype' entry; when there is no entry for it, ``str`` is used.
  The chosen converter is applied to the binding's 'value' entry.
  '''
  converter = DATATYPECONVERTERS.get(obj.get('datatype'), str)
  raw_value = obj.get('value')
  return converter(raw_value)


def convertDatatypes(results : dict) -> list:
  '''Convert a SPARQL JSON response into a list of plain Python dicts.

  results: the parsed JSON response; its rows are read from
    ``results["results"]["bindings"]``.

  Returns one dict per result row, mapping each variable name to the value
  produced by convertDatatype for that variable's binding.

  Raises KeyError if the response lacks the "results"/"bindings" entries.
  '''
  bindings = results["results"]["bindings"]
  return [{k: convertDatatype(v) for k, v in row.items()} for row in bindings]

In [None]:
def simplifyLabel(st):
  '''Reduce an actor label to the bare name.

  Three removals are applied in this order:
    1. a trailing run of digits, commas, spaces and hyphens,
    2. every parenthesised part that is preceded by a space,
    3. a trailing ", suffix" made only of lowercase letters and dots.
  '''
  removals = (
      r'[,0-9 -]+$',
      r' \([^)]+\)',
      r', ([a-z.]+)$',
  )
  for pattern in removals:
    st = re.sub(pattern, '', st)
  return st

# Query the Data

The query below fetches all letters together with information about their senders and recipients.

Try it in Yasgui: https://api.triplydb.com/s/PECPM8eKe


In [None]:
# Client for the CKCC SPARQL endpoint, configured to return JSON.
sparql = SPARQLWrapper("http://ldf.fi/ckcc/sparql",
                       returnFormat = JSON)
# Select every letter with its sender, its receiver, their preferred labels
# and the begin/end bounds of the letter's time-span.
# The FILTER compares the string form of ?start with "1600", and the LIMIT
# clause caps the result at 25000 rows.
sparql.setQuery("""
  PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
  PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
  PREFIX lsscs: <http://ldf.fi/schema/lssc/>

  SELECT DISTINCT *
  WHERE {
      ?id a lsscs:Letter ;
        ^lsscs:created ?sender__id ;
        lsscs:was_addressed_to ?receiver__id ;
        lsscs:has_time  [
            crm:P82a_begin_of_the_begin ?start ;
            crm:P82b_end_of_the_end ?end ] .

      FILTER(STR(?start) >= "1600")

      ?sender__id skos:prefLabel ?sender__label .
      ?receiver__id skos:prefLabel ?receiver__label .

  } LIMIT 25000
  """)
# Run the query; the rows are found under results["results"]["bindings"].
results = sparql.query().convert()

print("Query returned {} results".format(len(results["results"]["bindings"])))

# Print the first ten raw bindings to inspect their structure.
for ob in results["results"]["bindings"][:10]:
  print(ob)

In [None]:
# Convert the typed literals of every result row to Python values;
# `res` is a list with one dict per row.
res = convertDatatypes(results)

# Print the first ten converted rows as examples.
for ob in res[:10]:
  print(ob)

In [None]:
'''Create a dictionary where the keys are the ID, and the values are the person names'''
people_lookup = {}

# Note: this loop also rewrites the records of `res` in place.
for ob in res:
  # replace the raw labels with their simplified form
  ob['sender__label'] = simplifyLabel(ob.get('sender__label'))
  ob['receiver__label'] = simplifyLabel(ob.get('receiver__label'))
  # keep the year of the letter's start date; get_year_information filters on it
  ob['year'] = ob.get('start').year

  # register both correspondents; a later record with the same id
  # overwrites the label stored earlier
  people_lookup[ob.get('sender__id')] = ob.get('sender__label')
  people_lookup[ob.get('receiver__id')] = ob.get('receiver__label')

## Number of top people to consider

In [None]:
# Maximum number of highest-degree people kept per time window
# (passed to get_year_information as number_of_people).
top_n_people = 12

In [None]:
''' show example of people lookup '''
# Print the first top_n_people entries of the lookup as "id<TAB>name".
for k, dc in list(people_lookup.items())[:top_n_people]:
  print(f'{k}\t{dc}')

In [None]:
def get_year_information(data: list,
                         start_year: int,
                         end_year: int,
                         number_of_people: int = 12) -> dict:
  '''Get the people with the highest degree values in a correspondence
  network built from the letters of a given year range.

  A directed graph is created with one edge per distinct
  (sender, receiver) pair among the records whose 'year' satisfies
  start_year <= year < end_year.

  data: sequence of letter records (dicts) providing the keys
    'sender__id', 'receiver__id' and 'year'.
  start_year: inclusive lower bound of the year range.
  end_year: exclusive upper bound of the year range.
  number_of_people: maximum number of people to return.

  Returns a dict mapping person id to degree divided by the highest degree
  found, so the best-connected person has the value 1.0. The dict is
  ordered from highest to lowest degree and is empty when no record falls
  into the range.
  '''
  G = nx.DiGraph()
  G.add_edges_from((ob.get('sender__id'), ob.get('receiver__id'))
                   for ob in data
                   if start_year <= ob.get('year') < end_year)

  top = sorted(dict(nx.degree(G)).items(),
               key = lambda x: x[-1],
               reverse = True)[:number_of_people]

  if not top:
    return {}

  # the list is sorted in descending order, so the first entry holds the maximum
  v_max = top[0][1]
  # normalize the degree values so that #1 person has the value of 1.0
  return {k: v / v_max for k, v in top}


# show example output of the function:
# the top people for letters dated from 1690 up to, but not including, 1695
get_year_information(res,
                     start_year = 1690,
                     end_year = 1695,
                     number_of_people = top_n_people)

In [None]:
# Year range to scan and the width (in years) of each time window.
start_year, end_year = 1580, 1720
year_span = 2.5
year_data = {}

# Compute the top people of every window. A dedicated loop variable is used
# so that start_year keeps its configured value after the loop.
for window_start in np.arange(start_year, end_year, year_span):
  year_data[window_start] = get_year_information(res,
                                                 start_year = window_start,
                                                 end_year = window_start+year_span,
                                                 number_of_people = top_n_people)

# For every person, collect the start years of the windows in which they
# appear among the top people.
first_year_of_person = defaultdict(list)
for y, arr in year_data.items():
  for x in arr:
    first_year_of_person[x].append(y)

# Order the people by the first window in which they appear.
people_list = sorted(first_year_of_person,
                     key = lambda k: min(first_year_of_person[k]))

# show the first 10 actors in order of first appearance within start_year – end_year
people_list[:10]

## Plot the results

In [None]:
# Plot by degree value:
#   Y = start year of the time window
#   X = actor index (position in people_list)
#   color = actor's normalized degree in that window (1.0 = highest)

# Look the positions up in a dict built once instead of calling
# people_list.index() for every plotted point.
person_index = {person: i for i, person in enumerate(people_list)}
YXZ = [(y, person_index[x], i) for y, arr in year_data.items() for x, i in arr.items()]
Y, X, colors = zip(*YXZ)

# set colorbar
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["blue","yellow","red"])

# define the scatter plot
fig = plt.figure(figsize = [36.0, 13.0])
ax = fig.add_subplot()
sc = ax.scatter(X, Y, c = colors, cmap = colormap, s = 50.0)
plt.colorbar(sc)

# vertical actor labels below the plot
plt.xticks(ticks = list(range(len(people_list))),
           labels = [people_lookup.get(x, '') for x in people_list],
           fontsize = 7,
           rotation = 90)

# size of years on y-axis
plt.yticks(fontsize = 18)

# save image
plt.savefig("Timeline_CKCC.png", dpi = None, transparent = False, bbox_inches = 'tight')

# show output
plt.show()
