# Explore the biographical texts and prepare data acquisition

## Site original

Licence : 

[Copyright Information](https://mathshistory.st-andrews.ac.uk/Miscellaneous/copyright/) (Original website) -> [Creative Commons License 4.0 BY SA](http://creativecommons.org/licenses/by-sa/4.0/)

https://mathshistory.st-andrews.ac.uk/Biographies/chronological

J'ai téléchargé la page dans le dossier : /mathshistory/html_files

Fichier original Chronological Biographies Index - MacTutor History of Mathematics.html
Copie : chrono_biogr_index.html
Je nettoie le fichier directement en HTML (avec l'éditeur Atom) afin de ne garder que la liste sur laquelle je veux travailler

https://mathshistory.st-andrews.ac.uk/Biographies/Tartaglia/


In [2]:
### Importation de modules
import lxml.html
import re
from lxml import etree
import requests
import altair as alt

## Explore the list of mathematicians

In [3]:
### This file was created manually from this address (retrieved 8 April 2023)
# https://mathshistory.st-andrews.ac.uk/Biographies/chronological/
# by removing the scripts and indexes that are not needed here

# Mathematicians from the year 1000 and onward
# (relative path: resolved against the notebook's working directory)
f = "html_files/chrono_biogr_index_1000_and_later.html"
# NOTE: no parser is passed, so lxml's default XML parser is used; the
# hand-cleaned file therefore has to be well-formed markup.
a = etree.parse(f)
a

<lxml.etree._ElementTree at 0x7f3505b0b140>

In [4]:
### Check the XPath expression on the first two biography links
a.xpath('//li/a/@href')[:2]

['https://mathshistory.st-andrews.ac.uk/Biographies/Al-Nasawi/',
 'https://mathshistory.st-andrews.ac.uk/Biographies/Jia_Xian/']

In [5]:
### Collect the href of every biography link in the index
l_url = a.xpath('//li/a/@href')
print(len(l_url))
# The hrefs in this file are already absolute. Relative ones could be rebuilt with:
# ['https://mathshistory.st-andrews.ac.uk/' + url.replace('../../', '') for url in l_url]
urls = list(l_url)
len(urls)

3010


3010

In [6]:
### Peek at the first two URLs
urls[:2]

['https://mathshistory.st-andrews.ac.uk/Biographies/Al-Nasawi/',
 'https://mathshistory.st-andrews.ac.uk/Biographies/Jia_Xian/']

In [7]:
### Prepare the whole list of persons: every <li> element of the index
t = a.xpath('//li') # append [0:2] to try the next steps on a short sample

In [8]:
### Build one [life span, name, URL] record per <li> element
l_date_nom_url = [
    [
        # bare text node of the item (presumably of the form "(1089 - 1173)"):
        # drop the parentheses and the blanks around the dash
        li.xpath('./text()')[0].replace('(', '').replace(')', '').replace(' - ', '-').strip(),
        # link text = name of the person
        li.xpath('./a/text()')[0],
        # link target = URL of the biography
        li.xpath('./a/@href')[0],
    ]
    for li in t
]

In [9]:
### Sanity check: number of records, then records 10-12 and the last three
print(len(l_date_nom_url))
l_date_nom_url[10:13], l_date_nom_url[-3:]

3010


([['1089-1173',
   'Hemchandra',
   'https://mathshistory.st-andrews.ac.uk/Biographies/Hemchandra/'],
  ['1092-1167',
   'Rabbi Ben Ezra',
   'https://mathshistory.st-andrews.ac.uk/Biographies/Ezra/'],
  ['1100-1160',
   'Jabir ibn Aflah',
   'https://mathshistory.st-andrews.ac.uk/Biographies/Jabir_ibn_Aflah/']],
 [['1979-',
   'Artur Ávila',
   'https://mathshistory.st-andrews.ac.uk/Biographies/Avila/'],
  ['1984-',
   'Maryna Viazovska',
   'https://mathshistory.st-andrews.ac.uk/Biographies/Viazovska/'],
  ['1984-',
   'Qëndrim Gashi',
   'https://mathshistory.st-andrews.ac.uk/Biographies/Gashi/']])

In [10]:
### Transform years into integers
def _parse_life_span(span):
    """Split a 'birth-death' string into a ``(birth, death)`` pair.

    The years are read up to the dash instead of at fixed character
    positions, so years that are not exactly four digits long are handled too.

    Parameters
    ----------
    span : str
        Life span such as ``'1089-1173'``, or ``'1979-'`` when no death year
        is given.

    Returns
    -------
    tuple
        ``(birth, death)`` where ``birth`` is an int and ``death`` is an int,
        or ``None`` when the part after the dash is missing.

    Raises
    ------
    ValueError
        If ``span`` does not start with ``<digits>-``.
    """
    m = re.match(r'\s*(\d+)\s*-\s*(\d+)?', span)
    if m is None:
        # name the offending value instead of failing with a bare int() error
        raise ValueError(f'unexpected life span format: {span!r}')
    birth, death = m.groups()
    return int(birth), (int(death) if death else None)


# Each record becomes [name, birth year, death year or None, URL]
l_astro = [[nom, *_parse_life_span(dates), url] for dates, nom, url in l_date_nom_url]
len(l_astro)

3010

In [11]:
### Show the first two records and two records near the end of the list
print(l_astro[:2], '\n----\n',l_astro[-5:-3])

[['al-Nasawi', 1010, 1075, 'https://mathshistory.st-andrews.ac.uk/Biographies/Al-Nasawi/'], ['Jia Xian', 1010, 1070, 'https://mathshistory.st-andrews.ac.uk/Biographies/Jia_Xian/']] 
----
 [['Terence Tao', 1975, None, 'https://mathshistory.st-andrews.ac.uk/Biographies/Tao/'], ['Maryam Mirzakhani', 1977, 2017, 'https://mathshistory.st-andrews.ac.uk/Biographies/Mirzakhani/']]


In [12]:
### Birth year of every person (second field of each record)
y_l = [rec[1] for rec in l_astro]
y_l[:3]

[1010, 1010, 1013]

In [13]:
### Earliest and latest birth year in the data, shown as a (min, max) pair
min(y_l), max(y_l)

(1010, 1984)

In [14]:
### Create periods
# Length of one period in years (replace with 10, 15, 20, 50 etc. if desired)
period = 50
# First year of every period: 1001, 1001 + period, ... (stops before 1921)
per_l = [*range(1001, 1921, period)]
# Inclusive (first year, last year) bounds of every period
per_ll = [(start, start + period - 1) for start in per_l]
len(per_ll), per_ll[:3], per_ll[-3:]

(19,
 [(1001, 1050), (1051, 1100), (1101, 1150)],
 [(1801, 1850), (1851, 1900), (1901, 1950)])

In [15]:
### Add periods to data

# Build one [label, count, names] entry per period.
# NOTE: the loop variables must not be called `a`: that name holds the parsed
# index document, and overwriting it would break re-running the XPath cells.
per_r = []
for start, end in per_ll:
    # label of the period, e.g. '1001_1050'
    label = f'{start}_{end}'
    # Names of the persons born within the period (bounds included).
    # Data protection: persons born after 1920 are kept only when a death
    # year is recorded.
    noms = [
        nom
        for nom, birth, death, _url in l_astro
        if start <= birth <= end and (birth < 1921 or death is not None)
    ]
    # number of persons in the period
    effectif = len(noms)
    per_r.append([label, effectif, noms])

per_r[:1]  #, per_r[-1:]

[['1001_1050',
  7,
  ['al-Nasawi',
   'Jia Xian',
   'Hermann of Reichenau',
   'Sripati',
   'al-Zarqali',
   'Shen Kua',
   'Omar Khayyam']]]

In [16]:
### Prepare data to visualize: one dict per period (label, count, names joined by newlines)
data = [
    {'per': label, 'eff': count, 'names': '\n'.join(names)}
    for label, count, names in per_r
]

In [17]:
### Bar chart of births per period, overlaid with a rolling mean
# Documentation:
#     https://joelostblom.github.io/altair-docs/gallery/bar_with_rolling_mean.html

## https://altair-viz.github.io/user_guide/data.html
# Wrap the list of dicts as inline chart data
data_trsf = alt.Data(values=data)

# Bars: one per period, height = number of births
bar = alt.Chart(data_trsf).mark_bar().encode(
    # ordinal, like the line layer, so both layers agree on the x encoding type
    x='per:O',
    y='eff:Q',
    # a list of fields: show the period together with its count
    tooltip=['per:O', 'eff:Q']
)

# Line: mean of the counts over a sliding window
line = alt.Chart(data_trsf).mark_line(color='red').transform_window(
    # The field to average
    rolling_mean='mean(eff)',
    # Window bounds relative to the current row: 9 rows before it up to the row itself
    frame=[-9, 0]
).encode(
    x='per:O',
    y='rolling_mean:Q'
)

# Layer both charts and set size and title
(bar + line).properties(height= 600, width=900,  title = f'Distribution of births by {period}-year periods')