# Analysing data from the Mathematics Genealogy Project
https://www.genealogy.math.ndsu.nodak.edu/

## Data

**TODO:** add a link to `data.json`.

## Reading data

In [1]:
import json
import pandas as pd

In [2]:
# Name the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which can fail on (or silently corrupt) non-ASCII names.
with open("data/data.json", "r", encoding="utf-8") as f:
    data_json = json.load(f)

# Only the list stored under the "nodes" key is used; it becomes the rows of the frame
data_json=data_json["nodes"]
data=pd.DataFrame(data_json)

## Fetching coordinates of schools

We use OpenStreetMap (through the Nominatim geocoder) or, optionally, the Bing Maps API to get the coordinates of the schools.

Defining the `get_coordinates` function using the `geopy` package

In [3]:
def get_coordinates(university_name,use_Bing=False,bing_api_key=None):
    """Geocode a school name and return ``[university_name, latitude, longitude]``.

    Filler words are stripped from the name with ``remove_fillers`` before it is
    sent to the geocoder; the returned list always carries the original name.

    Parameters
    ----------
    university_name
        Name of the school to look up.
    use_Bing : bool
        When true, geopy's ``Bing`` geocoder is used; otherwise ``Nominatim``.
    bing_api_key
        API key passed to ``Bing``; not used for Nominatim.

    Returns
    -------
    list
        * ``[name, lat, lon]`` when the geocoder returns a location;
        * ``[name, -82, 135]`` (a placeholder point in Antarctica) when the
          geocoder returns no match;
        * ``[name, None, None]`` when the name is missing or the lookup raised.
    """
    # Imported locally so the function does not depend on which cells ran before it
    from geopy.geocoders import Bing, Nominatim

    # A missing name would otherwise be turned into the text "None"/"nan" by
    # remove_fillers and geocoded as if it were a real place name.
    name_is_nan = isinstance(university_name, float) and university_name != university_name
    if university_name is None or name_is_nan:
        return [university_name,None,None]

    if use_Bing:
        # Bing API geocoder
        geolocator = Bing(api_key=bing_api_key)
    else:
        # Nominatim geocoder
        geolocator = Nominatim(user_agent="university_locator")

    university_name_no_filler = remove_fillers(university_name)

    try:
        # Use the geocode method to get the location information
        location = geolocator.geocode(university_name_no_filler)

        if location:
            return [university_name,location.latitude,location.longitude]

        # No match: return the placeholder location in Antarctica
        return [university_name,-82,135]

    except Exception:
        # Best effort: any failure during the lookup is reported as missing
        # coordinates so that a long batch run keeps going.
        return [university_name,None,None]

def remove_fillers(sentence):
    """Return ``sentence`` with common filler words removed.

    The input is converted with ``str()`` first, split on whitespace, and every
    word whose lower-cased form is in the filler list is dropped. The surviving
    words are joined back together with single spaces.
    """
    fillers = ('the', 'of', 'and', 'in', 'to', 'a', 'is', 'that', 'it', 'with', 'as', 'on', 'for', 'at')

    kept = []
    for token in str(sentence).split():
        # Comparison is case-insensitive; the original casing of kept words is preserved
        if token.lower() in fillers:
            continue
        kept.append(token)

    return ' '.join(kept)

In [4]:
import csv
from tqdm import tqdm

# Set download_coordinates to True if you want to download the coordinates
download_coordinates = False

# If you want to use Bing maps API, set use_Bing to True and enter your API key
use_Bing = False
bing_api_key = None

if download_coordinates:

    import time

    # Using Bing maps API for better performance. This requires a free account and an API key
    from geopy.geocoders import Bing

    # Alternatively we can use OpenStreetMaps API which is free and open source, but does not perform as well as Bing
    from geopy.geocoders import Nominatim

    # Pause between requests so the API does not block us. The public Nominatim
    # service allows at most one request per second; the shorter pause is only
    # used with Bing.
    request_delay = 0.2 if use_Bing else 1.0

    # Get the unique schools
    schools = data['school'].unique()

    # Loop through the schools and save the received coordinates in coords
    coords = []
    for count, school in enumerate(tqdm(schools), start=1):
        coord = get_coordinates(school,use_Bing=use_Bing,bing_api_key=bing_api_key)
        coords.append(coord)

        # Checkpoint: every 500 schools, dump everything collected so far.
        # UTF-8 is given explicitly so school names with non-ASCII characters
        # are written the same way on every platform.
        if count%500==0:
            with open('data/coords'+str(count)+'.csv','w', newline='', encoding='utf-8') as file:
                csv_writer = csv.writer(file)
                csv_writer.writerows(coords)

        time.sleep(request_delay)

    # Save all the coordinates (UTF-8 is also what pd.read_csv assumes by default)
    with open('data/coordinates.csv','w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(['school','lat','lon'])
        csv_writer.writerows(coords)


    print("Done Downloading Coordinates")

## Calculating descendants

### Data cleaning

Before we begin to calculate the descendants for each advisor, we need to fix three errors in the data, for `id = 105250`, `id = 23936` and `id = 256927`. Each of these data-entry errors causes the tree-descent algorithm to loop.

The student list of `id = 105250` contains `105250` itself. We remove `105250` from their student list.

In [5]:
students_of_105250 = data[data["id"] == 105250]['students'].values[0]
# Filter the list in place instead of calling list.remove(): re-running this
# cell then leaves the already-cleaned list unchanged rather than raising
# ValueError, and every occurrence of the self-reference is dropped.
students_of_105250[:] = [student for student in students_of_105250 if student != 105250]

The student list of `id = 23936` contains `23936` itself. We remove `23936` from their student list.

In [6]:
students_of_23936 = data[data["id"] == 23936]['students'].values[0]
# Filter the list in place instead of calling list.remove(): re-running this
# cell then leaves the already-cleaned list unchanged rather than raising
# ValueError, and every occurrence of the self-reference is dropped.
students_of_23936[:] = [student for student in students_of_23936 if student != 23936]

The student list of `id = 256927` contains `256928`, who is their advisor. We remove `256928` from their student list.

In [7]:
students_of_256927 = data[data["id"] == 256927]['students'].values[0]
# Filter the list in place instead of calling list.remove(): re-running this
# cell then leaves the already-cleaned list unchanged rather than raising
# ValueError, and every occurrence of the advisor's id is dropped.
students_of_256927[:] = [student for student in students_of_256927 if student != 256928]

### Calculating descendants

In [8]:
# Sort by year, newest first.
# NOTE(review): the traversal loop takes nodes from the END of the key list built
# from this ordering, so the oldest entries are visited first — confirm this is
# the intended direction for "leaves first".
data = data.sort_values('year', ascending=False)

# Map each id to that person's list of students. Zipping the two columns builds
# the same mapping (same order, later duplicates win) without constructing a
# Series for every row.
data_students_dict = dict(zip(data['id'], data['students']))

In [9]:
def count_descendants(data,des_dict,student_dict,remaining_nodes):
    """Advance the stack-driven descendant count by a single step.

    The node on top of ``remaining_nodes`` is examined. If one of its children
    has not been counted yet, that child is pushed on the stack and the call
    returns so the child is handled first. Otherwise the node's descendant set
    is completed, its count is stored and the node is popped.

    Parameters
    ----------
    data : dict
        Maps a node to the iterable of its children.
    des_dict : dict
        Maps a node to its descendant count; ``-1`` means "not counted yet".
        Updated in place. While a node is waiting for a child, it temporarily
        holds ``-2``.
    student_dict : dict
        Maps a node to the set of its descendants. Updated in place.
    remaining_nodes : list
        Work stack; the last element is processed. Updated in place.

    Raises
    ------
    ValueError
        If a child is an ancestor that is still waiting on the stack, i.e. the
        data contains a cycle (which would otherwise never terminate).
    IndexError
        If ``remaining_nodes`` is empty.
    """
    UNVISITED = -1      # value the caller seeds des_dict with
    IN_PROGRESS = -2    # node is on the stack waiting for one of its children

    node = remaining_nodes[-1]
    children = data[node]

    # The same node can sit on the stack more than once (its initial entry plus
    # a copy pushed by a parent). Once counted, its result is final, so the
    # remaining copies are simply discarded instead of being rescanned.
    if des_dict[node] >= 0:
        remaining_nodes.pop()
        return

    descendants = student_dict[node]
    for child in children:

        # A node listed as its own student is never its own descendant
        if child == node:
            continue

        if child not in data:
            # Child without an entry of its own: it has no students
            des_dict[child] = 0
            descendants.add(child)
            continue

        child_state = des_dict[child]
        if child_state == UNVISITED:
            # Count the child first; this node is revisited afterwards
            des_dict[node] = IN_PROGRESS
            remaining_nodes.append(child)
            return
        if child_state == IN_PROGRESS:
            raise ValueError(
                f"Cycle detected: {child!r} is listed as a student of {node!r} "
                f"but is also one of its ancestors"
            )

        descendants.add(child)
        descendants.update(student_dict[child])

    des_dict[node] = len(descendants)
    remaining_nodes.pop()

In [10]:
# -1 marks a node whose descendants have not been counted yet
des_dict = dict.fromkeys(data_students_dict, -1)
# Every node starts with an empty set of descendants
student_dict = {node_id: set() for node_id in data_students_dict}
# Work stack consumed by count_descendants, starting with every known node
remaining_nodes = list(data_students_dict)

count = 0
while remaining_nodes:
    count_descendants(data_students_dict,des_dict,student_dict,remaining_nodes)

    # Progress report every 100000 steps
    if not count % 100000:
        print(f"Iteration is {count}, Length of remaining nodes is {len(remaining_nodes)}")
    count += 1
print('Done')

Iteration is 0, Length of remaining nodes is 256768
Iteration is 100000, Length of remaining nodes is 255982
Iteration is 200000, Length of remaining nodes is 254208
Iteration is 300000, Length of remaining nodes is 254210
Iteration is 400000, Length of remaining nodes is 245438
Iteration is 500000, Length of remaining nodes is 200494
Iteration is 600000, Length of remaining nodes is 112544
Iteration is 700000, Length of remaining nodes is 12756
Done


In [11]:
# Turn des_dict into a two-column dataframe: the keys become `id`,
# the counts become `descendants`
descendants_df = (
    pd.DataFrame.from_dict(des_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'id', 0: 'descendants'})
)

### Merging all the data

In [12]:
# Read the school coordinates from data/coordinates.csv.
# The download cell writes this file with the header row school,lat,lon;
# `school` is the key the merge below joins on.
coordinates = pd.read_csv('data/coordinates.csv')

In [13]:
# Left joins keep every row of `data`: attach the descendant counts by id first,
# then the school coordinates by school name
merged_df = (
    data
    .merge(descendants_df, on='id', how='left')
    .merge(coordinates, on='school', how='left')
)

In [14]:
# Set save to True to write the merged table to data/merged_data.csv
save = False

if save:
    # index=False leaves the DataFrame's row index out of the file
    merged_df.to_csv('data/merged_data.csv', index=False)

## Heat Map

In [15]:
from folium import plugins
import folium

In [16]:
# Five-year windows starting at 1700; the last one starts at 2015
period_starts = range(1700,2020,5)

# One list of [lat, lon, total descendants] points per window
frames = []
for t0 in period_starts:
    period = merged_df[ (merged_df["year"]>=t0) & (merged_df['year']<t0+5)]

    # Total descendants per school among the people whose year falls in this window
    per_school = period.groupby('school')['descendants'].sum().reset_index()
    per_school = pd.merge(per_school,coordinates,on='school',how='left')

    # Schools without coordinates cannot be placed on the map
    points = per_school[["lat","lon","descendants"]].dropna()
    frames.append(points.values.tolist())

# Named heat_map rather than `map` so the builtin map() is not shadowed
# for the rest of the notebook
heat_map = folium.Map(location = [5,30], tiles = "Cartodb dark_matter", zoom_start=2)

# NOTE(review): the third value of each point is passed as the heat weight as-is
# (raw descendant totals). Check the weight range HeatMapWithTime expects and
# normalise if needed — TODO confirm.
plugins.HeatMapWithTime(
    frames,
    # Label every time step with its year window instead of a bare counter
    index=[f"{t0}-{t0+4}" for t0 in period_starts],
    auto_play=True,
    max_opacity=0.3,
).add_to(heat_map)
heat_map.save("heat_time.html")
heat_map