In [1]:
import requests
from bs4 import BeautifulSoup

# Path handed to requests' `verify=` option.
# NOTE(review): presumably a CA bundle for this host -- confirm.
ssl_verify_path = 'ch.pem'
url = 'https://genealogy.math.ndsu.nodak.edu/id.php?id=13141'

# requests waits forever by default, so a stalled server would hang the
# notebook; give up after this many seconds instead.
request_timeout_seconds = 30
page = requests.get(url, verify=ssl_verify_path, timeout=request_timeout_seconds)
# Stop on a 4xx/5xx reply rather than parsing an error page as if it were a profile.
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

In [2]:
# Grab the first <h2> on the page and dump its markup for inspection.
results = soup.h2
print(results.prettify())

<h2 style="text-align: center; margin-bottom: 0.5ex; margin-top: 1ex">
 Peter Mann Winkler
</h2>



# Inspection of page
* It looks like the only time the h2 tag will appear on the page is for the name
* Afterwards, all of the things I am looking for are pretty much right there

## Items I want to pull
1. Name  
    * between the open and close of only h2 tag
    * May not just be a first and last name. Figure out how to parse that later
2. MathSciNet link
    * The `href` of the link inside the `<p>` tag right after the h2 tag
3. College name
    * The text of the second `<span>` after the h2 tag (it is nested inside the first one)
4. Year of degree
    * The last word of the outer `<span>`'s text, just before that span closes
5. Dissertation name
    * the next div style. id is thesisTitle
6. Math Classification pt 1
    * The number that ndsu uses
    * Not all pages have this
    * Next div after dissertation
7. Math Classification pt 2
    * The name of the subfield
    * Again, not all pages have this
    * Part of the same div as math class pt 1
   

In [3]:
# Same lookup again, this time also showing the bare text of the tag.
results = soup.h2
print(results.prettify())
print(results.get_text().strip())

<h2 style="text-align: center; margin-bottom: 0.5ex; margin-top: 1ex">
 Peter Mann Winkler
</h2>

Peter Mann Winkler


In [4]:
# The <p> that follows the name heading holds the MathSciNet link.
name = soup.find('h2')
mathsci = name.find_next('p')
print(mathsci.prettify())
print(mathsci.get_text().strip())

<p style="text-align: center; margin-top: 0; margin-bottom: 0px; font-size: small">
 <a href="http://www.ams.org/mathscinet/MRAuthorID/183565">
  MathSciNet
 </a>
</p>

MathSciNet


In [5]:
# Step over two spans after the name: the second one holds the university.
college = (
    name
    .find_next('span')
    .find_next('span')
)
print(college.prettify())
print(college.get_text().strip())

<span style="color:
  #006633; margin-left: 0.5em">
 Yale University
</span>

Yale University


OK, so I think I have the gist down. Just need to put it together

In [6]:
# Get all of the fields I want as clean strings.
# Fields that a page does not have (MathSciNet link, year, classification)
# come out as None instead of raising or picking up unrelated text.
import re

name = soup.find('h2')
name_string = name.text.strip()

# MathSciNet link: look only inside the <p> right after the name. Searching the
# whole rest of the page for the next <a> would pick up an unrelated link on a
# page that has no MathSciNet entry.
mathsci = name.find_next('p')
mathsci_link = mathsci.find('a', href=True) if mathsci is not None else None
if mathsci_link is not None and 'mathscinet' not in mathsci_link['href'].lower():
    mathsci_link = None
mathsci_string = mathsci_link['href'] if mathsci_link is not None else None

# The spans are nested, so the outer one's text gives [phd, university, year]
# and the inner one holds just the university. Anchor on the name heading so
# this lookup does not depend on the MathSciNet paragraph being present.
degree_year = name.find_next('span')
degree_year_words = degree_year.text.split()
# The year is the last word. If it is not four digits the page has no year,
# so do not mistake part of the university name for one.
if degree_year_words and re.fullmatch(r'\d{4}', degree_year_words[-1]):
    degree_year_string = degree_year_words[-1]  # probably want as date for db
else:
    degree_year_string = None

college = degree_year.find_next('span')
college_string = college.text.strip()

# The title is the last line of the <div> that follows the spans.
dissertation = college.find_next('div')
dissertation_string = dissertation.text.strip().split('\n')
dissertation_string_final = dissertation_string[-1].strip()

# Not every page has a classification. When present, the <div> after the
# dissertation reads "<label>: <number><em dash><subfield>".
classification = dissertation.find_next('div')
classification_string = classification.text.strip() if classification is not None else ''
# Everything after the first colon; empty when there is no colon at all.
classification_split_text = classification_string.partition(':')[2]

# Number and subfield are joined by an em dash (U+2014); normalise it to '--'.
classification_split_utf = classification_split_text.replace('\u2014', '--')
number_part, dash, subfield_part = classification_split_utf.partition('--')
# Only trust the <div> when it has the expected shape with a numeric code in
# front of the dash; otherwise it is some other <div> and both fields stay None.
# NOTE(review): assumes the code is always purely numeric (e.g. '03') -- confirm.
if dash and number_part.strip().isdigit():
    classification_number = number_part.strip()
    classification_name_string = subfield_part.strip()
else:
    classification_number = None
    classification_name_string = None

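I think that should work for every academic's page, as long as the page has all of the fields above (not all pages have the classification).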

# Next Step
OK, now I need to put them all together. Maybe in a dictionary? 
I guess not. I probably want to update the database as soon as I get the name. But, I could use dictionaries for testing

In [7]:
# Bundle the scraped values into one record; handy for eyeballing the result.
academic = dict([
    ("Name", name_string),
    ("MathSci", mathsci_string),
    ("Degree Year", degree_year_string),
    ("University", college_string),
    ("Dissertation Title", dissertation_string_final),
    ("Math Genre Number", classification_number),
    ("Math Subfield", classification_name_string),
])
print(academic)

{'Name': 'Peter Mann Winkler', 'MathSci': 'http://www.ams.org/mathscinet/MRAuthorID/183565', 'Degree Year': '1975', 'University': 'Yale University', 'Dissertation Title': 'Assignment of Skolem Functions for Model-Complete Theories', 'Math Genre Number': '03', 'Math Subfield': 'Mathematical logic and foundations'}




# Try it as a table

In [8]:
# Column headings for a tab-separated table. SubNum comes before Subfield so
# the headings line up with the order of the values in `academic` and `test`.
print("Name\t\tMathSci\t\tDegree\t\tCollege\t\tThesis\t\tSubNum\t\tSubfield")

Name		MathSci		Degree		College		Thesis		Subfield		SubNum


In [9]:
# One value per line, in the order the fields were inserted.
for field_value in academic.values():
    print(field_value)

Peter Mann Winkler
http://www.ams.org/mathscinet/MRAuthorID/183565
1975
Yale University
Assignment of Skolem Functions for Model-Complete Theories
03
Mathematical logic and foundations


In [10]:
# Same values as a flat list, in the same order as the `academic` dictionary.
test = [
    name_string,
    mathsci_string,
    degree_year_string,
    college_string,
    dissertation_string_final,
    classification_number,
    classification_name_string,
]

In [11]:
# Show the flat list of scraped values.
print(test)

['Peter Mann Winkler', 'http://www.ams.org/mathscinet/MRAuthorID/183565', '1975', 'Yale University', 'Assignment of Skolem Functions for Model-Complete Theories', '03', 'Mathematical logic and foundations']
