<h1 align='center'>MSDE 692</h1>

[Function Definitions](#Function-Definitions)

[Data Cleaning](#Data-Cleaning)

In [73]:
import os
import pandas as pd
import neo4j
import numpy as np
import csv

from functools import reduce
from datetime import datetime as dt
from csv import DictReader, writer

## Function Definitions

In [95]:
def csv_reader(csv_file):
    final_list = []

    with open(csv_file, 'r') as f:
        input_list = csv.DictReader(f)

        for item in input_list:
            final_list.append(item['wiki'])
            
    return final_list

In [110]:
def people_pruner(my_file):
    my_list = []

    with open(my_file, 'r') as f:
        reader = DictReader(f, fieldnames='person')

        for row in reader:
            if [row['p']] not in my_list:
                my_list.append([row['p']])
            else:
                print(f"Duplicate person: {row['p']}")
    
    return my_list

In [113]:
def csv_writer(my_file, my_list):

    with open(my_file, 'w', newline='') as outfile:

        # using csv.writer method from CSV package
        csv_writer = writer(outfile)

        for row in my_list:
            csv_writer.writerows(row)

In [115]:
def csv_dict_writer(my_file, my_list):
    with open(my_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['wiki'])
        
        for row in my_list:
            writer.writerow({'wiki': row})

## Data Cleaning

### Create Difference CSV File for wikigrabber Spider Runs

In [150]:
people_list = csv_reader('wikigrabber/people.csv')
people_base_list = csv_reader('wikigrabber/people_base.csv')
people_set = set(people_list)
people_base_set = set(people_base_list)

In [151]:
# The difference between people and people_base are the new wiki pages discovered
# These will be used for the next grabwikis spider run
new_names = list(set.difference(people_set, people_base_set))
new_names

['Hugh_Cholmondeley,_5th_Baron_Delamere',
 'Danford_N._Barney',
 'Stella_Hammerstein',
 'David_Bailey',
 'Henry_White_(diplomat)',
 'Gladys_Deacon',
 'Peggy_Schuyler',
 'Lucy_Wharton_Drexel',
 'Henry_Cadogan',
 'Thomas_van_Straubenzee',
 'Henry_Paget,_1st_Marquess_of_Anglesey',
 'Frances_Ford_Seymour',
 'Caroline_Townshend,_1st_Baroness_Greenwich',
 'Jeffrey_Lynn',
 'Lord_Herbert_Montagu_Douglas_Scott',
 'Auchincloss',
 'Brigitte_Bardot',
 'John_Jacob_Astor,_1st_Baron_Astor_of_Hever',
 'Robert_Montagu,_3rd_Duke_of_Manchester',
 'Elizabeth_Herbert,_Countess_of_Pembroke_and_Montgomery',
 'Michelle_Phillips',
 'Wilhelm_von_Opel',
 'Alice_Claypoole_Vanderbilt',
 'Jean_Templeton_Ward',
 'Elizabeth_Harrison_Walker',
 'Blanche_de_Loosey_Oelrichs',
 'Robert_Livingston_Gerry,_Sr.#Family',
 'James_Watson_Webb',
 'Alexander_Slidell_Mackenzie',
 'Robert_Guestier_Goelet',
 'Charles_Vane-Tempest-Stewart,_6th_Marquess_of_Londonderry',
 'George_Macculloch_Miller',
 'Richard_Burton',
 'Maturin_Livingst

In [152]:
len(new_names)

902

In [153]:
csv_dict_writer('wikigrabber/new_people_base.csv', new_names)

### Explore people_data_temp.csv file

In [445]:
df = pd.read_csv('wikicrawler/people_data_temp.csv')
df

Unnamed: 0,schools,degrees,name,full_name,DOB,occupation,spouses,offspring
0,London School of Economics,"MSc,BSc",George Soros,György Schwartz,1930-08-12 00:00:00,"Investor, hedge fund manager, author, philanth...",Susan Weber (historian),"Jonathan Soros,Alexander Soros"
1,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,,Lynne Cheney,"Liz Cheney,Mary Cheney"
2,University of Fribourg,,Klaus Schwab,,1938-03-30 00:00:00,World Economic Forum,Hilde Schwab,2
3,"DePauw University,Indiana University, Indianap...","BA,JD",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,,Marilyn Quayle,Ben Quayle
4,"Dartmouth College,University College, Oxford,Y...","MA,BA,JD",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,,Clare Dalton,Sam Reich
...,...,...,...,...,...,...,...,...
1310,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,,Constance Lane Rogers,Theodore Roosevelt VI
1311,Harvard University,"BA,LLB",Charles Francis Adams III,,1866-08-02 00:00:00,,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1312,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,,
1313,"Yale University,Princeton University,Universit...","MA,BA,JD",Bob Taft,Robert Alphonso Taft III,1942-01-08 00:00:00,,,Anna Taft


In [446]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1315 entries, 0 to 1314
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   schools     842 non-null    object
 1   degrees     320 non-null    object
 2   name        1233 non-null   object
 3   full_name   581 non-null    object
 4   DOB         1225 non-null   object
 5   occupation  405 non-null    object
 6   spouses     698 non-null    object
 7   offspring   574 non-null    object
dtypes: object(8)
memory usage: 82.3+ KB


In [447]:
df_full = df.dropna(how='all')
df_full

Unnamed: 0,schools,degrees,name,full_name,DOB,occupation,spouses,offspring
0,London School of Economics,"MSc,BSc",George Soros,György Schwartz,1930-08-12 00:00:00,"Investor, hedge fund manager, author, philanth...",Susan Weber (historian),"Jonathan Soros,Alexander Soros"
1,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,,Lynne Cheney,"Liz Cheney,Mary Cheney"
2,University of Fribourg,,Klaus Schwab,,1938-03-30 00:00:00,World Economic Forum,Hilde Schwab,2
3,"DePauw University,Indiana University, Indianap...","BA,JD",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,,Marilyn Quayle,Ben Quayle
4,"Dartmouth College,University College, Oxford,Y...","MA,BA,JD",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,,Clare Dalton,Sam Reich
...,...,...,...,...,...,...,...,...
1310,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,,Constance Lane Rogers,Theodore Roosevelt VI
1311,Harvard University,"BA,LLB",Charles Francis Adams III,,1866-08-02 00:00:00,,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1312,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,,
1313,"Yale University,Princeton University,Universit...","MA,BA,JD",Bob Taft,Robert Alphonso Taft III,1942-01-08 00:00:00,,,Anna Taft


In [458]:
# Drop all rows without a name
df_all_names = df_full.dropna(subset=['name'])
df_all_names                  

Unnamed: 0,schools,degrees,name,full_name,DOB,occupation,spouses,offspring
0,London School of Economics,"MSc,BSc",George Soros,György Schwartz,1930-08-12 00:00:00,"Investor, hedge fund manager, author, philanth...",Susan Weber (historian),"Jonathan Soros,Alexander Soros"
1,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,,Lynne Cheney,"Liz Cheney,Mary Cheney"
2,University of Fribourg,,Klaus Schwab,,1938-03-30 00:00:00,World Economic Forum,Hilde Schwab,2
3,"DePauw University,Indiana University, Indianap...","BA,JD",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,,Marilyn Quayle,Ben Quayle
4,"Dartmouth College,University College, Oxford,Y...","MA,BA,JD",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,,Clare Dalton,Sam Reich
...,...,...,...,...,...,...,...,...
1310,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,,Constance Lane Rogers,Theodore Roosevelt VI
1311,Harvard University,"BA,LLB",Charles Francis Adams III,,1866-08-02 00:00:00,,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1312,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,,
1313,"Yale University,Princeton University,Universit...","MA,BA,JD",Bob Taft,Robert Alphonso Taft III,1942-01-08 00:00:00,,,Anna Taft


In [459]:
df_all_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1233 entries, 0 to 1314
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   schools     835 non-null    object
 1   degrees     320 non-null    object
 2   name        1233 non-null   object
 3   full_name   581 non-null    object
 4   DOB         1163 non-null   object
 5   occupation  405 non-null    object
 6   spouses     697 non-null    object
 7   offspring   573 non-null    object
dtypes: object(8)
memory usage: 86.7+ KB


In [460]:
df_no_gaps = df_all_names.drop_duplicates()
df_no_gaps

Unnamed: 0,schools,degrees,name,full_name,DOB,occupation,spouses,offspring
0,London School of Economics,"MSc,BSc",George Soros,György Schwartz,1930-08-12 00:00:00,"Investor, hedge fund manager, author, philanth...",Susan Weber (historian),"Jonathan Soros,Alexander Soros"
1,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,,Lynne Cheney,"Liz Cheney,Mary Cheney"
2,University of Fribourg,,Klaus Schwab,,1938-03-30 00:00:00,World Economic Forum,Hilde Schwab,2
3,"DePauw University,Indiana University, Indianap...","BA,JD",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,,Marilyn Quayle,Ben Quayle
4,"Dartmouth College,University College, Oxford,Y...","MA,BA,JD",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,,Clare Dalton,Sam Reich
...,...,...,...,...,...,...,...,...
1310,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,,Constance Lane Rogers,Theodore Roosevelt VI
1311,Harvard University,"BA,LLB",Charles Francis Adams III,,1866-08-02 00:00:00,,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1312,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,,
1313,"Yale University,Princeton University,Universit...","MA,BA,JD",Bob Taft,Robert Alphonso Taft III,1942-01-08 00:00:00,,,Anna Taft


In [461]:
df_no_gaps.to_csv('wikicrawler/people_data.csv')

In [462]:
df_degrees = df_no_gaps.dropna(subset=['schools', 'degrees'])

In [463]:
# Rearrange columns and sort on name
df_degrees = df_degrees[['name', 'full_name', 'DOB', 'occupation', 'schools', 'degrees', 'spouses', 'offspring']]
df_degrees.sort_values(by='name', ignore_index=True, inplace=True)
df_degrees

Unnamed: 0,name,full_name,DOB,occupation,schools,degrees,spouses,offspring
0,A. Clayton Spencer,Ava Clayton Spencer,1954-12-15 00:00:00,,"Williams College,University of Oxford,Harvard ...","MA,BA,JD",Ash Carter,2
1,Abdul El-Sayed,Abdulrahman Mohamed El-Sayed,1984-10-31 00:00:00,,"University of Michigan,Oriel College, Oxford,C...","MA,PhD,MD,BS",Sarah Jukaku,
2,Alan Bersin,,1946-10-15 00:00:00,,"Harvard University,Yale University","BA,JD",Lisa Foster,
3,Alan Chester Valentine,,1901-02-23 00:00:00,,"Swarthmore College,University of Pennsylvania,...","University of Pennsylvania,Balliol College, Ox...",Lucia Garrison Norton,Annie Laurie Buffinton
4,Alan Greenspan,,1926-03-06 00:00:00,,New York University,"PhD,MA,BA",Andrea Mitchell,
...,...,...,...,...,...,...,...,...
308,William McAdoo,William Gibbs McAdoo Jr.,1863-10-31 00:00:00,,"University of Tennessee, Knoxville",BA,Eleanor Wilson McAdoo,9
309,William McRae,William Allan McRae Jr.,1909-09-25 00:00:00,,"University of Florida,University of Oxford,Fre...","B.A.,J.D.,B.Litt.,A.B.,B.A.,J.D.,B.Litt.,A.B.",,
310,William Taft,,1945-09-13 00:00:00,,"Yale University,Harvard University","BA,JD",Julia V. Taft,3
311,Wilson Elkins,Wilson Homer Elkins,1908-07-09 00:00:00,,"University of Texas, Austin","MA,BA",Dorothy Blackburn,2


In [454]:
df_degrees.set_index('name', drop=True, inplace=True)
df_degrees

Unnamed: 0_level_0,full_name,DOB,occupation,schools,degrees,spouses,offspring
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A. Clayton Spencer,Ava Clayton Spencer,1954-12-15 00:00:00,,"Williams College,University of Oxford,Harvard ...","MA,BA,JD",Ash Carter,2
Abdul El-Sayed,Abdulrahman Mohamed El-Sayed,1984-10-31 00:00:00,,"University of Michigan,Oriel College, Oxford,C...","MA,PhD,MD,BS",Sarah Jukaku,
Alan Bersin,,1946-10-15 00:00:00,,"Harvard University,Yale University","BA,JD",Lisa Foster,
Alan Chester Valentine,,1901-02-23 00:00:00,,"Swarthmore College,University of Pennsylvania,...","University of Pennsylvania,Balliol College, Ox...",Lucia Garrison Norton,Annie Laurie Buffinton
Alan Greenspan,,1926-03-06 00:00:00,,New York University,"PhD,MA,BA",Andrea Mitchell,
...,...,...,...,...,...,...,...
William McAdoo,William Gibbs McAdoo Jr.,1863-10-31 00:00:00,,"University of Tennessee, Knoxville",BA,Eleanor Wilson McAdoo,9
William McRae,William Allan McRae Jr.,1909-09-25 00:00:00,,"University of Florida,University of Oxford,Fre...","B.A.,J.D.,B.Litt.,A.B.,B.A.,J.D.,B.Litt.,A.B.",,
William Taft,,1945-09-13 00:00:00,,"Yale University,Harvard University","BA,JD",Julia V. Taft,3
Wilson Elkins,Wilson Homer Elkins,1908-07-09 00:00:00,,"University of Texas, Austin","MA,BA",Dorothy Blackburn,2


In [455]:
# df_degrees = df_degrees.replace({np.nan:None})

In [456]:
def date_converter(x):
    if x:
        if '-' in x:
            updated = dt.strptime(str(x), '%Y-%m-%d %H:%M:%S')
        elif '/' in x:
            updated = dt.strptime(str(x), '%m/%d/%Y %H:%M:%S')
        else:
            return None
        print(updated)
        return updated.strftme('%Y-%m-%d')        
    else:
        return None

In [457]:
# df_degrees['DOB'] = np.vectorize(date_converter)(df_degrees['DOB'])
df_degrees['DOB'] = pd.to_datetime(df_degrees.DOB)
df_degrees['DOB'] = df_degrees['DOB'].dt.strftime('%Y-%m-%d')
df_degrees

ValueError: ('Unknown string format:', '1989 or 1990 (age\xa031–32)')

In [None]:
df_degrees.to_csv('wikicrawler/people_data_full.csv')

## Rhodes Scholars

In [None]:
rhodes_df = pd.read_csv('rhodescholars/rhodes_data.csv')
rhodes_df

In [None]:
df_occupations = rhodes_df.merge(df_no_gaps, on='name', how='outer')
df_occupations

In [414]:
df_occupations.to_csv('wikicrawler/merged_occupations.csv')