<h1 align='center'>MSDE 692</h1>

[Global Variables](#Global-Variables)

[Function Definitions](#Function-Definitions)

[Data Cleaning](#Data-Cleaning)
- [Create Difference CSV File for wikigrabber Spider Runs](#Create-Difference-CSV-File-for-wikigrabber-Spider-Runs)
- [Text Manipulation](#Text-Manipulation)
- [WikiGrabber Results Cleaning](#WikiGrabber-Results-Cleaning)
- [WikiCrawler Results Cleaning](#WikiCrawler-Results-Cleaning)
    - [Data Type Transformations](#Data-Type-Transformations)
    - [Create CSV file for each node](#Create-CSV-file-for-each-node)

In [1]:
import os
import pandas as pd
import neo4j
import numpy as np
import csv
import shutil
import urllib
import codecs
import neo4j

from dateutil import relativedelta
from scrapy.crawler import CrawlerProcess
from pprint import pprint
from deepdiff import DeepDiff
from functools import reduce
from datetime import datetime as dt
from datetime import date
from csv import DictReader, writer

In [2]:
# https://www.youtube.com/watch?v=5Is-QdbKmEI

## Global Variables

In [3]:
OMIT_LIST = ['Wikipedia:', 'Â', 'Ã', '#', 'index']

## Function Definitions

In [4]:
def csv_reader(csv_file):
    final_list = []

    with open(csv_file, 'r', encoding='unicode_escape') as f:
        input_list = csv.reader(f)

        for row in input_list:
            final_list.append(row[0])
            
    return final_list

In [5]:
def people_pruner(my_file):
    my_list = []

    with open(my_file, 'r') as f:
        reader = DictReader(f, fieldnames='person')

        for row in reader:
            if [row['p']] not in my_list:
                my_list.append([row['p']])
            else:
                print(f"Duplicate person: {row['p']}")
    
    return my_list

In [6]:
def csv_writer(my_file, my_mode, my_list):

    with open(my_file, my_mode, newline='', encoding="utf-8") as outfile:
        csv_writer = writer(outfile)

        for row in my_list:
            if not (":" in row or row == 'wiki' or row.startswith(tuple(OMIT_LIST)) or "Ã" in row):
                csv_writer.writerow([row])

In [7]:
def csv_dict_writer(my_file, my_list, my_mode):
    with open(my_file, my_mode, newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['wiki'])
        
        for row in my_list:
            if ":" not in row and row != 'wiki' and not row.startswith(tuple(OMIT_LIST)):
                writer.writerow({'wiki': row})

In [8]:
def add_new_wikis(adding_file, gaining_file, base_file):
    """
    Adds newly discovered wiki pages to an accumulative wiki pages file before deleting the previous base file
    used to extract the new wikis. The previous base file is then deleted and the adding_file is renamed to base_file
    for the next grabwikis spider run.
    """
    base_list_tmp = csv_reader(base_file)
    adding_list = csv_reader(adding_file)
    gaining_list = csv_reader(gaining_file)
    base_set = set(base_list_tmp)
    adding_set = set(adding_list)
    gaining_set = set(gaining_list)
    
    new_base_wikis = list(set.difference(base_set, gaining_set))
    new_wikis = list(set.difference(adding_set, gaining_set))
    
    if len(new_wikis) > 0:
        print(f"Adding {len(new_wikis)} new wiki pages to {gaining_file}")
        csv_writer(gaining_file, 'a', new_wikis)
    if len(new_base_wikis) > 0:
        print(f"Adding {len(new_base_wikis)} new base wiki pages to {gaining_file}")
        csv_writer(gaining_file, 'a', new_base_wikis)
#         csv_dict_writer(base_file, new_wikis, 'w')
#         os.remove(adding_file)
    os.remove(base_file)
#     shutil.copy(adding_file, base_file)
    os.remove(adding_file)
    csv_writer(base_file, 'w', new_wikis)

In [9]:
def clean_df(my_df, out_file):
    all_people_df = my_df.drop_duplicates()
    all_people_df = all_people_df[all_people_df.wiki.str.contains(":")==False]
#     all_people_df['wiki'] = all_people_df['wiki'].apply(lambda x: urllib.parse.unquote(x))
    all_people_df = all_people_df[~all_people_df['wiki'].str[:].str.contains("Ã")]    
    all_people_df = all_people_df[~all_people_df['wiki'].str[0].str.isdigit()]    
    all_people_df.sort_values('wiki', inplace=True)
    all_people_df.set_index('wiki', drop=True, inplace=True)
    all_people_df.to_csv('wikigrabber/all_people.csv', encoding='utf-8')    

In [10]:
urllib.parse.unquote("%C4%90or%C4%91e_Stoj%C5%A1i%C4%87_(Serbian_politician,_1928%E2%80%932014)")

'Đorđe_Stojšić_(Serbian_politician,_1928–2014)'

In [11]:
urllib.parse.unquote("Zuzana_%C4%8Caputov%C3%A1")

'Zuzana_Čaputová'

## Data Cleaning

### Create Difference CSV File for wikigrabber Spider Runs

In [12]:
add_new_wikis('wikigrabber/people.csv', 'wikicrawler/all_people.csv', 'wikigrabber/people_base.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'wikigrabber/people.csv'

In [61]:
df = pd.read_csv('wikigrabber/all_people.csv', encoding='unicode_escape')
# df['wiki'] = df['wiki'].apply(lambda x: urllib.parse.unquote(x))
df.to_csv('wikigrabber/people_unclean.csv', encoding='unicode_escape')
df

Unnamed: 0,wiki
0,%C3%81kos_Birtalan
1,%C3%81lfhei%C3%B0ur_Ingad%C3%B3ttir
2,%C3%81lvaro_Ara%C3%BAjo_Castro
3,%C3%81lvaro_El%C3%ADas_Loredo
4,%C3%81lvaro_G%C3%B3mez_Hurtado
...,...
75803,Chris_Licht
75804,Megyn_Kelly
75805,3_(number)
75806,Michael_Jackson


### Text Manipulation

In [211]:
df = pd.read_csv('wikigrabber/people.csv', encoding='unicode_escape')

FileNotFoundError: [Errno 2] No such file or directory: 'wikigrabber/people.csv'

In [13]:
df.to_csv('wikigrabber/people_unclean.csv')

In [24]:
df['name'] = df['name'].apply(lambda x: x.strip())
df['name'] = df['name'].apply(lambda x: x.replace(' ', '_'))
df

Unnamed: 0,name
0,Adewale_Adeyemo
1,Afsin_Yurdakul
2,Albert_Bourla
3,Alex_Karp
4,Ana_Pinho
...,...
398,Mark_Tucker
399,Jessica_Uhl
400,Ulrik_Vestergaard_Knudsen
401,Darren_Walker


In [276]:
df.to_csv('wikicrawler/all_people.csv')

In [278]:
all_people_df = df.drop_duplicates()
all_people_df = all_people_df[all_people_df.wiki.str.contains(":")==False]
all_people_df.sort_values('wiki', inplace=True)
all_people_df.set_index('wiki', drop=True, inplace=True)
all_people_df.to_csv('wikigrabber/all_people.csv')
all_people_df

(Barbara)_Hazel_Guggenheim_King-Farlow_McKinley
A.A._Ames
A.P.J._Abdul_Kalam
A.T._Smith
A.V._Balakrishnan
...
ÃÂÃÂtienne_Hirsch
ÃÂÃÂ¯ÃÂÃÂ»ÃÂÃÂ¿Edward_VIII
ÃÂÃÂ½ivko_RadiÃÂÃÂ¡iÃÂÃÂ
ÃÂmile_Ollivier
ï»¿Jimi_Hendrix


In [343]:
url_df = pd.read_csv('wikigrabber/all_people.csv', encoding='unicode_escape')
url_df

Unnamed: 0,wiki
0,%C3%81kos_Birtalan
1,%C3%81lvaro_Nadal
2,%C3%81ngel_Acebes
3,%C3%81ngeles_Amador
4,%C3%85sa_Lindhagen
...,...
68503,Zweli_Mkhize
68504,Zygmunt_Janiszewski
68505,Zygmunt_Zalcwasser
68506,Zyon_Braun


In [344]:
url_df['wiki'] = url_df['wiki'].apply(lambda x: urllib.parse.unquote(x))

In [346]:
url_df.to_csv('wikigrabber/all_people.csv')

### WikiGrabber Results Cleaning

In [14]:
df = pd.read_csv('wikigrabber/all_people.csv', encoding='unicode_escape')
df

Unnamed: 0,wiki
0,%C3%81kos_Birtalan
1,%C3%81lfhei%C3%B0ur_Ingad%C3%B3ttir
2,%C3%81lvaro_Ara%C3%BAjo_Castro
3,%C3%81lvaro_El%C3%ADas_Loredo
4,%C3%81lvaro_G%C3%B3mez_Hurtado
...,...
75335,Zwentibold
75336,Zygmunt_Janiszewski
75337,Zygmunt_Zalcwasser
75338,Zyon_Braun


In [None]:
df['wiki'] = df['wiki'].apply(lambda x: x if not x.startswith())

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47952 entries, 0 to 47951
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       47251 non-null  object
 1   full_name  16425 non-null  object
 2   born       45753 non-null  object
 3   died       28590 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


### WikiCrawler Results Cleaning

In [13]:
df = pd.read_csv('wikicrawler/wikicrawler_results.csv', encoding='unicode_escape')
df

Unnamed: 0,name,full_name,born,died,citizenship,known_for,schools,degrees,organizations,institutions,...,house,title,doctoral_advisor,fields,positions,occupation,employer,political_party,board_member,labels
0,Michael Jackson,Michael Joseph Jackson,1958-08-29,"June 25, 2009",,,,,,,...,Jackson,,,,,"Singer,songwriter,dancer,record producer",,,,"Steeltown,Motown,Epic,Legacy,Sony,MJJ Productions"
1,Ãlvaro ElÃ­as Loredo,,1947-02-19,,,,Autonomous University of San Luis PotosÃ­,,,,...,,,,,,Lawyer and politician,,PAN,,
2,Charles,,1948-11-14,,,,"Gordonstoun,University of Cambridge",,,,...,Windsor,,,,,,,,,
3,Kat Timpf,Katherine Clare Timpf,1988-10-29,,,"Gutfeld!,National Review,Fox News Specialists","Hillsdale College,BA",,,,...,,,,,,Television personality,,,,
4,Elon Musk,Elon Reeve Musk,1971-06-28,,"United States,[1]",,University of Pennsylvania,"BA,BS",,,...,Musk,"Founder, CEO, and Chief Engineer of ,SpaceX,CE...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65153,Zweli Mkhize,Zwelini Lawrence Mkhize,1956-02-02,,,,"University of Natal,M.B.",,,,...,,,,,"Minister of Health of South Africa,Minister of...","Doctor,legislator",,African National Congress,,
65154,Zyon Braun,Zyon Braun,1994-08-17,,,,FOM University of Applied Sciences for Economi...,,,,...,,,,,Leader of the Free Democratic Party in Branden...,"Politician,Banker",,Free Democratic Party,,
65155,Zuzana ZvolenskÃ¡,,1972-01-27,,,,Comenius University,,,,...,,,,,Minister of Health,Politician,,Direction-Social Democracy,,
65156,Zyta Gilowska,,1949-07-07,5 April 2016,,,"Warsaw University,Maria Curie-Sklodowska Unive...",,,,...,,,,,"Minister of Finance,Deputy Prime Minister",,,"Freedom Union,Civic Platform",,


In [102]:
# Delete blank records 
df_full = df.dropna(subset=['name', 'born'])
df_full = df_full.replace({np.nan: None})
df_full['born'] = df.born.apply(lambda x: pd.to_datetime(x, errors='coerce'))
df_full['died'] = df.died.apply(lambda x: pd.to_datetime(x, errors='coerce'))
# df_full = df_full[pd.to_datetime(df_full['born'], errors='coerce').notna()]
# df_full = df_full[pd.to_datetime(df_full['died'], errors='coerce').notna()]
df_full['died'] = df_full['died'].astype(str).replace({'NaT': None})
df_full = df_full.dropna(subset=['born'])
df_full

Unnamed: 0,name,full_name,born,died,citizenship,known_for,schools,degrees,organizations,institutions,...,house,title,doctoral_advisor,fields,positions,occupation,employer,political_party,board_member,labels
0,Michael Jackson,Michael Joseph Jackson,1958-08-29,2009-06-25,,,,,,,...,Jackson,,,,,"Singer,songwriter,dancer,record producer",,,,"Steeltown,Motown,Epic,Legacy,Sony,MJJ Productions"
1,Ãlvaro ElÃ­as Loredo,,1947-02-19,,,,Autonomous University of San Luis PotosÃ­,,,,...,,,,,,Lawyer and politician,,PAN,,
2,Charles,,1948-11-14,,,,"Gordonstoun,University of Cambridge",,,,...,Windsor,,,,,,,,,
3,Kat Timpf,Katherine Clare Timpf,1988-10-29,,,"Gutfeld!,National Review,Fox News Specialists","Hillsdale College,BA",,,,...,,,,,,Television personality,,,,
4,Elon Musk,Elon Reeve Musk,1971-06-28,,"United States,[1]",,University of Pennsylvania,"BA,BS",,,...,Musk,"Founder, CEO, and Chief Engineer of ,SpaceX,CE...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65153,Zweli Mkhize,Zwelini Lawrence Mkhize,1956-02-02,,,,"University of Natal,M.B.",,,,...,,,,,"Minister of Health of South Africa,Minister of...","Doctor,legislator",,African National Congress,,
65154,Zyon Braun,Zyon Braun,1994-08-17,,,,FOM University of Applied Sciences for Economi...,,,,...,,,,,Leader of the Free Democratic Party in Branden...,"Politician,Banker",,Free Democratic Party,,
65155,Zuzana ZvolenskÃ¡,,1972-01-27,,,,Comenius University,,,,...,,,,,Minister of Health,Politician,,Direction-Social Democracy,,
65156,Zyta Gilowska,,1949-07-07,2016-04-05,,,"Warsaw University,Maria Curie-Sklodowska Unive...",,,,...,,,,,"Minister of Finance,Deputy Prime Minister",,,"Freedom Union,Civic Platform",,


### Data Type Transformations

In [103]:
# Function creates list from comma separated values in string
# Removes '' and '\n' values
def str_to_list(my_str):
    final_vec = []
#     print(type(my_str))
    if my_str != None:
        vector = [v for v in my_str.split(',') if v not in ['', '\n']]
        return vector

In [104]:
multi_val_cols = ['title',
                  'positions',
                  'institutions',
                  'occupation',
                  'labels',
                  'fields',
                  'known_for',
                  'schools',
                  'degrees',
                  'spouses',
                  'parents',
                  'relatives',
                  'political_party',
                  'employer',
                  'board_member']

In [105]:
# Clean empty values in cells
df_full[multi_val_cols] = np.vectorize(str_to_list)(df_full[multi_val_cols])
df_full

Unnamed: 0,name,full_name,born,died,citizenship,known_for,schools,degrees,organizations,institutions,...,house,title,doctoral_advisor,fields,positions,occupation,employer,political_party,board_member,labels
0,Michael Jackson,Michael Joseph Jackson,1958-08-29,2009-06-25,,,,,,,...,Jackson,,,,,"[Singer, songwriter, dancer, record producer]",,,,"[Steeltown, Motown, Epic, Legacy, Sony, MJJ Pr..."
1,Ãlvaro ElÃ­as Loredo,,1947-02-19,,,,[Autonomous University of San Luis PotosÃ­],,,,...,,,,,,[Lawyer and politician],,[PAN],,
2,Charles,,1948-11-14,,,,"[Gordonstoun, University of Cambridge]",,,,...,Windsor,,,,,,,,,
3,Kat Timpf,Katherine Clare Timpf,1988-10-29,,,"[Gutfeld!, National Review, Fox News Specialists]","[Hillsdale College, BA]",,,,...,,,,,,[Television personality],,,,
4,Elon Musk,Elon Reeve Musk,1971-06-28,,"United States,[1]",,[University of Pennsylvania],"[BA, BS]",,,...,Musk,"[Founder, CEO, and Chief Engineer of , Space...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65153,Zweli Mkhize,Zwelini Lawrence Mkhize,1956-02-02,,,,"[University of Natal, M.B.]",,,,...,,,,,"[Minister of Health of South Africa, Minister ...","[Doctor, legislator]",,[African National Congress],,
65154,Zyon Braun,Zyon Braun,1994-08-17,,,,[FOM University of Applied Sciences for Econom...,,,,...,,,,,[Leader of the Free Democratic Party in Brande...,"[Politician, Banker]",,[Free Democratic Party],,
65155,Zuzana ZvolenskÃ¡,,1972-01-27,,,,[Comenius University],,,,...,,,,,[Minister of Health],[Politician],,[Direction-Social Democracy],,
65156,Zyta Gilowska,,1949-07-07,2016-04-05,,,"[Warsaw University, Maria Curie-Sklodowska Uni...",,,,...,,,,,"[Minister of Finance, Deputy Prime Minister]",,,"[Freedom Union, Civic Platform]",,


In [106]:
# Function calculateds age
def calc_age(my_born, my_died):
    born = pd.to_datetime(my_born)
    if my_died is None:
        today = dt.now()
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    else:
#         my_age = relativedelta.relativedelta(died - born)
        died = pd.to_datetime(my_died)
        return (died - born).days / 365

In [107]:
chars_to_remove = ['c', '.', 'circa', 'Unkown', '/']

# Should handle 
def convert_circa_dates(my_date):
    if my_date.startswith(tuple(chars_to_remove)):
        clean_date = ''.join([c for c in my_date if c not in chars_to_remove])
        return clean_date
    else:
        return my_date

In [108]:
def datetime_to_mmddyyyy(my_date):
    if my_date is not None:
        return dt.strftime('%Y-%m-%d', '%m/%d/%Y')
    else:
        return None

In [109]:
df_full['age']= np.vectorize(calc_age)(df_full['born'], df_full['died'])
df_full

Unnamed: 0,name,full_name,born,died,citizenship,known_for,schools,degrees,organizations,institutions,...,title,doctoral_advisor,fields,positions,occupation,employer,political_party,board_member,labels,age
0,Michael Jackson,Michael Joseph Jackson,1958-08-29,2009-06-25,,,,,,,...,,,,,"[Singer, songwriter, dancer, record producer]",,,,"[Steeltown, Motown, Epic, Legacy, Sony, MJJ Pr...",50.857534
1,Ãlvaro ElÃ­as Loredo,,1947-02-19,,,,[Autonomous University of San Luis PotosÃ­],,,,...,,,,,[Lawyer and politician],,[PAN],,,75.000000
2,Charles,,1948-11-14,,,,"[Gordonstoun, University of Cambridge]",,,,...,,,,,,,,,,73.000000
3,Kat Timpf,Katherine Clare Timpf,1988-10-29,,,"[Gutfeld!, National Review, Fox News Specialists]","[Hillsdale College, BA]",,,,...,,,,,[Television personality],,,,,33.000000
4,Elon Musk,Elon Reeve Musk,1971-06-28,,"United States,[1]",,[University of Pennsylvania],"[BA, BS]",,,...,"[Founder, CEO, and Chief Engineer of , Space...",,,,,,,,,50.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65153,Zweli Mkhize,Zwelini Lawrence Mkhize,1956-02-02,,,,"[University of Natal, M.B.]",,,,...,,,,"[Minister of Health of South Africa, Minister ...","[Doctor, legislator]",,[African National Congress],,,66.000000
65154,Zyon Braun,Zyon Braun,1994-08-17,,,,[FOM University of Applied Sciences for Econom...,,,,...,,,,[Leader of the Free Democratic Party in Brande...,"[Politician, Banker]",,[Free Democratic Party],,,27.000000
65155,Zuzana ZvolenskÃ¡,,1972-01-27,,,,[Comenius University],,,,...,,,,[Minister of Health],[Politician],,[Direction-Social Democracy],,,50.000000
65156,Zyta Gilowska,,1949-07-07,2016-04-05,,,"[Warsaw University, Maria Curie-Sklodowska Uni...",,,,...,,,,"[Minister of Finance, Deputy Prime Minister]",,,"[Freedom Union, Civic Platform]",,,66.791781


In [110]:
schools_df = df_full[['name', 'schools']]
schools_df

Unnamed: 0,name,schools
0,Michael Jackson,
1,Ãlvaro ElÃ­as Loredo,[Autonomous University of San Luis PotosÃ­]
2,Charles,"[Gordonstoun, University of Cambridge]"
3,Kat Timpf,"[Hillsdale College, BA]"
4,Elon Musk,[University of Pennsylvania]
...,...,...
65153,Zweli Mkhize,"[University of Natal, M.B.]"
65154,Zyon Braun,[FOM University of Applied Sciences for Econom...
65155,Zuzana ZvolenskÃ¡,[Comenius University]
65156,Zyta Gilowska,"[Warsaw University, Maria Curie-Sklodowska Uni..."


In [111]:
pared_schools_df = schools_df.dropna(how='any')
pared_schools_df

Unnamed: 0,name,schools
1,Ãlvaro ElÃ­as Loredo,[Autonomous University of San Luis PotosÃ­]
2,Charles,"[Gordonstoun, University of Cambridge]"
3,Kat Timpf,"[Hillsdale College, BA]"
4,Elon Musk,[University of Pennsylvania]
5,Timothy Olyphant,"[University of Southern California, BFA]"
...,...,...
65153,Zweli Mkhize,"[University of Natal, M.B.]"
65154,Zyon Braun,[FOM University of Applied Sciences for Econom...
65155,Zuzana ZvolenskÃ¡,[Comenius University]
65156,Zyta Gilowska,"[Warsaw University, Maria Curie-Sklodowska Uni..."


In [114]:
exploded_schools_df = pared_schools_df.explode(column='schools')
exploded_schools_df

Unnamed: 0,name,schools
1,Ãlvaro ElÃ­as Loredo,Autonomous University of San Luis PotosÃ­
2,Charles,Gordonstoun
2,Charles,University of Cambridge
3,Kat Timpf,Hillsdale College
3,Kat Timpf,BA
...,...,...
65155,Zuzana ZvolenskÃ¡,Comenius University
65156,Zyta Gilowska,Warsaw University
65156,Zyta Gilowska,Maria Curie-Sklodowska University
65157,Zvezdelina Entcheva Stankova,Bryn Mawr College


In [115]:
exploded_schools_df['attended'] = 'attended'
exploded_schools_df

Unnamed: 0,name,schools,attended
1,Ãlvaro ElÃ­as Loredo,Autonomous University of San Luis PotosÃ­,attended
2,Charles,Gordonstoun,attended
2,Charles,University of Cambridge,attended
3,Kat Timpf,Hillsdale College,attended
3,Kat Timpf,BA,attended
...,...,...,...
65155,Zuzana ZvolenskÃ¡,Comenius University,attended
65156,Zyta Gilowska,Warsaw University,attended
65156,Zyta Gilowska,Maria Curie-Sklodowska University,attended
65157,Zvezdelina Entcheva Stankova,Bryn Mawr College,attended


In [117]:
exploded_schools_df.to_csv('wikicrawler/schools_attended.csv')

In [22]:
df_full.set_index('name', inplace=True)

In [23]:
df_full.to_csv('wikicrawler/wikicrawler_no_blanks.csv')

### Create CSV file for each node

### Organizations

In [51]:
df_orgs = df['organizations'].dropna()
df_orgs

261      Creative Commons,Reddit,Open Library,DeadDrop,...
2495                                        Wertheim & Co.
4282     Public Radio of Armenia , Ar Radio Intercontin...
5034                                   Austen Riggs Center
5106                                        AHA Foundation
                               ...                        
58999                               University of Michigan
59147        Gerrit W. Gong,Ulisses Soares,Robert D. Hales
60446                                                   UN
61637       American Association of Physicists in Medicine
61740                            Levin & Co, Johnston & Co
Name: organizations, Length: 77, dtype: object

In [61]:
my_orgs = df_orgs.to_list()

In [62]:
for l in my_orgs:
    print(l)

Creative Commons,Reddit,Open Library,DeadDrop,Progressive Change Campaign Committee,Demand Progress,ThoughtWorks,Tor2web
Wertheim & Co.
Public Radio of Armenia , Ar Radio Intercontinental
Austen Riggs Center
AHA Foundation
One Campaign
Creative Artists Agency
Colorado Springs Symphony Orchestra,Boston Landmarks Orchestra
Executive Council of Hong Kong
Anbang
NXIVM
Young Pioneers,[6]
Mormon Tabernacle Choir,Utah Symphony Orchestra Board
Paul and Daisy Soros Fellowships for New Americans
CAMH,Jack.org
Teneo
Lincoln's Inn Fields

,Chorgemeinschaft Neubeuern,
,KlangVerwaltung,
,BUND,
,Herrenchiemsee Festival,

Jeffrey R. Holland
Creative Armenia , Avalanche Entertainment
University of Oxford
Smokehouse Pictures,Casamigos
Rudger Clawson,[1]
Baptist State Convention of North Carolina,Baylor University
TerraMar Project
D. Todd Christofferson
Americans for Tax Reform
Women's Social and Political Union
Metropolitan Steamship Company
Henry B. Eyring
Black Panther Party
International Monetary Fun

In [63]:
orgs_list = []

for l in my_orgs:
    my_orgs = l.split(',')
    for org in my_orgs:
        if org not in orgs_list:
            orgs_list.append(org.strip())

In [70]:
all_orgs = [x for x in orgs_list if x is not '']
all_orgs

['Creative Commons',
 'Reddit',
 'Open Library',
 'DeadDrop',
 'Progressive Change Campaign Committee',
 'Demand Progress',
 'ThoughtWorks',
 'Tor2web',
 'Wertheim & Co.',
 'Public Radio of Armenia',
 'Ar Radio Intercontinental',
 'Austen Riggs Center',
 'AHA Foundation',
 'One Campaign',
 'Creative Artists Agency',
 'Colorado Springs Symphony Orchestra',
 'Boston Landmarks Orchestra',
 'Executive Council of Hong Kong',
 'Anbang',
 'NXIVM',
 'Young Pioneers',
 '[6]',
 'Mormon Tabernacle Choir',
 'Utah Symphony Orchestra Board',
 'Paul and Daisy Soros Fellowships for New Americans',
 'CAMH',
 'Jack.org',
 'Teneo',
 "Lincoln's Inn Fields",
 'Chorgemeinschaft Neubeuern',
 'KlangVerwaltung',
 'BUND',
 'Herrenchiemsee Festival',
 'Jeffrey R. Holland',
 'Creative Armenia',
 'Avalanche Entertainment',
 'University of Oxford',
 'Smokehouse Pictures',
 'Casamigos',
 'Rudger Clawson',
 '[1]',
 'Baptist State Convention of North Carolina',
 'Baylor University',
 'TerraMar Project',
 'D. Todd Chri

In [71]:
len(all_orgs)

113

In [77]:
orgs_df = pd.DataFrame({'organization': all_orgs})
orgs_df.set_index('organization', inplace=True)
orgs_df.to_csv('wikicrawler/organizations.csv')

In [79]:
schools_df = df['schools'].dropna()
schools_df

1                Autonomous University of San Luis PotosÃ­
2                      Gordonstoun,University of Cambridge
3                                     Hillsdale College,BA
4                               University of Pennsylvania
5                    University of Southern California,BFA
                               ...                        
65153                             University of Natal,M.B.
65154    FOM University of Applied Sciences for Economi...
65155                                  Comenius University
65156    Warsaw University,Maria Curie-Sklodowska Unive...
65157                 Bryn Mawr College,Harvard University
Name: schools, Length: 37435, dtype: object

In [96]:
all_schools = schools_df.copy().to_list()
schools_set = set(all_schools)

In [97]:
len(schools_set)

18911

In [98]:
schools_list = list(schools_set)
len(schools_list)

18911

In [100]:
schools_df = pd.DataFrame({'school': schools_list})
schools_df.set_index('school', inplace=True)
schools_df.to_csv('wikicrawler/schools.csv')

In [87]:
# Drop all rows with no born/died data
df_all_names = df_full.dropna(subset=['born', 'died'])
df_all_names                  

Unnamed: 0,name,full_name,born,died,citizenship,known_for,schools,degrees,organizations,institutions,...,house,title,doctoral_advisor,fields,positions,occupation,employer,political_party,board_member,labels
8,Ãlvaro GÃ³mez Hurtado,,1919-05-08,"November 2, 1995",,,Pontifical Xavierian University,,,,...,,,,,"15th Colombia Ambassador to France,Colombia Am...","Journalist,politician",,Conservative,,
10,Ãlvaro Lloreda Caicedo,,1903-10-15,10 April 1985,,,,,,,...,,,,,"Colombia Ambassador to Spain,Member of the Sen...",,,Conservative,,
17,Ãlvaro ObregÃ³n,Ãlvaro ObregÃ³n Salido,1880-02-19,17 July 1928,,,,,,,...,,,,,"46th President of Mexico,President of the Mexi...",,,Laborist Party,,
19,Michael Jackson,Michael Joseph Jackson,1958-08-29,"June 25, 2009",,,,,,,...,Jackson,,,,,"Singer,songwriter,dancer,record producer",,,,"Steeltown,Motown,Epic,Legacy,Sony,MJJ Productions"
25,Ãsgeir Ãsgeirsson,,13 May 1894,15 September 1972,,,,,,,...,,,,,"2nd President of Iceland,6th Prime Minister of...",,,"Progressive Party,Social Democratic Party",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65147,Zviad Gamsakhurdia,,1939-03-31,31 December 1993,,,,,,,...,,,,,"1st President of Georgia,Chairman of the Supre...",,,Round TableâFree Georgia,,
65148,Zvonimir Äervenko,,1926-11-13,17 February 2001,,,,,,,...,,,,,,,,,,
65150,Zvi Griliches,,1930-09-12,4 November 1999,,,"University of Chicago,UCÂ Berkeley",,,,...,,,,Economics,,,,,,
65152,Zyta Gilowska,,1949-07-07,5 April 2016,,,"Warsaw University,Maria Curie-Sklodowska Unive...",,,,...,,,,,"Minister of Finance,Deputy Prime Minister",,,"Freedom Union,Civic Platform",,


In [88]:
df_all_names.to_csv('wikicrawler/wikicrawler_full.csv')

In [89]:
df_all_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35746 entries, 8 to 65155
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              35746 non-null  object
 1   full_name         11569 non-null  object
 2   born              35746 non-null  object
 3   died              35746 non-null  object
 4   citizenship       762 non-null    object
 5   known_for         2677 non-null   object
 6   schools           16706 non-null  object
 7   degrees           4371 non-null   object
 8   organizations     44 non-null     object
 9   institutions      2272 non-null   object
 10  spouses           5540 non-null   object
 11  offspring         4631 non-null   object
 12  parents           4298 non-null   object
 13  relatives         4095 non-null   object
 14  house             1431 non-null   object
 15  title             152 non-null    object
 16  doctoral_advisor  1906 non-null   object
 17  fields      

In [47]:
df_no_gaps = df_all_names.drop_duplicates()
df_no_gaps

Unnamed: 0,name,full_name,born,died,known_for,schools,degrees,spouses,offspring,parents,relatives,title,positions,occupation,employer,political_party,board_member_of,labels
0,Ãlvaro ObregÃ³n,Ãlvaro ObregÃ³n Salido,1880-02-19,17 July 1928,,,,MarÃ­a Tapia (1888-1971),,,,,"46th President of Mexico,President of the Mexi...",,,,,
8,Ãlvaro Lloreda Caicedo,,1903-10-15,10 April 1985,,,,Mercedes Caicedo Ortiz,Rodrigo Hern%C3%A1n Lloreda Caicedo,,,,"Colombia Ambassador to Spain,Member of the Sen...",,,,,
11,Ãlvaro GÃ³mez Hurtado,,1919-05-08,"November 2, 1995",,Pontifical Xavierian University,,Margarita Escobar LÃ³pez (1946-1995),"Mauricio GÃ³mez Escobar,Mercedes GÃ³mez Escoba...",,Laureano GÃ³mez,,"15th Colombia Ambassador to France,Colombia Am...","Journalist,politician",,,,
28,Ãlcio Ãlvares,,1932-09-28,9 December 2016,,,,Irene Ãlvares,,,,,"Minister of Defence of Brazil,Minister of Deve...",,,,,
31,The Baron of Carondelet,ÃÃ±igo Cavero Lataillade,1929-08-01,25 December 2002,,"University of Deusto,Complutense University of...",,,,,,,"President of the Spanish Council of State,Mini...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52488,Zurab Zhvania,,9 December 1963,3 February 2005,,Tbilisi State University,,,,,,,"4th Prime Minister of Georgia,State Minister o...",,,,,
52491,Zvi Griliches,,1930-09-12,4 November 1999,,"University of Chicago,UCÂ Berkeley",,,,,,,,,,,,
52492,Zvonimir Äervenko,,1926-11-13,17 February 2001,,,,,,,,,,,,,,
52494,Zygmunt Janiszewski,,1888-07-12,3 January 1920,,University of Paris,,,,,,,,,,,,


In [48]:
df_no_gaps.to_csv('wikicrawler/wikicrawler_born_died.csv')

In [49]:
df_degrees = df_no_gaps.dropna(subset=['schools', 'degrees'])

In [50]:
# Rearrange columns and sort on name
df_degrees = df_degrees[['name', 'full_name', 'born', 'occupation', 'schools', 'degrees', 'spouses', 'offspring']]
df_degrees.sort_values(by='name', ignore_index=True, inplace=True)
df_degrees

Unnamed: 0,name,full_name,born,occupation,schools,degrees,spouses,offspring
0,A. Andrew Hauk,Aloysius Andrew Hauk,1912-12-29,,"Regis University,Columbus School of Law,Yale L...","A.B.,J.S.D.,LL.B.",,
1,A. Bruce Bielaski,,1883-04-02,,George Washington University,LLB,,
2,A. J. McNamara,Abel John McNamara,1936-06-09,,"Louisiana State University,Loyola University N...","J.D.,B.S.",,
3,A. Mitchell Palmer,Alexander Mitchell Palmer,1872-05-04,,Swarthmore College,BA,Roberta Dixon,
4,A. P. Jayasuriya,Alexander Perera Jayasuriya,1901-11-01,Advocate,"Sri Sumangala College,Panadura,Royal College, ...",Panadura,,
...,...,...,...,...,...,...,...,...
3478,Zakir Husain,,1897-02-08,,"Mohammedan Anglo-Oriental College Aligarh,Univ...","PhD,MA",Shah Jahan Begum,2
3479,Zales Nelson Ecton,,1898-04-01,,"Montana State University,University of Chicago","BS,LLB",Vera Harris,2
3480,Zhang Wentian,,1900-08-30,,"University of California, Berkeley,Moscow Sun ...",Moscow Sun Yat-sen University,,
3481,Ä°hsan Sabri ÃaÄlayangil,,1908,,"Ä°stanbul University,Istanbul High School",Istanbul High School,,


In [51]:
df_degrees.set_index('name', drop=True, inplace=True)
df_degrees

Unnamed: 0_level_0,full_name,born,occupation,schools,degrees,spouses,offspring
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A. Andrew Hauk,Aloysius Andrew Hauk,1912-12-29,,"Regis University,Columbus School of Law,Yale L...","A.B.,J.S.D.,LL.B.",,
A. Bruce Bielaski,,1883-04-02,,George Washington University,LLB,,
A. J. McNamara,Abel John McNamara,1936-06-09,,"Louisiana State University,Loyola University N...","J.D.,B.S.",,
A. Mitchell Palmer,Alexander Mitchell Palmer,1872-05-04,,Swarthmore College,BA,Roberta Dixon,
A. P. Jayasuriya,Alexander Perera Jayasuriya,1901-11-01,Advocate,"Sri Sumangala College,Panadura,Royal College, ...",Panadura,,
...,...,...,...,...,...,...,...
Zakir Husain,,1897-02-08,,"Mohammedan Anglo-Oriental College Aligarh,Univ...","PhD,MA",Shah Jahan Begum,2
Zales Nelson Ecton,,1898-04-01,,"Montana State University,University of Chicago","BS,LLB",Vera Harris,2
Zhang Wentian,,1900-08-30,,"University of California, Berkeley,Moscow Sun ...",Moscow Sun Yat-sen University,,
Ä°hsan Sabri ÃaÄlayangil,,1908,,"Ä°stanbul University,Istanbul High School",Istanbul High School,,


In [52]:
# Keep only complete records
df_all_fields = df.dropna(how='any')

In [53]:
def date_converter(x):
    if x:
        if '-' in x:
            updated = dt.strptime(str(x), '%Y-%m-%d %H:%M:%S')
        elif '/' in x:
            updated = dt.strptime(str(x), '%m/%d/%Y %H:%M:%S')
        else:
            return None
        print(updated)
        return updated.strftme('%Y-%m-%d')        
    else:
        return None

In [54]:
# df_degrees['DOB'] = np.vectorize(date_converter)(df_degrees['DOB'])
df_degrees['born'] = pd.to_datetime(df_degrees.born)
df_degrees['born'] = df_degrees['born'].dt.strftime('%Y-%m-%d')
df_degrees

ParserError: String does not contain a date: , 

In [None]:
df_degrees.to_csv('wikicrawler/people_data_full.csv')

## Rhodes Scholars

In [None]:
rhodes_df = pd.read_csv('rhodescholars/rhodes_data.csv')
rhodes_df

In [None]:
df_occupations = rhodes_df.merge(df_no_gaps, on='name', how='outer')
df_occupations

In [414]:
df_occupations.to_csv('wikicrawler/merged_occupations.csv')