In [64]:
import os
import pandas as pd
from csv import DictReader, writer

In [88]:
def people_pruner(my_file):
    """Read a one-column CSV of people and drop repeated entries.

    Only the first column of each row is considered. The first occurrence
    of every value is kept, in file order; each later occurrence is
    reported on stdout as ``Duplicate person: <value>``.

    Args:
        my_file: Path of the CSV file to read. The file is expected to have
            no header row (the first line is treated as data).

    Returns:
        A list of single-element lists (``[[person], ...]``), i.e. rows that
        can be handed directly to ``csv.writer.writerow``.
    """
    seen = set()   # O(1) membership test instead of scanning the result list
    my_list = []

    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(my_file, 'r', newline='') as f:
        # fieldnames must be a *sequence of names*. Passing the bare string
        # 'person' made DictReader split it into the six one-letter fields
        # 'p', 'e', 'r', 's', 'o', 'n', which only worked by accident.
        reader = DictReader(f, fieldnames=['person'])

        for row in reader:
            person = row['person']
            if person in seen:
                print(f"Duplicate person: {person}")
                continue
            seen.add(person)
            my_list.append([person])

    return my_list

In [93]:
def csv_writer(my_file, my_list):
    """Write rows to a CSV file, replacing any existing file.

    Args:
        my_file: Path of the CSV file to create or overwrite.
        my_list: Iterable of rows; each row is an iterable of field values
            (for example the ``[[person], ...]`` list built by
            ``people_pruner``).
    """
    # newline='' lets the csv module control line endings itself.
    with open(my_file, 'w', newline='') as outfile:
        # The file imports the function as `from csv import writer`; the
        # module name `csv` is never bound, so `csv.writer(...)` raised
        # NameError on a fresh kernel. Use the imported name, and bind the
        # result to a different local so `writer` is not shadowed.
        csv_out = writer(outfile)
        csv_out.writerows(my_list)

In [115]:
# Load the crawled people file, keeping the first occurrence of each entry.
# people_pruner prints one "Duplicate person: ..." line per repeated entry.
people_list = people_pruner('wikicrawler/people_temp.csv')

Duplicate person: Sophie_Gr%C3%A9goire_Trudeau
Duplicate person: Lynne_Cheney
Duplicate person: Marilyn_Quayle
Duplicate person: Susan_Weber_(historian)
Duplicate person: Nancy_Kissinger
Duplicate person: Hillary_Rodham
Duplicate person: Doug_Emhoff
Duplicate person: Priscilla_Chan
Duplicate person: MacKenzie_Scott
Duplicate person: Ashley_Estes_Kavanaugh
Duplicate person: Ginni_Thomas
Duplicate person: George_Akerlof
Duplicate person: Andrea_Mitchell
Duplicate person: Edward_Sugden_(methodist)
Duplicate person: Gertrude_Schoepperle
Duplicate person: Laura_Hibbard_Loomis
Duplicate person: Mary_Herring
Duplicate person: Edna_Manley
Duplicate person: Ida_Gordon
Duplicate person: Editha_Olga_Bailey
Duplicate person: Evelyn_Lett
Duplicate person: Virginia_Durr
Duplicate person: Fay_Wray
Duplicate person: Norah_Michener
Duplicate person: index.php?title=Helen_Hill_Miller&action=edit&redlink=1
Duplicate person: Marian_Dale_Scott
Duplicate person: Harriet_Mayor_Fulbright
Duplicate person: Cla

In [116]:
# Number of distinct people left after pruning.
len(people_list)

1791

In [117]:
# Persist the pruned list, one person per row.
csv_writer('wikicrawler/people.csv', people_list)

## Data Cleaning

In [130]:
# Load the scraped per-person records (schools, degrees, name, full_name,
# DOB, spouses, offspring) and display them.
df = pd.read_csv('wikicrawler/people_data_temp.csv')
df

Unnamed: 0,schools,degrees,name,full_name,DOB,spouses,offspring
0,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,Lynne Cheney,"Liz Cheney,Mary Cheney"
1,Harvard University,"MA,BA",Henry Kissinger,Heinz Alfred Kissinger,1923-05-27 00:00:00,Nancy Kissinger,2
2,"Dartmouth College,University College, Oxford,Y...","MA,JD,BA",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,Clare Dalton,Sam Reich
3,"University of Denver,University of Notre Dame","MA,PhD,BA",Condoleezza Rice,,1954-11-14 00:00:00,,
4,"DePauw University,Indiana University, Indianap...","JD,BA",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,Marilyn Quayle,Ben Quayle
...,...,...,...,...,...,...,...
1139,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,
1140,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,Constance Lane Rogers,Theodore Roosevelt VI
1141,Harvard University,"LLB,BA",Charles Francis Adams III,,1866-08-02 00:00:00,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1142,"Yale University,Harvard University","JD,BA",William Taft,,1945-09-13 00:00:00,Julia V. Taft,3


In [131]:
# Column dtypes and non-null counts: shows how sparse each field is.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144 entries, 0 to 1143
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   schools    746 non-null    object
 1   degrees    320 non-null    object
 2   name       1077 non-null   object
 3   full_name  580 non-null    object
 4   DOB        1077 non-null   object
 5   spouses    660 non-null    object
 6   offspring  564 non-null    object
dtypes: object(7)
memory usage: 62.7+ KB


In [132]:
# count / unique / top / freq per column (every column is object dtype).
df.describe()

Unnamed: 0,schools,degrees,name,full_name,DOB,spouses,offspring
count,746,320,1077,580,1077,660,564
unique,611,137,1030,546,1023,602,279
top,Harvard University,BA,Jacqueline Kennedy Onassis,Leslie Lynch King Jr.,1874-08-10 00:00:00,\n,2
freq,28,43,3,3,3,5,85


In [133]:
# Drop rows in which every column is missing; rows with at least one value
# are kept.
df_full = df.dropna(how='all')
df_full

Unnamed: 0,schools,degrees,name,full_name,DOB,spouses,offspring
0,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,Lynne Cheney,"Liz Cheney,Mary Cheney"
1,Harvard University,"MA,BA",Henry Kissinger,Heinz Alfred Kissinger,1923-05-27 00:00:00,Nancy Kissinger,2
2,"Dartmouth College,University College, Oxford,Y...","MA,JD,BA",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,Clare Dalton,Sam Reich
3,"University of Denver,University of Notre Dame","MA,PhD,BA",Condoleezza Rice,,1954-11-14 00:00:00,,
4,"DePauw University,Indiana University, Indianap...","JD,BA",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,Marilyn Quayle,Ben Quayle
...,...,...,...,...,...,...,...
1139,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,
1140,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,Constance Lane Rogers,Theodore Roosevelt VI
1141,Harvard University,"LLB,BA",Charles Francis Adams III,,1866-08-02 00:00:00,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1142,"Yale University,Harvard University","JD,BA",William Taft,,1945-09-13 00:00:00,Julia V. Taft,3


In [139]:
# Drop all rows without a name
df_all_names = df_full.dropna(subset=['name'])
df_all_names                  

Unnamed: 0,schools,degrees,name,full_name,DOB,spouses,offspring
0,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,Lynne Cheney,"Liz Cheney,Mary Cheney"
1,Harvard University,"MA,BA",Henry Kissinger,Heinz Alfred Kissinger,1923-05-27 00:00:00,Nancy Kissinger,2
2,"Dartmouth College,University College, Oxford,Y...","MA,JD,BA",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,Clare Dalton,Sam Reich
3,"University of Denver,University of Notre Dame","MA,PhD,BA",Condoleezza Rice,,1954-11-14 00:00:00,,
4,"DePauw University,Indiana University, Indianap...","JD,BA",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,Marilyn Quayle,Ben Quayle
...,...,...,...,...,...,...,...
1139,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,
1140,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,Constance Lane Rogers,Theodore Roosevelt VI
1141,Harvard University,"LLB,BA",Charles Francis Adams III,,1866-08-02 00:00:00,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1142,"Yale University,Harvard University","JD,BA",William Taft,,1945-09-13 00:00:00,Julia V. Taft,3


In [140]:
# Re-check non-null counts now that every remaining row has a name.
df_all_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1037 entries, 0 to 1143
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   schools    718 non-null    object
 1   degrees    307 non-null    object
 2   name       1037 non-null   object
 3   full_name  548 non-null    object
 4   DOB        990 non-null    object
 5   spouses    622 non-null    object
 6   offspring  527 non-null    object
dtypes: object(7)
memory usage: 64.8+ KB


In [142]:
# Compare `count` with `unique` for `name` to see whether repeated people remain.
df_all_names.describe()

Unnamed: 0,schools,degrees,name,full_name,DOB,spouses,offspring
count,718,307,1037,548,990,622,527
unique,605,137,1030,546,977,601,279
top,Harvard University,BA,Elliott Roosevelt,Anna Eleanor Roosevelt,1946-08-13 00:00:00,\n,2
freq,25,38,2,2,2,5,83


In [143]:
# Drop rows that are exact duplicates across *all* columns.
# NOTE(review): the describe() output for df_all_names reports 1030 unique
# names in 1037 rows, so rows sharing a name but differing in another column
# would survive this step — confirm whether drop_duplicates(subset=['name'])
# is what is actually wanted.
df_final = df_all_names.drop_duplicates()
df_final

Unnamed: 0,schools,degrees,name,full_name,DOB,spouses,offspring
0,"Yale University,University of Wyoming,Universi...","MA,BA",Dick Cheney,Richard Bruce Cheney,1941-01-30 00:00:00,Lynne Cheney,"Liz Cheney,Mary Cheney"
1,Harvard University,"MA,BA",Henry Kissinger,Heinz Alfred Kissinger,1923-05-27 00:00:00,Nancy Kissinger,2
2,"Dartmouth College,University College, Oxford,Y...","MA,JD,BA",Robert Reich,Robert Bernard Reich,1946-06-24 00:00:00,Clare Dalton,Sam Reich
3,"University of Denver,University of Notre Dame","MA,PhD,BA",Condoleezza Rice,,1954-11-14 00:00:00,,
4,"DePauw University,Indiana University, Indianap...","JD,BA",Dan Quayle,James Danforth Quayle,1947-02-04 00:00:00,Marilyn Quayle,Ben Quayle
...,...,...,...,...,...,...,...
1139,"Harvard University,Yale University","JD,AB",Kermit Roosevelt III,,1971-07-14 00:00:00,,
1140,"Harvard University,BA,MBA",,Theodore Roosevelt V,Theodore Roosevelt V,1942-11-27 00:00:00,Constance Lane Rogers,Theodore Roosevelt VI
1141,Harvard University,"LLB,BA",Charles Francis Adams III,,1866-08-02 00:00:00,index.php?title=Frances Adams&action=edit&redl...,Charles Francis Adams IV
1142,"Yale University,Harvard University","JD,BA",William Taft,,1945-09-13 00:00:00,Julia V. Taft,3


In [144]:
# Write the cleaned table to disk.
# NOTE(review): to_csv writes the index as an unnamed first column by
# default, and the index here is non-contiguous after the row drops (it
# still runs 0..1143). The input file has no such column — confirm whether
# index=False is intended before changing, since downstream readers may
# depend on the current layout.
df_final.to_csv('wikicrawler/people_data.csv')