# White House visitor logs

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt
import glob

### Read the page to find links to the raw data

In [3]:
r = requests.get("https://www.whitehouse.gov/disclosures/visitor-logs/")
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
links = [a.get("href") for a in soup.find_all("a", href=re.compile(".csv"))]

In [5]:
for link in links:
    !wget -P 'data/raw/csv' {link} --quiet -N

In [6]:
path = "data/raw/csv/"
all_files = glob.glob(path + "*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

src = pd.concat(li, axis=0, ignore_index=True)

### Clean up the columns

In [7]:
src.columns = src.columns.str.lower()

In [8]:
src = src[src.columns.drop(list(src.filter(regex="unnamed")))]

In [19]:
src = src.sort_values("releasedate", ascending=False)

In [20]:
src.head()

Unnamed: 0,namelast,namefirst,namemid,uin,bdgnbr,access_type,toa,poa,tod,pod,...,terminal_suffix,visitee_namelast,visitee_namefirst,meeting_loc,meeting_room,caller_name_last,caller_name_first,caller_room,description,releasedate
7897,FRESHWATER,MARGARET,E,U30571,,VA,NAN,NAN,NAN,NAN,...,NF,FASANO,NICOLE,WH,WEST WING,FASANO,NICOLE,,NAN,2021-12-03
6630,COMER,SIERRA,D,U29829,182219.0,VA,8/10/21 10:25,NAN,NAN,A0101,...,JB,BOTELLO,JOHN,WH,EXECUTIVE,BOTELLO,JOHN,,NAN,2021-12-03
6632,FASHOLA,KEVIN,A,U29783,177416.0,VA,8/10/21 12:47,NAN,NAN,NAN,...,JP,BRISTER,PAUL,OEOB,351,PRINT,JOSHUA,,NAN,2021-12-03
6633,FISHER,MATTHEW,D,U29786,183221.0,VA,8/10/21 13:38,NAN,NAN,NAN,...,JB,BOTELLO,JOHN,WH,EXR-STATE,BOTELLO,JOHN,,NAN,2021-12-03
6634,GOMEZFONT,RAMON,N,U29765,177928.0,VA,8/10/21 15:47,NAN,NAN,B04,...,EB,POETHIG,ERIKA,OEOB,194,BROWN,ELIZABETH,,NAN,2021-12-03


### Real dates

In [10]:
date_cols = [
    "appt_made_date",
    "appt_start_date",
    "appt_end_date",
    "appt_cancel_date",
    "releasedate",
]

In [11]:
src[date_cols] = src[date_cols].applymap(lambda x: pd.to_datetime(x))

### The dataframe has inconsitent casing. Let's make it all upper for grouping later. 

In [12]:
src[
    [
        "namelast",
        "namefirst",
        "namemid",
        "uin",
        "access_type",
        "toa",
        "poa",
        "tod",
        "pod",
        "last_updatedby",
        "post",
        "lastentrydate",
        "terminal_suffix",
        "visitee_namelast",
        "visitee_namefirst",
        "meeting_loc",
        "meeting_room",
        "caller_name_last",
        "caller_name_first",
        "description",
    ]
] = src[
    [
        "namelast",
        "namefirst",
        "namemid",
        "uin",
        "access_type",
        "toa",
        "poa",
        "tod",
        "pod",
        "last_updatedby",
        "post",
        "lastentrydate",
        "terminal_suffix",
        "visitee_namelast",
        "visitee_namefirst",
        "meeting_loc",
        "meeting_room",
        "caller_name_last",
        "caller_name_first",
        "description",
    ]
].apply(
    lambda x: x.astype(str).str.upper()
)

In [13]:
df = src.copy()

---

## Exports

In [14]:
today = dt.date.today().strftime("%m-%d-%Y")

In [15]:
df.to_csv("data/processed/log_" + today + ".csv", index=False)
df.to_csv("data/processed/log_latest.csv", index=False)
src.to_csv("data/raw/log_" + today + ".csv", index=False)