# Scrape USDA's FoodData Central Data

In this notebook we will create a pipeline for pulling data from the USDA's website about food.

## Goal of this notebook

1. Pull data from the USDA food dataset:
    - https://fdc.nal.usda.gov/download-datasets.html
2. Extract zip files.
3. Format into folders

In [10]:
import pandas as pd
import numpy as np

import requests
import re
import urllib.request

from glob import glob
import shutil
import os

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import logging

plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 500)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


In [11]:
def get_links(base_url = 'https://fdc.nal.usda.gov/',
              site = 'https://fdc.nal.usda.gov/download-datasets.html'):
    r  = requests.get(site)
    data = r.text
    soup = BeautifulSoup(data)

    links = []
    for link in soup.find_all('a'):
        links.append(link.get('href'))

    zips = [c for c in links if str(c).endswith('.zip')]
    zips = [*set(zips)]
    zips = [base_url + a for a in zips]
    return zips

def download_zips(player_zips, zip_dir='./'):
    logging.info('======== Starting Download =======')
    for z in tqdm(player_zips):
        logging.info(f'Downloading {z}')
        out = z.split('/')[-1]
        urllib.request.urlretrieve(z, f"{zip_dir}/{out}")

def unzip_files(zip_dir='./', delete_zip=True):
    logging.info('======== Starting Unzip =======')
    for fn in tqdm(glob(f'{zip_dir}/*.zip')):
        logging.info(f'Unzipping {fn}')
        out_fn = fn.split('.')[1].replace('/','')
        shutil.unpack_archive(fn, out_fn)
        if delete_zip:
            os.remove(fn)

In [12]:
DEBUG = False

zips = get_links()
# Subset zips just to the main data
zips = [z for z in zips if 'FoodData_Central_csv_' in z]
if DEBUG:
    zips = zips[:2]
download_zips(zips)
unzip_files()



  0%|          | 0/8 [00:00<?, ?it/s]

INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2020-10-30.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2021-10-28.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2021-04-28.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2022-10-28.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2022-04-28.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2020-04-29.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_%202019-04-02.zip
INFO:Downloading https://fdc.nal.usda.gov//fdc-datasets/FoodData_Central_csv_2019-12-17.zip


  0%|          | 0/8 [00:00<?, ?it/s]

INFO:Unzipping ./FoodData_Central_csv_2022-10-28.zip
INFO:Unzipping ./FoodData_Central_csv_2021-04-28.zip
INFO:Unzipping ./FoodData_Central_csv_%202019-04-02.zip
INFO:Unzipping ./FoodData_Central_csv_2022-04-28.zip
INFO:Unzipping ./FoodData_Central_csv_2019-12-17.zip
INFO:Unzipping ./FoodData_Central_csv_2020-04-29.zip
INFO:Unzipping ./FoodData_Central_csv_2021-10-28.zip
INFO:Unzipping ./FoodData_Central_csv_2020-10-30.zip


In [13]:
!ls -GFlash

total 120K
4.0K drwxrwxr-x 10 robmulla 4.0K Feb  2 22:29 ./
4.0K drwxrwxr-x 57 robmulla 4.0K Feb  2 21:37 ../
 60K -rw-rw-r--  1 robmulla  58K Jun 30  2022 eda-2022-data.ipynb
4.0K drwxrwxr-x  2 robmulla 4.0K Feb  2 22:29 FoodData_Central_csv_2019-12-17/
4.0K drwxrwxr-x  3 robmulla 4.0K Feb  2 22:29 FoodData_Central_csv_2020-04-29/
4.0K drwxrwxr-x  3 robmulla 4.0K Feb  2 22:29 FoodData_Central_csv_2020-10-30/
4.0K drwxrwxr-x  2 robmulla 4.0K Feb  2 22:29 FoodData_Central_csv_%202019-04-02/
4.0K drwxrwxr-x  3 robmulla 4.0K Feb  2 22:28 FoodData_Central_csv_2021-04-28/
4.0K drwxrwxr-x  3 robmulla 4.0K Feb  2 22:29 FoodData_Central_csv_2021-10-28/
4.0K drwxrwxr-x  3 robmulla 4.0K Feb  2 22:29 FoodData_Central_csv_2022-04-28/
4.0K drwxrwxr-x  2 robmulla 4.0K Feb  2 22:28 FoodData_Central_csv_2022-10-28/
8.0K -rw-rw-r--  1 robmulla 5.6K Feb  2 22:27 fooddata-central-data-scrape.ipynb
 12K -rw-rw-r--  1 robmulla  11K Feb  2 21:22 usda-food-data-pull.ipynb
