# Automate the retrieval of PDF records for companies

Basic company information can be retrieved with: GET https://api.companieshouse.gov.uk/company/{company_number}

A company's filing history can be retrieved with: GET https://api.companieshouse.gov.uk/company/{company_number}/filing-history

A specific document can be retrieved with: GET http://document-api.companieshouse.gov.uk/document/{id}/content

In [1]:
import requests
import json
import shutil
import pymongo
import time as tic

import numpy as np
import pandas as pd

## Putting together the code for querying the Companies House API

In [2]:
# URL templates for the Companies House API; fill the placeholder with str.format().
information_url = "https://api.companieshouse.gov.uk/company/{}/filing-history"    # format with CH number
# The HTTP verb is not part of the URL: the stray "GET " prefix has been removed so the
# formatted string can be passed straight to requests.get().
document_url = "http://document-api.companieshouse.gov.uk/document/{}" # format with doc id

In [3]:
# Load the API key from a local file so it is never hard-coded in the notebook.
# Only the text after the last ':' is kept, with surrounding whitespace removed.
with open("CH_api_key.txt") as key_file:
    raw_key_text = key_file.read()
    key = raw_key_text.rsplit(":", 1)[-1].strip()

In [4]:
# Fetch the filing history for company 00002404. The API key is sent as the
# basic-auth username with an empty password.
# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of as a KeyError on 'items' later.
r = requests.get(information_url.format("00002404"), auth=(key, ""), timeout=30)
r.raise_for_status()

In [40]:
# Display the decoded JSON body of the filing-history response
r.json()

{'items': [{'action_date': '2018-11-15',
   'subcategory': 'termination',
   'links': {'self': '/company/00002404/filing-history/MzIxOTQ5MTk4M2FkaXF6a2N4',
    'document_metadata': 'https://frontend-doc-api.companieshouse.gov.uk/document/vmITVZ_ZjKC-iKDcxqKsBtvmIbjwck98jLFrLC-z3as'},
   'description': 'termination-director-company-with-name-termination-date',
   'category': 'officers',
   'type': 'TM01',
   'date': '2018-11-15',
   'description_values': {'officer_name': 'Jonathan Mark Moorhouse Green',
    'termination_date': '2018-11-15'},
   'pages': 1,
   'barcode': 'X7IROPXM',
   'transaction_id': 'MzIxOTQ5MTk4M2FkaXF6a2N4'},
  {'links': {'self': '/company/00002404/filing-history/MzIxOTQ5MTk3NWFkaXF6a2N4',
    'document_metadata': 'https://frontend-doc-api.companieshouse.gov.uk/document/gPFzKFmG1b4yblMB-HXAbS_ep98CTKEW5rTCpY3wuLA'},
   'description': 'termination-director-company-with-name-termination-date',
   'subcategory': 'termination',
   'action_date': '2018-11-15',
   'descr

In [74]:
# Collect the document-metadata URL of every annual accounts ("AA") filing.
# Items without a 'type', or without a 'document_metadata' link, are skipped
# instead of raising a KeyError.
doc_ids = [
    item['links']['document_metadata']
    for item in r.json()['items']
    if item.get('type') == "AA" and 'document_metadata' in item.get('links', {})
]

doc_ids

['https://frontend-doc-api.companieshouse.gov.uk/document/SKk8jBdX8ZW9-UK1BxJB-He9xB6s6pbMRYW6gZulG5s',
 'https://frontend-doc-api.companieshouse.gov.uk/document/5akrmOTdSkP3Vm6KC8Nv4n6oG-B0ij4QAl5E-eTyafM']

In [75]:
# Download the content of the first annual-accounts document found above.
# The timeout prevents an indefinite hang; raise_for_status() ensures an HTTP error
# body is never mistaken for the document when the response is saved to disk.
aa = requests.get(doc_ids[0] + "/content", auth=(key, ""), timeout=30)
aa.raise_for_status()

In [76]:
# Write the raw response bytes to test.pdf (binary mode, so nothing is re-encoded)
with open("test.pdf", "wb") as pdf_file:
    pdf_file.write(aa.content)

## Create a list of companies for which we don't have an electronic record on file

In [6]:
# Connect to the MongoDB collection of digital records for cross-checking.
# pymongo is already imported in the notebook's first cell, so it is not re-imported here.
# NOTE(review): an index on the Companies House number is believed to exist, which
# would make the per-company lookups fast -- confirm before running the full scan.

cl = pymongo.MongoClient()          # default connection settings
db = cl['CH_records']
col = db['digital_record_scrapes']

In [11]:
# Fetch a single document from the collection to inspect its structure
test = col.find_one()

In [13]:
# Display the sample document
test

{'_id': ObjectId('5bd87f507db16a3f89d15dfc'),
 'doc_name': 'Prod224_0052_08028272_20170630.html',
 'doc_type': 'html',
 'doc_upload_date': '2018-10-30 15:57:03.613731',
 'arc_name': 'Accounts_Monthly_Data-March2018',
 'parsed': True,
 'doc_balancesheetdate': '2017-06-30',
 'doc_companieshouseregisterednumber': '08028272',
 'doc_standard_type': 'FRS-102',
 'doc_standard_date': '2014-09-01',
 'doc_standard_link': 'https://xbrl.frc.org.uk/FRS-102/2014-09-01/FRS-102-2014-09-01.xsd',
 'elements': [{'name': 'entitycurrentlegalorregisteredname',
   'value': 'Cromore Ltd',
   'unit': 'NA',
   'date': '2017-06-30'},
  {'name': 'ukcompanieshouseregisterednumber',
   'value': '08028272',
   'unit': 'NA',
   'date': '2017-06-30'},
  {'name': 'entitydormanttruefalse',
   'value': 'false',
   'unit': 'NA',
   'date': '2017-06-30'},
  {'name': 'entitytradingstatus',
   'value': '',
   'unit': 'NA',
   'date': '2017-06-30'},
  {'name': 'startdateforperiodcoveredbyreport',
   'value': '2016-07-01',
   

In [23]:
counter = 0     # number of chunks processed so far
recorded = 0    # running total of companies with no digital record

# Load the very large CSV in chunks (to fit within RAM).
# The company number is read as a string so that leading zeros survive and the
# value compares equal to the string stored in MongoDB (e.g. '08028272').
for chunk in pd.read_csv("~/data/BasicCompanyDataAsOneFile-2018-10-01.csv",
                         chunksize=1000,
                         dtype={' CompanyNumber': str}):
    print("Loaded chunk {}, containing {} records".format(counter, len(chunk)))

    # Flag every company in the chunk that has no document in the collection.
    # Only existence matters, so limit=1 lets the server stop at the first match.
    is_missing = chunk[' CompanyNumber'].map(
        lambda number: col.count_documents(
            {'doc_companieshouseregisterednumber': number}, limit=1) == 0
    )

    # Select all missing rows in one step rather than growing a DataFrame row by
    # row (DataFrame.append is quadratic and was removed in pandas 2.0).
    no_record = chunk[is_missing]
    recorded += len(no_record)

    # The first chunk always (re)creates the output file, including the header row
    if counter == 0:
        no_record.to_csv("./output/CH_no_digital_records.csv", index=False)
        print("Saved first chunk")

    # Later chunks are appended, and only when they contain missing entries.
    # An empty later chunk must never overwrite the file built up so far.
    elif len(no_record) > 0:
        no_record.to_csv("./output/CH_no_digital_records.csv",
                         mode='a', header=False, index=False)
        print("Saved a chunk")

    counter += 1

Loaded chunk 0, containing 1000 records
Saved first chunk
Loaded chunk 1, containing 1000 records
Saved a chunk
Loaded chunk 2, containing 1000 records
Saved a chunk
Loaded chunk 3, containing 1000 records
Saved a chunk
Loaded chunk 4, containing 1000 records
Saved a chunk
Loaded chunk 5, containing 1000 records
Saved a chunk
Loaded chunk 6, containing 1000 records
Saved a chunk
Loaded chunk 7, containing 1000 records
Saved a chunk
Loaded chunk 8, containing 1000 records
Saved a chunk
Loaded chunk 9, containing 1000 records
Saved a chunk
Loaded chunk 10, containing 1000 records
Saved a chunk
Loaded chunk 11, containing 1000 records
Saved a chunk
Loaded chunk 12, containing 1000 records
Saved a chunk
Loaded chunk 13, containing 1000 records
Saved a chunk
Loaded chunk 14, containing 1000 records
Saved a chunk
Loaded chunk 15, containing 1000 records
Saved a chunk
Loaded chunk 16, containing 1000 records
Saved a chunk
Loaded chunk 17, containing 1000 records
Saved a chunk
Loaded chunk 18,

Saved a chunk
Loaded chunk 149, containing 1000 records
Saved a chunk
Loaded chunk 150, containing 1000 records
Saved a chunk
Loaded chunk 151, containing 1000 records
Saved a chunk
Loaded chunk 152, containing 1000 records
Saved a chunk
Loaded chunk 153, containing 1000 records
Saved a chunk
Loaded chunk 154, containing 1000 records
Saved a chunk
Loaded chunk 155, containing 1000 records
Saved a chunk
Loaded chunk 156, containing 1000 records
Saved a chunk
Loaded chunk 157, containing 1000 records
Saved a chunk
Loaded chunk 158, containing 1000 records
Saved a chunk
Loaded chunk 159, containing 1000 records
Saved a chunk
Loaded chunk 160, containing 1000 records
Saved a chunk
Loaded chunk 161, containing 1000 records
Saved a chunk
Loaded chunk 162, containing 1000 records
Saved a chunk
Loaded chunk 163, containing 1000 records
Saved a chunk
Loaded chunk 164, containing 1000 records
Saved a chunk
Loaded chunk 165, containing 1000 records
Saved a chunk
Loaded chunk 166, containing 1000 

Saved a chunk
Loaded chunk 296, containing 1000 records
Saved a chunk
Loaded chunk 297, containing 1000 records
Saved a chunk
Loaded chunk 298, containing 1000 records
Saved a chunk
Loaded chunk 299, containing 1000 records
Saved a chunk
Loaded chunk 300, containing 1000 records
Saved a chunk
Loaded chunk 301, containing 1000 records
Saved a chunk
Loaded chunk 302, containing 1000 records
Saved a chunk
Loaded chunk 303, containing 1000 records
Saved a chunk
Loaded chunk 304, containing 1000 records
Saved a chunk
Loaded chunk 305, containing 1000 records
Saved a chunk
Loaded chunk 306, containing 1000 records
Saved a chunk
Loaded chunk 307, containing 1000 records
Saved a chunk
Loaded chunk 308, containing 1000 records
Saved a chunk
Loaded chunk 309, containing 1000 records
Saved a chunk
Loaded chunk 310, containing 1000 records
Saved a chunk
Loaded chunk 311, containing 1000 records
Saved a chunk
Loaded chunk 312, containing 1000 records
Saved a chunk
Loaded chunk 313, containing 1000 

Saved a chunk
Loaded chunk 443, containing 1000 records
Saved a chunk
Loaded chunk 444, containing 1000 records
Saved a chunk
Loaded chunk 445, containing 1000 records
Saved a chunk
Loaded chunk 446, containing 1000 records
Saved a chunk
Loaded chunk 447, containing 1000 records
Saved a chunk
Loaded chunk 448, containing 1000 records
Saved a chunk
Loaded chunk 449, containing 1000 records
Saved a chunk
Loaded chunk 450, containing 1000 records
Saved a chunk
Loaded chunk 451, containing 1000 records
Saved a chunk
Loaded chunk 452, containing 1000 records
Saved a chunk
Loaded chunk 453, containing 1000 records
Saved a chunk
Loaded chunk 454, containing 1000 records
Saved a chunk
Loaded chunk 455, containing 1000 records
Saved a chunk
Loaded chunk 456, containing 1000 records
Saved a chunk
Loaded chunk 457, containing 1000 records
Saved a chunk
Loaded chunk 458, containing 1000 records
Saved a chunk
Loaded chunk 459, containing 1000 records
Saved a chunk
Loaded chunk 460, containing 1000 

Saved a chunk
Loaded chunk 590, containing 1000 records
Saved a chunk
Loaded chunk 591, containing 1000 records
Saved a chunk
Loaded chunk 592, containing 1000 records
Saved a chunk
Loaded chunk 593, containing 1000 records
Saved a chunk
Loaded chunk 594, containing 1000 records
Saved a chunk
Loaded chunk 595, containing 1000 records
Saved a chunk
Loaded chunk 596, containing 1000 records
Saved a chunk
Loaded chunk 597, containing 1000 records
Saved a chunk
Loaded chunk 598, containing 1000 records
Saved a chunk
Loaded chunk 599, containing 1000 records
Saved a chunk
Loaded chunk 600, containing 1000 records
Saved a chunk
Loaded chunk 601, containing 1000 records
Saved a chunk
Loaded chunk 602, containing 1000 records
Saved a chunk
Loaded chunk 603, containing 1000 records
Saved a chunk
Loaded chunk 604, containing 1000 records
Saved a chunk
Loaded chunk 605, containing 1000 records
Saved a chunk
Loaded chunk 606, containing 1000 records
Saved a chunk
Loaded chunk 607, containing 1000 

Saved a chunk
Loaded chunk 737, containing 1000 records
Saved a chunk
Loaded chunk 738, containing 1000 records
Saved a chunk
Loaded chunk 739, containing 1000 records
Saved a chunk
Loaded chunk 740, containing 1000 records
Saved a chunk
Loaded chunk 741, containing 1000 records
Saved a chunk
Loaded chunk 742, containing 1000 records
Saved a chunk
Loaded chunk 743, containing 1000 records
Saved a chunk
Loaded chunk 744, containing 1000 records
Saved a chunk
Loaded chunk 745, containing 1000 records
Saved a chunk
Loaded chunk 746, containing 1000 records
Saved a chunk
Loaded chunk 747, containing 1000 records
Saved a chunk
Loaded chunk 748, containing 1000 records
Saved a chunk
Loaded chunk 749, containing 1000 records
Saved a chunk
Loaded chunk 750, containing 1000 records
Saved a chunk
Loaded chunk 751, containing 1000 records
Saved a chunk
Loaded chunk 752, containing 1000 records
Saved a chunk
Loaded chunk 753, containing 1000 records
Saved a chunk
Loaded chunk 754, containing 1000 

Saved a chunk
Loaded chunk 884, containing 1000 records
Saved a chunk
Loaded chunk 885, containing 1000 records
Saved a chunk
Loaded chunk 886, containing 1000 records
Saved a chunk
Loaded chunk 887, containing 1000 records
Saved a chunk
Loaded chunk 888, containing 1000 records
Saved a chunk
Loaded chunk 889, containing 1000 records
Saved a chunk
Loaded chunk 890, containing 1000 records
Saved a chunk
Loaded chunk 891, containing 1000 records
Saved a chunk
Loaded chunk 892, containing 1000 records
Saved a chunk
Loaded chunk 893, containing 1000 records
Saved a chunk
Loaded chunk 894, containing 1000 records
Saved a chunk
Loaded chunk 895, containing 1000 records
Saved a chunk
Loaded chunk 896, containing 1000 records
Saved a chunk
Loaded chunk 897, containing 1000 records
Saved a chunk
Loaded chunk 898, containing 1000 records
Saved a chunk
Loaded chunk 899, containing 1000 records
Saved a chunk
Loaded chunk 900, containing 1000 records
Saved a chunk
Loaded chunk 901, containing 1000 

Saved a chunk
Loaded chunk 1030, containing 1000 records
Saved a chunk
Loaded chunk 1031, containing 1000 records
Saved a chunk
Loaded chunk 1032, containing 1000 records
Saved a chunk
Loaded chunk 1033, containing 1000 records
Saved a chunk
Loaded chunk 1034, containing 1000 records
Saved a chunk
Loaded chunk 1035, containing 1000 records
Saved a chunk
Loaded chunk 1036, containing 1000 records
Saved a chunk
Loaded chunk 1037, containing 1000 records
Saved a chunk
Loaded chunk 1038, containing 1000 records
Saved a chunk
Loaded chunk 1039, containing 1000 records
Saved a chunk
Loaded chunk 1040, containing 1000 records
Saved a chunk
Loaded chunk 1041, containing 1000 records
Saved a chunk
Loaded chunk 1042, containing 1000 records
Saved a chunk
Loaded chunk 1043, containing 1000 records
Saved a chunk
Loaded chunk 1044, containing 1000 records
Saved a chunk
Loaded chunk 1045, containing 1000 records
Saved a chunk
Loaded chunk 1046, containing 1000 records
Saved a chunk
Loaded chunk 1047

Saved a chunk
Loaded chunk 1174, containing 1000 records
Saved a chunk
Loaded chunk 1175, containing 1000 records
Saved a chunk
Loaded chunk 1176, containing 1000 records
Saved a chunk
Loaded chunk 1177, containing 1000 records
Saved a chunk
Loaded chunk 1178, containing 1000 records
Saved a chunk
Loaded chunk 1179, containing 1000 records
Saved a chunk
Loaded chunk 1180, containing 1000 records
Saved a chunk
Loaded chunk 1181, containing 1000 records
Saved a chunk
Loaded chunk 1182, containing 1000 records
Saved a chunk
Loaded chunk 1183, containing 1000 records
Saved a chunk
Loaded chunk 1184, containing 1000 records
Saved a chunk
Loaded chunk 1185, containing 1000 records
Saved a chunk
Loaded chunk 1186, containing 1000 records
Saved a chunk
Loaded chunk 1187, containing 1000 records
Saved a chunk
Loaded chunk 1188, containing 1000 records
Saved a chunk
Loaded chunk 1189, containing 1000 records
Saved a chunk
Loaded chunk 1190, containing 1000 records
Saved a chunk
Loaded chunk 1191

Saved a chunk
Loaded chunk 1318, containing 1000 records
Saved a chunk
Loaded chunk 1319, containing 1000 records
Saved a chunk
Loaded chunk 1320, containing 1000 records
Saved a chunk
Loaded chunk 1321, containing 1000 records
Saved a chunk
Loaded chunk 1322, containing 1000 records
Saved a chunk
Loaded chunk 1323, containing 1000 records
Saved a chunk
Loaded chunk 1324, containing 1000 records
Saved a chunk
Loaded chunk 1325, containing 1000 records
Saved a chunk
Loaded chunk 1326, containing 1000 records
Saved a chunk
Loaded chunk 1327, containing 1000 records
Saved a chunk
Loaded chunk 1328, containing 1000 records
Saved a chunk
Loaded chunk 1329, containing 1000 records
Saved a chunk
Loaded chunk 1330, containing 1000 records
Saved a chunk
Loaded chunk 1331, containing 1000 records
Saved a chunk
Loaded chunk 1332, containing 1000 records
Saved a chunk
Loaded chunk 1333, containing 1000 records
Saved a chunk
Loaded chunk 1334, containing 1000 records
Saved a chunk
Loaded chunk 1335

Saved a chunk
Loaded chunk 1462, containing 1000 records
Saved a chunk
Loaded chunk 1463, containing 1000 records
Saved a chunk
Loaded chunk 1464, containing 1000 records
Saved a chunk
Loaded chunk 1465, containing 1000 records
Saved a chunk
Loaded chunk 1466, containing 1000 records
Saved a chunk
Loaded chunk 1467, containing 1000 records
Saved a chunk
Loaded chunk 1468, containing 1000 records
Saved a chunk
Loaded chunk 1469, containing 1000 records
Saved a chunk
Loaded chunk 1470, containing 1000 records
Saved a chunk
Loaded chunk 1471, containing 1000 records
Saved a chunk
Loaded chunk 1472, containing 1000 records
Saved a chunk
Loaded chunk 1473, containing 1000 records
Saved a chunk
Loaded chunk 1474, containing 1000 records
Saved a chunk
Loaded chunk 1475, containing 1000 records
Saved a chunk
Loaded chunk 1476, containing 1000 records
Saved a chunk
Loaded chunk 1477, containing 1000 records
Saved a chunk
Loaded chunk 1478, containing 1000 records
Saved a chunk
Loaded chunk 1479

Saved a chunk
Loaded chunk 1606, containing 1000 records
Saved a chunk
Loaded chunk 1607, containing 1000 records
Saved a chunk
Loaded chunk 1608, containing 1000 records
Saved a chunk
Loaded chunk 1609, containing 1000 records
Saved a chunk
Loaded chunk 1610, containing 1000 records
Saved a chunk
Loaded chunk 1611, containing 1000 records
Saved a chunk
Loaded chunk 1612, containing 1000 records
Saved a chunk
Loaded chunk 1613, containing 1000 records
Saved a chunk
Loaded chunk 1614, containing 1000 records
Saved a chunk
Loaded chunk 1615, containing 1000 records
Saved a chunk
Loaded chunk 1616, containing 1000 records
Saved a chunk
Loaded chunk 1617, containing 1000 records
Saved a chunk
Loaded chunk 1618, containing 1000 records
Saved a chunk
Loaded chunk 1619, containing 1000 records
Saved a chunk
Loaded chunk 1620, containing 1000 records
Saved a chunk
Loaded chunk 1621, containing 1000 records
Saved a chunk
Loaded chunk 1622, containing 1000 records
Saved a chunk
Loaded chunk 1623

Saved a chunk
Loaded chunk 1750, containing 1000 records
Saved a chunk
Loaded chunk 1751, containing 1000 records
Saved a chunk
Loaded chunk 1752, containing 1000 records
Saved a chunk
Loaded chunk 1753, containing 1000 records
Saved a chunk
Loaded chunk 1754, containing 1000 records
Saved a chunk
Loaded chunk 1755, containing 1000 records
Saved a chunk
Loaded chunk 1756, containing 1000 records
Saved a chunk
Loaded chunk 1757, containing 1000 records
Saved a chunk
Loaded chunk 1758, containing 1000 records
Saved a chunk
Loaded chunk 1759, containing 1000 records
Saved a chunk
Loaded chunk 1760, containing 1000 records
Saved a chunk
Loaded chunk 1761, containing 1000 records
Saved a chunk
Loaded chunk 1762, containing 1000 records
Saved a chunk
Loaded chunk 1763, containing 1000 records
Saved a chunk
Loaded chunk 1764, containing 1000 records
Saved a chunk
Loaded chunk 1765, containing 1000 records
Saved a chunk
Loaded chunk 1766, containing 1000 records
Saved a chunk
Loaded chunk 1767

Saved a chunk
Loaded chunk 1894, containing 1000 records
Saved a chunk
Loaded chunk 1895, containing 1000 records
Saved a chunk
Loaded chunk 1896, containing 1000 records
Saved a chunk
Loaded chunk 1897, containing 1000 records
Saved a chunk
Loaded chunk 1898, containing 1000 records
Saved a chunk
Loaded chunk 1899, containing 1000 records
Saved a chunk
Loaded chunk 1900, containing 1000 records
Saved a chunk
Loaded chunk 1901, containing 1000 records
Saved a chunk
Loaded chunk 1902, containing 1000 records
Saved a chunk
Loaded chunk 1903, containing 1000 records
Saved a chunk
Loaded chunk 1904, containing 1000 records
Saved a chunk
Loaded chunk 1905, containing 1000 records
Saved a chunk
Loaded chunk 1906, containing 1000 records
Saved a chunk
Loaded chunk 1907, containing 1000 records
Saved a chunk
Loaded chunk 1908, containing 1000 records
Saved a chunk
Loaded chunk 1909, containing 1000 records
Saved a chunk
Loaded chunk 1910, containing 1000 records
Saved a chunk
Loaded chunk 1911

Saved a chunk
Loaded chunk 2038, containing 1000 records
Saved a chunk
Loaded chunk 2039, containing 1000 records
Saved a chunk
Loaded chunk 2040, containing 1000 records
Saved a chunk
Loaded chunk 2041, containing 1000 records
Saved a chunk
Loaded chunk 2042, containing 1000 records
Saved a chunk
Loaded chunk 2043, containing 1000 records
Saved a chunk
Loaded chunk 2044, containing 1000 records
Saved a chunk
Loaded chunk 2045, containing 1000 records
Saved a chunk
Loaded chunk 2046, containing 1000 records
Saved a chunk
Loaded chunk 2047, containing 1000 records
Saved a chunk
Loaded chunk 2048, containing 1000 records
Saved a chunk
Loaded chunk 2049, containing 1000 records
Saved a chunk
Loaded chunk 2050, containing 1000 records
Saved a chunk
Loaded chunk 2051, containing 1000 records
Saved a chunk
Loaded chunk 2052, containing 1000 records
Saved a chunk
Loaded chunk 2053, containing 1000 records
Saved a chunk
Loaded chunk 2054, containing 1000 records
Saved a chunk
Loaded chunk 2055

Saved a chunk
Loaded chunk 2182, containing 1000 records
Saved a chunk
Loaded chunk 2183, containing 1000 records
Saved a chunk
Loaded chunk 2184, containing 1000 records
Saved a chunk
Loaded chunk 2185, containing 1000 records
Saved a chunk
Loaded chunk 2186, containing 1000 records
Saved a chunk
Loaded chunk 2187, containing 1000 records
Saved a chunk
Loaded chunk 2188, containing 1000 records
Saved a chunk
Loaded chunk 2189, containing 1000 records
Saved a chunk
Loaded chunk 2190, containing 1000 records
Saved a chunk
Loaded chunk 2191, containing 1000 records
Saved a chunk
Loaded chunk 2192, containing 1000 records
Saved a chunk
Loaded chunk 2193, containing 1000 records
Saved a chunk
Loaded chunk 2194, containing 1000 records
Saved a chunk
Loaded chunk 2195, containing 1000 records
Saved a chunk
Loaded chunk 2196, containing 1000 records
Saved a chunk
Loaded chunk 2197, containing 1000 records
Saved a chunk
Loaded chunk 2198, containing 1000 records
Saved a chunk
Loaded chunk 2199

Saved a chunk
Loaded chunk 2326, containing 1000 records
Saved a chunk
Loaded chunk 2327, containing 1000 records
Saved a chunk
Loaded chunk 2328, containing 1000 records
Saved a chunk
Loaded chunk 2329, containing 1000 records
Saved a chunk
Loaded chunk 2330, containing 1000 records
Saved a chunk
Loaded chunk 2331, containing 1000 records
Saved a chunk
Loaded chunk 2332, containing 1000 records
Saved a chunk
Loaded chunk 2333, containing 1000 records
Saved a chunk
Loaded chunk 2334, containing 1000 records
Saved a chunk
Loaded chunk 2335, containing 1000 records
Saved a chunk
Loaded chunk 2336, containing 1000 records
Saved a chunk
Loaded chunk 2337, containing 1000 records
Saved a chunk
Loaded chunk 2338, containing 1000 records
Saved a chunk
Loaded chunk 2339, containing 1000 records
Saved a chunk
Loaded chunk 2340, containing 1000 records
Saved a chunk
Loaded chunk 2341, containing 1000 records
Saved a chunk
Loaded chunk 2342, containing 1000 records
Saved a chunk
Loaded chunk 2343

Saved a chunk
Loaded chunk 2470, containing 1000 records
Saved a chunk
Loaded chunk 2471, containing 1000 records
Saved a chunk
Loaded chunk 2472, containing 1000 records
Saved a chunk
Loaded chunk 2473, containing 1000 records
Saved a chunk
Loaded chunk 2474, containing 1000 records
Saved a chunk
Loaded chunk 2475, containing 1000 records
Saved a chunk
Loaded chunk 2476, containing 1000 records
Saved a chunk
Loaded chunk 2477, containing 1000 records
Saved a chunk
Loaded chunk 2478, containing 1000 records
Saved a chunk
Loaded chunk 2479, containing 1000 records
Saved a chunk
Loaded chunk 2480, containing 1000 records
Saved a chunk
Loaded chunk 2481, containing 1000 records
Saved a chunk
Loaded chunk 2482, containing 1000 records
Saved a chunk
Loaded chunk 2483, containing 1000 records
Saved a chunk
Loaded chunk 2484, containing 1000 records
Saved a chunk
Loaded chunk 2485, containing 1000 records
Saved a chunk
Loaded chunk 2486, containing 1000 records
Saved a chunk
Loaded chunk 2487

Saved a chunk
Loaded chunk 2614, containing 1000 records
Saved a chunk
Loaded chunk 2615, containing 1000 records
Saved a chunk
Loaded chunk 2616, containing 1000 records
Saved a chunk
Loaded chunk 2617, containing 1000 records
Saved a chunk
Loaded chunk 2618, containing 1000 records
Saved a chunk
Loaded chunk 2619, containing 1000 records
Saved a chunk
Loaded chunk 2620, containing 1000 records
Saved a chunk
Loaded chunk 2621, containing 1000 records
Saved a chunk
Loaded chunk 2622, containing 1000 records
Saved a chunk
Loaded chunk 2623, containing 1000 records
Saved a chunk
Loaded chunk 2624, containing 1000 records
Saved a chunk
Loaded chunk 2625, containing 1000 records
Saved a chunk
Loaded chunk 2626, containing 1000 records
Saved a chunk
Loaded chunk 2627, containing 1000 records
Saved a chunk
Loaded chunk 2628, containing 1000 records
Saved a chunk
Loaded chunk 2629, containing 1000 records
Saved a chunk
Loaded chunk 2630, containing 1000 records
Saved a chunk
Loaded chunk 2631

Saved a chunk
Loaded chunk 2758, containing 1000 records
Saved a chunk
Loaded chunk 2759, containing 1000 records
Saved a chunk
Loaded chunk 2760, containing 1000 records
Saved a chunk
Loaded chunk 2761, containing 1000 records
Saved a chunk
Loaded chunk 2762, containing 1000 records
Saved a chunk
Loaded chunk 2763, containing 1000 records
Saved a chunk
Loaded chunk 2764, containing 1000 records
Saved a chunk
Loaded chunk 2765, containing 1000 records
Saved a chunk
Loaded chunk 2766, containing 1000 records
Saved a chunk
Loaded chunk 2767, containing 1000 records
Saved a chunk
Loaded chunk 2768, containing 1000 records
Saved a chunk
Loaded chunk 2769, containing 1000 records
Saved a chunk
Loaded chunk 2770, containing 1000 records
Saved a chunk
Loaded chunk 2771, containing 1000 records
Saved a chunk
Loaded chunk 2772, containing 1000 records
Saved a chunk
Loaded chunk 2773, containing 1000 records
Saved a chunk
Loaded chunk 2774, containing 1000 records
Saved a chunk
Loaded chunk 2775

Saved a chunk
Loaded chunk 2902, containing 1000 records
Saved a chunk
Loaded chunk 2903, containing 1000 records
Saved a chunk
Loaded chunk 2904, containing 1000 records
Saved a chunk
Loaded chunk 2905, containing 1000 records
Saved a chunk
Loaded chunk 2906, containing 1000 records
Saved a chunk
Loaded chunk 2907, containing 1000 records
Saved a chunk
Loaded chunk 2908, containing 1000 records
Saved a chunk
Loaded chunk 2909, containing 1000 records
Saved a chunk
Loaded chunk 2910, containing 1000 records
Saved a chunk
Loaded chunk 2911, containing 1000 records
Saved a chunk
Loaded chunk 2912, containing 1000 records
Saved a chunk
Loaded chunk 2913, containing 1000 records
Saved a chunk
Loaded chunk 2914, containing 1000 records
Saved a chunk
Loaded chunk 2915, containing 1000 records
Saved a chunk
Loaded chunk 2916, containing 1000 records
Saved a chunk
Loaded chunk 2917, containing 1000 records
Saved a chunk
Loaded chunk 2918, containing 1000 records
Saved a chunk
Loaded chunk 2919

Saved a chunk
Loaded chunk 3046, containing 1000 records
Saved a chunk
Loaded chunk 3047, containing 1000 records
Saved a chunk
Loaded chunk 3048, containing 1000 records
Saved a chunk
Loaded chunk 3049, containing 1000 records
Saved a chunk
Loaded chunk 3050, containing 1000 records
Saved a chunk
Loaded chunk 3051, containing 1000 records
Saved a chunk
Loaded chunk 3052, containing 1000 records
Saved a chunk
Loaded chunk 3053, containing 1000 records
Saved a chunk
Loaded chunk 3054, containing 1000 records
Saved a chunk
Loaded chunk 3055, containing 1000 records
Saved a chunk
Loaded chunk 3056, containing 1000 records
Saved a chunk
Loaded chunk 3057, containing 1000 records
Saved a chunk
Loaded chunk 3058, containing 1000 records
Saved a chunk
Loaded chunk 3059, containing 1000 records
Saved a chunk
Loaded chunk 3060, containing 1000 records
Saved a chunk
Loaded chunk 3061, containing 1000 records
Saved a chunk
Loaded chunk 3062, containing 1000 records
Saved a chunk
Loaded chunk 3063

Saved a chunk
Loaded chunk 3190, containing 1000 records
Saved a chunk
Loaded chunk 3191, containing 1000 records
Saved a chunk
Loaded chunk 3192, containing 1000 records
Saved a chunk
Loaded chunk 3193, containing 1000 records
Saved a chunk
Loaded chunk 3194, containing 1000 records
Saved a chunk
Loaded chunk 3195, containing 1000 records
Saved a chunk
Loaded chunk 3196, containing 1000 records
Saved a chunk
Loaded chunk 3197, containing 1000 records
Saved a chunk
Loaded chunk 3198, containing 1000 records
Saved a chunk
Loaded chunk 3199, containing 1000 records
Saved a chunk
Loaded chunk 3200, containing 1000 records
Saved a chunk
Loaded chunk 3201, containing 1000 records
Saved a chunk
Loaded chunk 3202, containing 1000 records
Saved a chunk
Loaded chunk 3203, containing 1000 records
Saved a chunk
Loaded chunk 3204, containing 1000 records
Saved a chunk
Loaded chunk 3205, containing 1000 records
Saved a chunk
Loaded chunk 3206, containing 1000 records
Saved a chunk
Loaded chunk 3207

Saved a chunk
Loaded chunk 3334, containing 1000 records
Saved a chunk
Loaded chunk 3335, containing 1000 records
Saved a chunk
Loaded chunk 3336, containing 1000 records
Saved a chunk
Loaded chunk 3337, containing 1000 records
Saved a chunk
Loaded chunk 3338, containing 1000 records
Saved a chunk
Loaded chunk 3339, containing 1000 records
Saved a chunk
Loaded chunk 3340, containing 1000 records
Saved a chunk
Loaded chunk 3341, containing 1000 records
Saved a chunk
Loaded chunk 3342, containing 1000 records
Saved a chunk
Loaded chunk 3343, containing 1000 records
Saved a chunk
Loaded chunk 3344, containing 1000 records
Saved a chunk
Loaded chunk 3345, containing 1000 records
Saved a chunk
Loaded chunk 3346, containing 1000 records
Saved a chunk
Loaded chunk 3347, containing 1000 records
Saved a chunk
Loaded chunk 3348, containing 1000 records
Saved a chunk
Loaded chunk 3349, containing 1000 records
Saved a chunk
Loaded chunk 3350, containing 1000 records
Saved a chunk
Loaded chunk 3351

Saved a chunk
Loaded chunk 3478, containing 1000 records
Saved a chunk
Loaded chunk 3479, containing 1000 records
Saved a chunk
Loaded chunk 3480, containing 1000 records
Saved a chunk
Loaded chunk 3481, containing 1000 records
Saved a chunk
Loaded chunk 3482, containing 1000 records
Saved a chunk
Loaded chunk 3483, containing 1000 records
Saved a chunk
Loaded chunk 3484, containing 1000 records
Saved a chunk
Loaded chunk 3485, containing 1000 records
Saved a chunk
Loaded chunk 3486, containing 1000 records
Saved a chunk
Loaded chunk 3487, containing 1000 records
Saved a chunk
Loaded chunk 3488, containing 1000 records
Saved a chunk
Loaded chunk 3489, containing 1000 records
Saved a chunk
Loaded chunk 3490, containing 1000 records
Saved a chunk
Loaded chunk 3491, containing 1000 records
Saved a chunk
Loaded chunk 3492, containing 1000 records
Saved a chunk
Loaded chunk 3493, containing 1000 records
Saved a chunk
Loaded chunk 3494, containing 1000 records
Saved a chunk
Loaded chunk 3495

Saved a chunk
Loaded chunk 3622, containing 1000 records
Saved a chunk
Loaded chunk 3623, containing 1000 records
Saved a chunk
Loaded chunk 3624, containing 1000 records
Saved a chunk
Loaded chunk 3625, containing 1000 records
Saved a chunk
Loaded chunk 3626, containing 1000 records
Saved a chunk
Loaded chunk 3627, containing 1000 records
Saved a chunk
Loaded chunk 3628, containing 1000 records
Saved a chunk
Loaded chunk 3629, containing 1000 records
Saved a chunk
Loaded chunk 3630, containing 1000 records
Saved a chunk
Loaded chunk 3631, containing 1000 records
Saved a chunk
Loaded chunk 3632, containing 1000 records
Saved a chunk
Loaded chunk 3633, containing 1000 records
Saved a chunk
Loaded chunk 3634, containing 1000 records
Saved a chunk
Loaded chunk 3635, containing 1000 records
Saved a chunk
Loaded chunk 3636, containing 1000 records
Saved a chunk
Loaded chunk 3637, containing 1000 records
Saved a chunk
Loaded chunk 3638, containing 1000 records
Saved a chunk
Loaded chunk 3639

Saved a chunk
Loaded chunk 3766, containing 1000 records
Saved a chunk
Loaded chunk 3767, containing 1000 records
Saved a chunk
Loaded chunk 3768, containing 1000 records
Saved a chunk
Loaded chunk 3769, containing 1000 records
Saved a chunk
Loaded chunk 3770, containing 1000 records
Saved a chunk
Loaded chunk 3771, containing 1000 records
Saved a chunk
Loaded chunk 3772, containing 1000 records
Saved a chunk
Loaded chunk 3773, containing 1000 records
Saved a chunk
Loaded chunk 3774, containing 1000 records
Saved a chunk
Loaded chunk 3775, containing 1000 records
Saved a chunk
Loaded chunk 3776, containing 1000 records
Saved a chunk
Loaded chunk 3777, containing 1000 records
Saved a chunk
Loaded chunk 3778, containing 1000 records
Saved a chunk
Loaded chunk 3779, containing 1000 records
Saved a chunk
Loaded chunk 3780, containing 1000 records
Saved a chunk
Loaded chunk 3781, containing 1000 records
Saved a chunk
Loaded chunk 3782, containing 1000 records
Saved a chunk
Loaded chunk 3783

Saved a chunk
Loaded chunk 3910, containing 1000 records
Saved a chunk
Loaded chunk 3911, containing 1000 records
Saved a chunk
Loaded chunk 3912, containing 1000 records
Saved a chunk
Loaded chunk 3913, containing 1000 records
Saved a chunk
Loaded chunk 3914, containing 1000 records
Saved a chunk
Loaded chunk 3915, containing 1000 records
Saved a chunk
Loaded chunk 3916, containing 1000 records
Saved a chunk
Loaded chunk 3917, containing 1000 records
Saved a chunk
Loaded chunk 3918, containing 1000 records
Saved a chunk
Loaded chunk 3919, containing 1000 records
Saved a chunk
Loaded chunk 3920, containing 1000 records
Saved a chunk
Loaded chunk 3921, containing 1000 records
Saved a chunk
Loaded chunk 3922, containing 1000 records
Saved a chunk
Loaded chunk 3923, containing 1000 records
Saved a chunk
Loaded chunk 3924, containing 1000 records
Saved a chunk
Loaded chunk 3925, containing 1000 records
Saved a chunk
Loaded chunk 3926, containing 1000 records
Saved a chunk
Loaded chunk 3927

Saved a chunk
Loaded chunk 4054, containing 1000 records
Saved a chunk
Loaded chunk 4055, containing 1000 records
Saved a chunk
Loaded chunk 4056, containing 1000 records
Saved a chunk
Loaded chunk 4057, containing 1000 records
Saved a chunk
Loaded chunk 4058, containing 1000 records
Saved a chunk
Loaded chunk 4059, containing 1000 records
Saved a chunk
Loaded chunk 4060, containing 1000 records
Saved a chunk
Loaded chunk 4061, containing 1000 records
Saved a chunk
Loaded chunk 4062, containing 1000 records
Saved a chunk
Loaded chunk 4063, containing 1000 records
Saved a chunk
Loaded chunk 4064, containing 1000 records
Saved a chunk
Loaded chunk 4065, containing 1000 records
Saved a chunk
Loaded chunk 4066, containing 1000 records
Saved a chunk
Loaded chunk 4067, containing 1000 records
Saved a chunk
Loaded chunk 4068, containing 1000 records
Saved a chunk
Loaded chunk 4069, containing 1000 records
Saved a chunk
Loaded chunk 4070, containing 1000 records
Saved a chunk
Loaded chunk 4071

Saved a chunk
Loaded chunk 4198, containing 1000 records
Saved a chunk
Loaded chunk 4199, containing 1000 records
Saved a chunk
Loaded chunk 4200, containing 1000 records
Saved a chunk
Loaded chunk 4201, containing 1000 records
Saved a chunk
Loaded chunk 4202, containing 1000 records
Saved a chunk
Loaded chunk 4203, containing 1000 records
Saved a chunk
Loaded chunk 4204, containing 1000 records
Saved a chunk
Loaded chunk 4205, containing 1000 records
Saved a chunk
Loaded chunk 4206, containing 1000 records
Saved a chunk
Loaded chunk 4207, containing 1000 records
Saved a chunk
Loaded chunk 4208, containing 1000 records
Saved a chunk
Loaded chunk 4209, containing 1000 records
Saved a chunk
Loaded chunk 4210, containing 1000 records
Saved a chunk
Loaded chunk 4211, containing 1000 records
Saved a chunk
Loaded chunk 4212, containing 1000 records
Saved a chunk
Loaded chunk 4213, containing 1000 records
Saved a chunk
Loaded chunk 4214, containing 1000 records
Saved a chunk
Loaded chunk 4215

In [17]:
# Index the collection on the Companies House registered number (ascending order).
# The call evaluates to the name of the index, which the notebook displays as the cell output.
col.create_index([('doc_companieshouseregisterednumber', pymongo.ASCENDING)])

'doc_companieshouseregisterednumber_1'

## Determine whether, for each file with no digital record, a paper record was submitted

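For each entry in the "doesn't have an electronic record" csv file (`./output/CH_no_digital_records.csv`), query the Companies House filing-history API to see whether it has a paper record instead, i.e. any filings of type `AA`.  For each company, record the number of such filings, their filing dates and the HTTP response code of the query.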

In [None]:
# Companies House allows 600 requests per five-minute period, i.e. on average one
# request every 0.5 s.  Pausing for less than that (e.g. 0.2 s) uses up the allowance
# early and every further request in the period is answered with HTTP 429.
REQUEST_INTERVAL_SECONDS = 0.5

# If a 429 (Too Many Requests) is received anyway, wait and retry the same company
# instead of recording the rate-limit response as if it were a result.
RATE_LIMIT_BACKOFF_SECONDS = 60
MAX_RATE_LIMIT_RETRIES = 5

SEARCHED_CSV_PATH = "./output/CH_no_digital_records_searched.csv"

counter = 0

for chunk in pd.read_csv("./output/CH_no_digital_records.csv", chunksize=100):

    # One plain dict per company; the DataFrame is built once per chunk rather than
    # grown row by row (DataFrame.append is quadratic and was removed in pandas 2.0).
    records = []

    for index, row in chunk.iterrows():

        for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
            # Pace every request, including retries, to stay inside the rate limit
            tic.sleep(REQUEST_INTERVAL_SECONDS)

            r = requests.get(information_url.format(row[' CompanyNumber']), auth=(key, ""))

            # Anything other than "rate limited" is final; so is the last allowed attempt
            if r.status_code != 429 or attempt == MAX_RATE_LIMIT_RETRIES:
                break

            tic.sleep(RATE_LIMIT_BACKOFF_SECONDS)

        record = row.to_dict()

        try:
            # Filing dates of every filing-history item of type "AA"
            doc_dates = [each['date'] for each in r.json()['items'] if each['type'] == "AA"]

            record['num_paper_records'] = len(doc_dates)
            record['paper_record_dates'] = ":".join(doc_dates)

        except (ValueError, KeyError, TypeError):
            # The body was not JSON, or did not have the expected items/type/date layout
            # (e.g. an error response).  Keep the row, with the result columns left empty;
            # the response code below shows what happened.
            doc_dates = []
            record['num_paper_records'] = None
            record['paper_record_dates'] = None

        record['response_code'] = r.status_code
        records.append(record)

    # Columns: those of the input csv, followed by the three result columns
    results = pd.DataFrame(records)

    # The first chunk (re)creates the output csv file with a header row;
    # every later chunk is appended to it without a header.
    if counter == 0:
        results.to_csv(SEARCHED_CSV_PATH, index=False)
        print("Saved first chunk")

    else:
        results.to_csv(SEARCHED_CSV_PATH, mode='a', header=False, index=False)
        print("Saved a chunk.  Reporting latest:", r.status_code, doc_dates)

    counter += 1

Saved first chunk
Saved a chunk.  Reporting latest: 200 ['2018-05-01']
Saved a chunk.  Reporting latest: 200 ['2018-08-22', '2017-07-17', '2016-08-11', '2015-09-08', '2014-05-15', '2013-05-31', '2012-06-19', '2011-04-06', '2010-08-12', '2009-06-07', '2008-03-31']
Saved a chunk.  Reporting latest: 200 ['2011-11-22', '2010-11-22', '2010-03-30', '2009-01-08', '2008-06-17']
Saved a chunk.  Reporting latest: 200 []
Saved a chunk.  Reporting latest: 200 []
Saved a chunk.  Reporting latest: 200 ['2012-01-17', '2010-12-08', '2009-11-28', '2008-10-24', '2007-12-11', '2006-12-06']
Saved a chunk.  Reporting latest: 200 []
Saved a chunk.  Reporting latest: 200 []
Saved a chunk.  Reporting latest: 200 ['2006-11-05']
Saved a chunk.  Reporting latest: 429 []
Saved a chunk.  Reporting latest: 429 []
Saved a chunk.  Reporting latest: 200 ['2018-11-04', '2018-03-21', '2016-08-10']
