In [22]:
import re

# Shared patterns, lookup tables and URL templates used by the cells below

# "As of <month word> <20xx>" -> groups: (month word, year)
date_pattern = re.compile(r'As of ([A-Za-z]{3,}) (20\d{2})', re.IGNORECASE)
# "Total Data Items: 1,234,567" -> group: the count, thousands separators included
item_count = re.compile(r'Total Data Items: ([\d,]+)')
# A YYYYMM stamp -> groups: (year, two-digit month)
data_yyyymm = re.compile(r'(20\d{2})(\d{2})')

# Lower-cased month word -> month number; three-letter forms first, then full names
# ("may" is both, so it appears once)
month_map = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
    "january": 1, "february": 2, "march": 3, "april": 4, "june": 6, "july": 7,
    "august": 8, "september": 9, "october": 10, "november": 11, "december": 12,
}

# YYYYMM stamps of data files that are skipped when the file list is built
exclude_list = [
    "202104",  # This month's data is missing # of records
    "202105",  # Same "data month" as above
    "202106",  # Same "data month" as above
    "202107",  # This month's data is missing # of records
    "202108",  # Same "data month" as above
]

# "GET /web/<14 digits>/" -> group: the 14-digit timestamp
archive_url_regex = re.compile(r'GET /web/(\d{14})/')
# Filled with that timestamp to build the archived page's URL
archive_url_template = "https://web.archive.org/web/{timestamp}/https://www.imdb.com/pressroom/stats/"

In [23]:
import pathlib

# Build the sorted list of *.txt files under ./data/, leaving out every file whose
# name contains one of the stamps in exclude_list

data_files = sorted(
    path
    for path in pathlib.Path("./data/").glob("*.txt")
    if all(excluded not in path.name for excluded in exclude_list)
)

In [24]:
import datetime

def get_file_data(file: pathlib.Path) -> tuple[tuple[int, int], tuple[int, int], int, str, datetime.datetime]:
    """Extract the snapshot metadata and the record count from one saved data file.

    Args:
        file: Path of the data file. Its name must contain a ``YYYYMM`` stamp
            (``data_yyyymm``); its text must contain a ``GET /web/<14 digits>/``
            occurrence (``archive_url_regex``), an ``As of <month> <year>`` phrase
            (``date_pattern``) and a ``Total Data Items: <count>`` phrase (``item_count``).

    Returns:
        ``((file_year, file_month), (data_year, data_month), num_records, archive_url,
        archive_date)``. The file pair comes from the file name, the data pair from the
        "As of" phrase, ``archive_url`` is ``archive_url_template`` filled with the
        14-digit timestamp, and ``archive_date`` is that timestamp parsed into a naive
        ``datetime``.

    Raises:
        ValueError: if one of the expected patterns is missing (the message names the
            file), or if the 14-digit timestamp is not a valid date/time.
        KeyError: if the month word is not a key of ``month_map``.
    """
    # Decode explicitly instead of relying on the platform default. Everything extracted
    # below is ASCII, so replacing undecodable bytes elsewhere cannot alter the result.
    file_text = file.read_text(encoding="utf-8", errors="replace")

    # Only the text ahead of the "<!DOCTYPE html>" marker is searched (the whole text when
    # the marker is absent); when there are several matches the last one is used.
    preamble = file_text.partition("<!DOCTYPE html>")[0]
    timestamps = archive_url_regex.findall(preamble)
    if not timestamps:
        raise ValueError(f"{file}: no 'GET /web/<timestamp>/' entry found before the HTML document")
    archive_original_timestamp = timestamps[-1]
    archive_date = datetime.datetime.strptime(archive_original_timestamp, "%Y%m%d%H%M%S")

    name_match = data_yyyymm.search(file.name)
    if name_match is None:
        raise ValueError(f"{file}: file name does not contain a YYYYMM stamp")
    file_year, file_month = name_match.group(1, 2)

    as_of_match = date_pattern.search(file_text)
    if as_of_match is None:
        raise ValueError(f"{file}: no 'As of <month> <year>' phrase found")
    data_month, data_year = as_of_match.group(1, 2)

    count_match = item_count.search(file_text)
    if count_match is None:
        raise ValueError(f"{file}: no 'Total Data Items: <count>' phrase found")
    num_records = count_match.group(1)

    return (
        (int(file_year), int(file_month)),
        (int(data_year), month_map[data_month.lower()]),
        int(num_records.replace(",", "")),  # drop thousands separators before converting
        archive_url_template.format(timestamp=archive_original_timestamp),
        archive_date,
    )

In [25]:
# Parse every data file, then index the results by their (data_year, data_month) pair.
# When several files report the same pair, the one that comes last in data_files is kept.
computed = list(map(get_file_data, data_files))
unique_data_months = dict((entry[1], entry) for entry in computed)

In [34]:
import csv
import io

# Header row of the summary table; the fields built in the loop below follow this order
heading = "Data Year/Data Month,# of Records,Archive URL,Archive Date"

rows = [heading.split(",")]
# Each entry is (file period, (data_year, data_month), num_records, archive_url, archive_date).
# The second element is unpacked as (year, month) to match what get_file_data returns, and
# entries are sorted chronologically by that pair. The first cell is "<year>/<month>" with
# no zero padding, e.g. "2019/5".
for _, (data_year, data_month), num_records, archive_url, archive_date in sorted(unique_data_months.values(), key=lambda x: x[1]):
    rows.append([f'{data_year}/{data_month}', f'{num_records:,}', archive_url, archive_date.strftime("%Y-%m-%d %H:%M:%S")])

# Render through csv.writer so fields containing commas (the formatted counts) get quoted
csv_file = io.StringIO()
csv.writer(csv_file).writerows(rows)
print(csv_file.getvalue())

Data Year/Data Month,# of Records,Archive URL,Archive Date
2019/5,"336,394,426",https://web.archive.org/web/20191122024753/https://www.imdb.com/pressroom/stats/,2019-11-22 02:47:53
2019/12,"364,275,174",https://web.archive.org/web/20200521064449/https://www.imdb.com/pressroom/stats/,2020-05-21 06:44:49
2020/6,"384,871,729",https://web.archive.org/web/20200715020304/https://www.imdb.com/pressroom/stats/,2020-07-15 02:03:04
2020/10,"399,737,106",https://web.archive.org/web/20201111220945/https://www.imdb.com/pressroom/stats/,2020-11-11 22:09:45
2020/12,"404,763,829",https://web.archive.org/web/20210228150759/https://www.imdb.com/pressroom/stats/,2021-02-28 15:07:59
2021/9,"407,000,169",https://web.archive.org/web/20211107065538/https://www.imdb.com/pressroom/stats/,2021-11-07 06:55:38
2021/12,"428,149,001",https://web.archive.org/web/20220319195717/https://www.imdb.com/pressroom/stats/,2022-03-19 19:57:17
2022/3,"447,161,643",https://web.archive.org/web/20220629235407/https://www.imdb.co

|Data Year/Data Month|# of Records|Archive URL|Archive Date                                                                    |
|--------------------|------------|-----------|--------------------------------------------------------------------------------|
|2019/5              |336,394,426 |https://web.archive.org/web/20191122024753/https://www.imdb.com/pressroom/stats/|2019-11-22 02:47:53                                                             |
|2019/12             |364,275,174 |https://web.archive.org/web/20200521064449/https://www.imdb.com/pressroom/stats/|2020-05-21 06:44:49                                                             |
|2020/6              |384,871,729 |https://web.archive.org/web/20200715020304/https://www.imdb.com/pressroom/stats/|2020-07-15 02:03:04                                                             |
|2020/10             |399,737,106 |https://web.archive.org/web/20201111220945/https://www.imdb.com/pressroom/stats/|2020-11-11 22:09:45                                                             |
|2020/12             |404,763,829 |https://web.archive.org/web/20210228150759/https://www.imdb.com/pressroom/stats/|2021-02-28 15:07:59                                                             |
|2021/9              |407,000,169 |https://web.archive.org/web/20211107065538/https://www.imdb.com/pressroom/stats/|2021-11-07 06:55:38                                                             |
|2021/12             |428,149,001 |https://web.archive.org/web/20220319195717/https://www.imdb.com/pressroom/stats/|2022-03-19 19:57:17                                                             |
|2022/3              |447,161,643 |https://web.archive.org/web/20220629235407/https://www.imdb.com/pressroom/stats/|2022-06-29 23:54:07                                                             |
|2022/6              |451,282,450 |https://web.archive.org/web/20220901052223/https://www.imdb.com/pressroom/stats/|2022-09-01 05:22:23                                                             |
|2022/9              |465,914,377 |https://web.archive.org/web/20221026144332/https://www.imdb.com/pressroom/stats/|2022-10-26 14:43:32                                                             |
|2022/11             |477,899,162 |https://web.archive.org/web/20221206045738/https://www.imdb.com/pressroom/stats/|2022-12-06 04:57:38                                                             |
|2022/12             |484,123,998 |https://web.archive.org/web/20230330064107/https://www.imdb.com/pressroom/stats/|2023-03-30 06:41:07                                                             |


In [35]:
# Column header for the export printed by this cell. The columns line up with the values
# emitted per row further down: qid, record count, data month, reference URL, archive URL,
# archive date, retrieval time.
# NOTE(review): these look like Wikidata QuickStatements CSV column names — confirm.
heading = "qid,P4876,qal585,S854,s1065,s2960,s813"

def format_date(date: datetime.datetime, precision: int = 11) -> str:
    """Format ``date`` as ``+YYYY-MM-DDTHH:MM:SSZ/<precision>``.

    Args:
        date: Moment to format. A naive datetime is formatted as-is, i.e. its fields are
            taken to be UTC already. A timezone-aware datetime is converted to UTC first,
            so the trailing ``Z`` always matches the printed fields.
        precision: Integer appended after the slash.
            NOTE(review): presumably a Wikidata time precision (10 = month, 11 = day) — confirm.

    Returns:
        The formatted string, e.g. ``+2019-11-22T02:47:53Z/11``.
    """
    # Only aware datetimes carry an offset to remove; calling astimezone() on a naive one
    # would silently assume the machine's local zone, so naive values are left untouched.
    if date.tzinfo is not None and date.utcoffset() is not None:
        date = date.astimezone(datetime.timezone.utc)
    return f"+{date.year:04d}-{date.month:02d}-{date.day:02d}T{date.hour:02d}:{date.minute:02d}:{date.second:02d}Z/{precision}"

reference_url = "https://www.imdb.com/pressroom/stats/"
# The formatted value is stamped with "Z", so take the current time in UTC; a plain
# datetime.now() would print local wall-clock time labelled as UTC.
retrieved = format_date(datetime.datetime.now(datetime.timezone.utc))
qid="P345"
print(heading)
# One line per distinct data month, in the insertion order of unique_data_months.
# The data month is emitted as the first day of that month with precision 10. The two
# URLs are wrapped in three double quotes on each side, which a CSV parser reads back as
# the URL enclosed in one literal pair of double quotes.
for (file_year, file_month), (data_year, data_month), num_records, archive_url, archive_date in unique_data_months.values():
    print(f'{qid},{num_records},{format_date(datetime.datetime(data_year, data_month, 1), precision=10)},"""{reference_url}""","""{archive_url}""",{format_date(archive_date)},{retrieved}')

qid,P4876,qal585,S854,s1065,s2960,s813
P345,336394426,+2019-05-01T00:00:00Z/10,"""https://www.imdb.com/pressroom/stats/""","""https://web.archive.org/web/20191122024753/https://www.imdb.com/pressroom/stats/""",+2019-11-22T02:47:53Z/11,+2023-04-01T08:19:29Z/11
P345,364275174,+2019-12-01T00:00:00Z/10,"""https://www.imdb.com/pressroom/stats/""","""https://web.archive.org/web/20200521064449/https://www.imdb.com/pressroom/stats/""",+2020-05-21T06:44:49Z/11,+2023-04-01T08:19:29Z/11
P345,384871729,+2020-06-01T00:00:00Z/10,"""https://www.imdb.com/pressroom/stats/""","""https://web.archive.org/web/20200715020304/https://www.imdb.com/pressroom/stats/""",+2020-07-15T02:03:04Z/11,+2023-04-01T08:19:29Z/11
P345,399737106,+2020-10-01T00:00:00Z/10,"""https://www.imdb.com/pressroom/stats/""","""https://web.archive.org/web/20201111220945/https://www.imdb.com/pressroom/stats/""",+2020-11-11T22:09:45Z/11,+2023-04-01T08:19:29Z/11
P345,404763829,+2020-12-01T00:00:00Z/10,"""https://www.imdb.com/pressroom/sta