In [1]:
import pandas as pd
from pymongo import MongoClient
from io import BytesIO
from zipfile import ZipFile
from pySmartDL import SmartDL


################
def check_data_exists(name_db: str, name_collection: str, year: int,
                      host: str = 'localhost', port: int = 27017):
    """Report whether a MongoDB collection already holds documents for ``year``.

    Parameters
    ----------
    name_db : str
        Name of the MongoDB database.
    name_collection : str
        Name of the collection inside that database.
    year : int
        Value matched against each document's ``"year"`` field.
    host : str, optional
        MongoDB server host; defaults to ``'localhost'``.
    port : int, optional
        MongoDB server port; defaults to ``27017``.

    Returns
    -------
    dict or pymongo collection
        An empty dict when at least one matching document exists (nothing
        needs to be loaded); otherwise the collection handle, so the caller
        can insert into it.
    """
    # The client is deliberately left open: the collection handle returned
    # below is used by the caller for inserts.
    client = MongoClient(host, port)
    collection = client[name_db][name_collection]

    # Only existence matters, so limit=1 lets the server stop at the first
    # matching document instead of counting every one.
    if collection.count_documents({"year": year}, limit=1) > 0:
        print(f"DB:{name_db}, The {name_collection} collection contains data for the year {year}.")
        # No update needed: return an empty dict as the "skip" sentinel.
        return {}

    print(f"DB:{name_db}, The {name_collection} collection does not contain data for the year {year}.")
    return collection



################
def get_data(url):
    """Download a zipped Stata file and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of a ``.zip`` archive containing a Stata ``.dta`` file.

    Returns
    -------
    pandas.DataFrame
        Contents of the ``.dta`` member of the archive. If no member name
        ends in ``.dta``, the first member is read instead.

    Raises
    ------
    ValueError
        If the downloaded archive has no members.
    """
    import os  # local import: only needed to clean up the downloaded file

    # Download the archive to the destination pySmartDL picks.
    obj = SmartDL(url, progress_bar=False)
    obj.start()
    zip_path = obj.get_dest()

    try:
        # Open the archive straight from disk rather than copying the whole
        # file into memory first.
        with ZipFile(zip_path) as zf:
            members = zf.namelist()
            if not members:
                raise ValueError(f"Downloaded archive from {url} is empty.")

            # Prefer an explicit .dta member; fall back to the first entry.
            dta_filename = next(
                (name for name in members if name.lower().endswith('.dta')),
                members[0],
            )

            with zf.open(dta_filename) as dta_file:
                df = pd.read_stata(dta_file)
    finally:
        # Remove the downloaded archive so repeated calls do not pile up
        # files on disk.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    # Display the first few rows of the DataFrame
    print(df.head())
    return df




################
def fiter_cols(lst_col) :
    """Return the column names to keep, in their original order.

    A name is dropped when it ends in ``'oc'`` or starts with any of the
    prefixes listed below.

    Parameters
    ----------
    lst_col : iterable of str
        Candidate column names.

    Returns
    -------
    list of str
        The names that survive both filters.
    """
    # Metadata and otherwise unnecessary variables, matched by name prefix.
    unwanted_prefixes = (
        'reparea', 'shipno',
        'rectype', 'restatus',
        'occup', 'indus',
        'econdp_', 'econds_', 'enicon_', 'record_', 'eniflag', 'rnifla_', 'entity',
    )
    return [
        name
        for name in lst_col
        # '...oc' columns are occurrence data; only residence info is kept.
        if not (name.endswith('oc') or name.startswith(unwanted_prefixes))
    ]



# url = "https://data.nber.org/mortality/1982/mort1982.dta.zip"
# get_data(url)

In [2]:
################
# getting data from 1982 to 2003.
# Rows converted to dicts and inserted per batch. Batching avoids holding a
# whole year of records as one Python list on top of the DataFrame.
INSERT_BATCH_SIZE = 50_000

for year in range(1982, 2004):  # 1982 to 2003 inclusive
    # One collection per year, named after the year.
    collection = check_data_exists(name_db='nvss_mcd', name_collection=f'{year}', year=year)

    # check_data_exists returns an empty dict when the year is already loaded.
    if collection == {}:
        continue

    print('Get the data')
    # URL of the zip file
    url = f"https://data.nber.org/mortality/{year}/mort{year}.dta.zip"
    df = get_data(url=url)
    # Stamp the year explicitly because the file's own 'datayear' is inconsistent.
    df['year'] = year

    # Select the retained columns once, then insert batch by batch. An empty
    # frame produces no batches, so insert_many is never given an empty list.
    kept = df[fiter_cols(df.columns)]
    for start in range(0, len(kept), INSERT_BATCH_SIZE):
        records = kept.iloc[start:start + INSERT_BATCH_SIZE].to_dict('records')
        collection.insert_many(records)

# Drop the references to the last year's data so its memory can be reclaimed.
kept = {}
df = {}
records = {}
collection = {}

DB:nvss_mcd, The 1982 collection contains data for the year 1982.
DB:nvss_mcd, The 1983 collection contains data for the year 1983.
DB:nvss_mcd, The 1984 collection contains data for the year 1984.
DB:nvss_mcd, The 1985 collection contains data for the year 1985.
DB:nvss_mcd, The 1986 collection contains data for the year 1986.
DB:nvss_mcd, The 1987 collection contains data for the year 1987.
DB:nvss_mcd, The 1988 collection contains data for the year 1988.
DB:nvss_mcd, The 1989 collection contains data for the year 1989.
DB:nvss_mcd, The 1990 collection contains data for the year 1990.
DB:nvss_mcd, The 1991 collection contains data for the year 1991.
DB:nvss_mcd, The 1992 collection contains data for the year 1992.
DB:nvss_mcd, The 1993 collection contains data for the year 1993.
DB:nvss_mcd, The 1994 collection contains data for the year 1994.
DB:nvss_mcd, The 1995 collection contains data for the year 1995.
DB:nvss_mcd, The 1996 collection contains data for the year 1996.
DB:nvss_mc