In [56]:
import os
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import rarfile


In [58]:
# Base data directory, read from the DATA_PATH environment variable
# (a missing variable raises KeyError here).
data_path = Path(os.environ["DATA_PATH"])
# Sub-directory named "raws" directly under the base data directory.
raw_path = data_path.joinpath("raws")

In [83]:
def row_to_frame(row: str, n_cols: int = 72) -> pd.DataFrame:
    """Reshape one flat, delimiter-separated text row into a table.

    ``row`` is read as a flat sequence of fields separated by ENQ (0x05),
    NUL (0x00) or backspace (0x08) characters. Every consecutive group of
    ``n_cols`` fields forms one record: the first field of the record becomes
    the index label and the remaining fields become the columns
    ``1 .. n_cols - 1``. Each field has surrounding whitespace removed.

    Parameters
    ----------
    row
        The raw text to parse.
    n_cols
        Number of fields per record. Defaults to 72, the width this function
        was originally hard-coded for.

    Returns
    -------
    pd.DataFrame
        One row per record, indexed by the record's first field (index name
        ``"index"``), with string-valued columns labelled by field position.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1, or if the number of fields found in
        ``row`` is not a multiple of ``n_cols``.
    """
    if n_cols < 1:
        err = f"n_cols must be a positive integer, got {n_cols}."
        raise ValueError(err)

    # Collapse both separator characters onto a single one, then drop
    # surrounding whitespace and any leading/trailing separators so that no
    # spurious empty field appears at either end of the row.
    sep = "\x08"
    split = row.replace("\x05", sep).replace("\u0000", sep).strip().strip(sep).split(sep)

    if len(split) % n_cols != 0:
        err = f"Row length {len(split)} is not a multiple of {n_cols}."
        raise ValueError(err)

    return (
        pd.Series(
            [x.strip() for x in split],
            name="col",
        )
        .to_frame()
        .assign(
            # Position of the field inside its record / number of the record.
            col_idx=lambda df: df.index % n_cols,
            row_idx=lambda df: df.index // n_cols,
        )
        # Each (row_idx, col_idx) pair occurs exactly once, so "first" simply
        # picks that single value while turning the long list into a grid.
        .pivot_table(index="row_idx", columns="col_idx", values="col", aggfunc="first")
        .rename(columns={0: "index"})
        .set_index("index")
    )



# Directory holding the extracted 1990 SCINCE files, derived from the
# configured raw-data directory instead of a machine-specific absolute path.
# NOTE(review): assumes DATA_PATH points at the `census_processing_data`
# directory that the previously hard-coded path lived under - confirm.
extracted_path = raw_path / "1990" / "SCINCE1990" / "SCINCE"

# Despite its name this is a list: one parsed frame per .PNF file.
df = []
for dir_path in extracted_path.glob("[0-9A-Z][0-9]"):
    if not dir_path.is_dir():
        continue

    for fpath in dir_path.glob("*.PNF"):
        # The first character of the file name is turned into a two-digit
        # code: a digit is kept as is, anything else is mapped through its
        # code point so that "A" -> 10, "B" -> 11, and so on.
        state_code = fpath.stem[0]
        if not state_code.isdigit():
            state_code = str(ord(state_code) - ord("A") + 10)
        state_code = state_code.zfill(2)

        # Only the first line of each file is parsed.
        # NOTE(review): the file is opened with the platform default text
        # encoding - confirm that this matches the source files.
        with fpath.open() as f:
            line = f.readline()

        df_temp = row_to_frame(line)
        # Prefix every record key with the two-digit code and the rest of the
        # file name, and drop dashes from the key itself.
        df_temp.index = (
            state_code + fpath.stem[1:] + df_temp.index.str.replace("-", "")
        )
        df.append(df_temp)

# Fail loudly instead of leaving `out` undefined (or stale from an earlier
# run of this cell) when nothing was found.
if not df:
    err = f"No .PNF files found under {extracted_path}."
    raise FileNotFoundError(err)

# Combine once, after every file has been read, rather than on each pass of
# the directory loop.
out = pd.concat(df).replace(["*", "N.D.", "N/D"], np.nan).sort_index()
# NOTE(review): cast_to_numeric is not defined in the cells visible here; it
# must already be defined or imported before this cell runs.
out = cast_to_numeric(out)

In [84]:
# Show the assembled table; the notebook renders a truncated preview of it.
out

col_idx,1,2,3,4,5,6,7,8,9,10,...,62,63,64,65,66,67,68,69,70,71
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0100100010000,440425,228222.0,382927.0,371835.0,305845.0,273309.0,262490.0,240502.0,106686.0,17667.0,...,5153.0,81428.0,80316.0,508.0,81423.0,76945.0,4486.0,401.0,58637.0,17954.0
010010001020A,1804,993.0,1662.0,1632.0,1402.0,1288.0,1241.0,1133.0,606.0,67.0,...,14.0,358.0,358.0,3.0,362.0,359.0,,3.0,318.0,28.0
0100100010229,213,103.0,180.0,173.0,128.0,109.0,103.0,95.0,39.0,,...,0.0,36.0,23.0,8.0,35.0,29.0,3.0,,18.0,4.0
0100100010233,1667,806.0,1412.0,1357.0,1056.0,909.0,865.0,779.0,318.0,38.0,...,27.0,261.0,256.0,3.0,262.0,243.0,22.0,0.0,157.0,61.0
0100100010252,3122,1575.0,2651.0,2565.0,2093.0,1846.0,1773.0,1649.0,678.0,73.0,...,38.0,611.0,592.0,,615.0,581.0,19.0,10.0,430.0,110.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3232052,12675,6282.0,10903.0,10547.0,8283.0,7161.0,6809.0,6141.0,2933.0,629.0,...,116.0,1490.0,724.0,61.0,1679.0,1261.0,99.0,31.0,1854.0,88.0
3232053,12226,6057.0,10356.0,9985.0,7754.0,6723.0,6393.0,5766.0,2654.0,527.0,...,131.0,1709.0,479.0,171.0,1609.0,1289.0,305.0,5.0,1859.0,51.0
3232054,14076,7093.0,11860.0,11450.0,8861.0,7489.0,7094.0,6367.0,3100.0,592.0,...,137.0,1470.0,330.0,109.0,1886.0,666.0,583.0,134.0,2081.0,49.0
3232055,35373,18356.0,31105.0,30143.0,24245.0,20813.0,19795.0,17771.0,9478.0,2287.0,...,332.0,5456.0,2299.0,460.0,5863.0,2986.0,1906.0,60.0,5755.0,374.0
