## Instructions on how to get the raw dataset we use


In [10]:
import pandas as pd
import numpy as np


In [1]:
# Parse the tab-delimited complaints file one line at a time so that a single
# malformed line (wrong field count or invalid UTF-8) is recorded and skipped
# instead of aborting the whole load.
EXPECTED_FIELDS = 49  # a row is kept only if it splits into exactly this many fields

data = []      # accepted rows, each a list of EXPECTED_FIELDS strings
bad_rows = []  # (1-based line number, field count or "decode_error") per rejected line

with open("FLAT_CMPL.txt", "rb") as f:  # binary mode: decode per line to isolate bad bytes
    for line_no, raw_line in enumerate(f, start=1):
        try:
            line = raw_line.decode("utf-8")
        except UnicodeDecodeError:
            bad_rows.append((line_no, "decode_error"))
            continue  # skip lines that are not valid UTF-8

        # Binary mode performs no newline translation, so strip "\r" as well as
        # "\n"; otherwise a CRLF-terminated file leaves "\r" on the last field.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) == EXPECTED_FIELDS:
            data.append(fields)
        else:
            bad_rows.append((line_no, len(fields)))

In [5]:
# Summarise the load: rows kept versus lines rejected by the parser.
n_good = len(data)
n_bad = len(bad_rows)
print(f"\n✅ Loaded {n_good} good rows")
print(f"⚠️  Skipped {n_bad} bad rows")


✅ Loaded 2075452 good rows
⚠️  Skipped 0 bad rows


In [6]:
# Build the raw frame from the parsed rows. No header is supplied, so the
# columns receive the default integer labels.
df = pd.DataFrame(data)

# Preview the first five rows as plain text.
preview = df.head(5)
print(preview)

  0       1                       2      3            4     5  6         7   \
0  1  958241      Volvo Car USA, LLC  VOLVO          760  1987  N             
1  2  958130      Ford Motor Company   FORD  THUNDERBIRD  1992  N  19941222   
2  3  958132       Kia America, Inc.    KIA       SEPHIA  1994  Y  19941230   
3  4  958133  Chrysler (FCA US, LLC)  DODGE          600  1987  N  19941231   
4  5  958137  Chrysler (FCA US, LLC)  DODGE      CARAVAN  1991  N  19941218   

  8  9   ... 39 40 41 42 43 44 45 46 47 48  
0  N  0  ...                    V     N  N  
1  N  0  ...                    V     N  N  
2  N  0  ...                    V     N  N  
3  N  0  ...                    V     N  N  
4  N  0  ...                    V     N  N  

[5 rows x 49 columns]


In [12]:
# Proportion of complaints dated 2015 or later, measured over the rows whose
# column 7 starts with a numeric year (rows with a blank/non-numeric value in
# that column are excluded from the denominator).
# NOTE(review): this treats column 7 as the filing date — confirm against the
# dataset's field layout that it is not a different date field.
_year_text = df[7].astype(str).str[:4]   # first four characters = candidate year
valid_mask = _year_text.str.isdigit()
np.mean(_year_text[valid_mask].astype(int) >= 2015)

0.4492878469939572

In [13]:
# Work on a copy so the raw `df` is left untouched, then keep only the rows
# where the first four characters of column 7 are all digits.
df_cleaned = df.copy()
_has_year = df_cleaned[7].astype(str).str[:4].str.isdigit()
df_cleaned = df_cleaned[_has_year]

In [14]:
# Restrict the cleaned frame to rows whose column-7 year is 2015 or later.
_year = df_cleaned[7].astype(str).str[:4].astype(int)
df_2015 = df_cleaned[_year >= 2015]

In [15]:
# Persist the 2015-and-later subset. Writing `df` here would save the full,
# unfiltered frame under a file name that promises only the 2015+ rows.
df_2015.to_parquet("complaints_2015.parquet", index=False)