In [13]:
import pandas as pd
import numpy as np
import os

In [None]:
# The relative "gsscoder_python/..." paths used in this notebook resolve from the
# parent directory. Step up only when the lookup file is not already reachable, so
# re-running this cell does not keep climbing the directory tree.
lookup_path = "gsscoder_python/lookups/all_lad_codes_dates.pickle"
if not os.path.exists(lookup_path):
    os.chdir('..')

# Load the data
all_codes = pd.read_pickle(lookup_path)


In [29]:
# Normalise both date columns to datetime; unparseable end dates become NaT
all_codes['start_date'] = pd.to_datetime(all_codes['start_date'])
all_codes['end_date'] = pd.to_datetime(all_codes['end_date'], errors='coerce')

# Rows recorded as starting on 2009-01-01 are moved back one day to 2008-12-31
mask = all_codes['start_date'] == pd.Timestamp("2009-01-01")
all_codes.loc[mask, 'start_date'] = pd.Timestamp("2008-12-31")

# Distinct, non-missing start dates in chronological order
change_dates = sorted(all_codes['start_date'].dropna().unique())

# The year of each change date, as a string (e.g. "2008")
change_years = [str(stamp.year) for stamp in pd.to_datetime(change_dates)]

# 4. Helper function to avoid code duplication
def get_year_data(year, columns, codes=None):
    """Return the rows that are active on 31 December of ``year``.

    A row counts as active when its ``start_date`` is on or before
    31 December of ``year`` and its ``end_date`` is either missing or on/after
    that same day.

    Parameters
    ----------
    year : int or str
        Calendar year; converted with ``str()`` before the date is built.
    columns : list of str
        Columns to keep, in output order. The first entry is the sort key.
    codes : pandas.DataFrame, optional
        Table to filter. It is expected to carry datetime ``start_date`` and
        ``end_date`` columns plus every name in ``columns``. When omitted, the
        notebook-level ``all_codes`` frame is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``columns``, sorted ascending by the first
        column and re-indexed from 0. The input table is left untouched.
    """
    if codes is None:
        # Fall back to the notebook-level lookup table (original behaviour)
        codes = all_codes

    columns = list(columns)
    year = str(year)  # Ensure string type
    end_of_year = pd.to_datetime(f"{year}-12-31")

    # Row filter: already started, and not ended before the year's last day
    started = codes['start_date'] <= end_of_year
    still_open = codes['end_date'].isna() | (codes['end_date'] >= end_of_year)

    # Select rows and columns in one step, then copy only what is kept so the
    # assignment below never touches the caller's frame
    data = codes.loc[started & still_open, columns].copy()

    # The first requested column drives the ordering
    sort_col = columns[0]

    # Object columns may mix value types; cast to str so sorting can compare them
    if data[sort_col].dtype == object:
        data[sort_col] = data[sort_col].astype(str)

    return data.sort_values(sort_col).reset_index(drop=True)

# Build one snapshot per change year for each column selection.
# Keys take the form "y<year>", e.g. "y2008".
test_codes = dict(
    (f"y{yr}", get_year_data(yr, ['gss_code'])) for yr in change_years
)
test_names = dict(
    (f"y{yr}", get_year_data(yr, ['gss_name'])) for yr in change_years
)
test_codes_names = dict(
    (f"y{yr}", get_year_data(yr, ['gss_code', 'gss_name'])) for yr in change_years
)

# Write each dictionary to its own pickle, only after all three have been built
pd.to_pickle(test_codes, "gsscoder_python/lookups/test_codes.pkl")
pd.to_pickle(test_names, "gsscoder_python/lookups/test_names.pkl")
pd.to_pickle(test_codes_names, "gsscoder_python/lookups/test_codes_names.pkl")