In [2]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

# --- 1) Paths ---
# Source table plus the files this cell writes; edit here if the files move.
INPUT_PATH = "store_data_cleaned.csv"
OUTPUT_PATH = "store_data_encoded.csv"
CITY_COLS_LIST_PATH = "store_city_columns.json"

# --- 2) Load ---
# STORE_NUMBER is read as text so the identifier is not reformatted as an integer.
df = pd.read_csv(
    INPUT_PATH,
    dtype={'STORE_NUMBER': str},
)
print("Loaded store data:", df.shape)

# --- 3) Normalize whitespace & convert empty strings to NaN ---
def normalize_str_col(s):
    """Strip whitespace from a column and turn placeholder text into real missing values.

    Parameters
    ----------
    s : pandas.Series
        Column to clean; every value is converted to text first.

    Returns
    -------
    pandas.Series
        Stripped strings with the same index as ``s``. Entries that were
        already missing, are empty after stripping, or spell 'nan' / 'none'
        in any letter case are returned as NaN.
    """
    text = s.astype(str).str.strip()
    # Compare case-insensitively so 'NaN', 'NONE', ... are caught as well;
    # an exact-match lookup lets them through as if they were real values.
    placeholder = text.str.lower().isin(['', 'nan', 'none'])
    # Keep genuinely missing inputs missing regardless of how astype(str) renders them.
    return text.mask(s.isna() | placeholder, np.nan)

# CITY and STATE are always cleaned; POSTAL_CODE gets the same treatment only if the column exists.
cols_to_normalize = ['CITY', 'STATE']
if 'POSTAL_CODE' in df.columns:
    cols_to_normalize.append('POSTAL_CODE')
for text_col in cols_to_normalize:
    df[text_col] = normalize_str_col(df[text_col])

# --- 4) Diagnostics (optional but recommended) ---
print("Unique stores:", df['STORE_NUMBER'].nunique())
print("Rows with missing CITY:", df['CITY'].isna().sum())
print("Rows with missing STATE:", df['STATE'].isna().sum())
print("Unique cities (sample):", df['CITY'].dropna().unique()[:20])
# Preview only the columns that are actually present: POSTAL_CODE is handled as an
# optional column during normalization, so indexing it unconditionally here would
# raise KeyError on an input file that lacks it.
preview_cols = [
    c for c in ['STORE_NUMBER', 'CITY', 'STATE', 'POSTAL_CODE', 'fill_source']
    if c in df.columns
]
print(df[preview_cols].head(10).to_string(index=False))

# --- 5) + 6) Flag missing values, then fill them with the sentinel 'Unknown' ---
# The 0/1 flag must be computed before the fill; afterwards the column no longer
# shows which rows were originally missing.
for source_col, flag_col in (('CITY', 'city_missing'), ('STATE', 'state_missing')):
    was_missing = df[source_col].isna()
    df[flag_col] = was_missing.astype(int)
    df[source_col] = df[source_col].fillna('Unknown')

# Standardize city case so spelling variants share one name
# (e.g. 'HOUSTON' -> 'Houston'; 'Winter Park' stays 'Winter Park').
# Title case is used here; whichever style is chosen must be the same across datasets.
df['CITY_norm'] = df['CITY'].str.title()
# Uppercase alternative:
# df['CITY_norm'] = df['CITY'].str.upper()

# --- 7) One-hot encode CITY using pandas.get_dummies ---
# One integer 0/1 column per distinct normalized city name, named 'city_<name>'.
city_dummies = pd.get_dummies(df['CITY_norm'], prefix='city', dtype=int)
print("City dummy columns:", len(city_dummies.columns))

# --- 8) Prepare final store dataframe ---
# Keep the store key, STATE, and the missing indicators, then append the city dummies.
# Both parts get a fresh 0..n-1 index so the column-wise concat lines rows up by position.
final_cols = ['STORE_NUMBER', 'STATE', 'city_missing', 'state_missing']
base_part = df[final_cols].reset_index(drop=True)
dummy_part = city_dummies.reset_index(drop=True)
store_enc = pd.concat([base_part, dummy_part], axis=1)

# STATE stays a readable string but is stored as a category dtype.
store_enc['STATE'] = store_enc['STATE'].astype('category')

# --- 9) POSTAL_CODE is left out of store_enc; the input file itself is not modified ---
print("Final encoded store shape:", store_enc.shape)

# --- 10) Save encoded dataframe & city column list ---
store_enc.to_csv(OUTPUT_PATH, index=False)
print(f"Saved encoded store file to: {OUTPUT_PATH}")

# Also save the list of city dummy columns so the same columns can be enforced when merging/test scoring.
# The names are taken from the dummy frame itself: selecting by the 'city_' prefix would
# also pick up the 'city_missing' indicator, which is not a one-hot city column.
city_cols = list(city_dummies.columns)
with open(CITY_COLS_LIST_PATH, 'w') as f:
    json.dump(city_cols, f)
print("Saved city columns list to:", CITY_COLS_LIST_PATH)

# --- 11) Quick merge example (how you'll later join to orders) ---
# orders_df = pd.read_csv("/path/to/order_data_cleaned_and_encoded.csv", dtype={'STORE_NUMBER': str})
# merged = orders_df.merge(store_enc, on='STORE_NUMBER', how='left')
# merged[['STORE_NUMBER'] + city_cols + ['STATE', 'city_missing', 'state_missing']].head()

# --- 12) Notes for future / scoring ---
# - If you later get a test set with a new city not in city_cols, add those as zero columns (or map to 'Unknown').
# - Keep CITY list file (store_city_columns.json) and reuse it to align columns across train / inference.
# - If you prefer fewer columns, you can label-encode CITY or use target-encoding instead of one-hot.


Loaded store data: (38, 5)
Unique stores: 38
Rows with missing CITY: 1
Rows with missing STATE: 1
Unique cities (sample): ['GRAPEVINE' 'HUNTERSVILLE' 'Winter Park' 'LAS VEGAS' 'ARDMORE' 'Oxnard'
 'Nashville' 'Austin' 'RUTHERFORD' 'OMAHA' 'IRVING' 'CHARLOTTE'
 'ARLINGTON' 'Brandon' 'Dallas' 'Henderson' 'WALLED LAKE' 'HOUSTON'
 'APPLE VALLEY' 'SAN ANTONIO']
STORE_NUMBER         CITY STATE POSTAL_CODE fill_source
        2156    GRAPEVINE    TX     76051.0    complete
        1419 HUNTERSVILLE    NC     28078.0    complete
        2249  Winter Park    FL     32792.0    complete
        2513    LAS VEGAS    NV     89129.0    complete
        1754      ARDMORE    OK     73401.0    complete
         949       Oxnard    CA     93033.0    complete
         820    Nashville    TN     37203.0    complete
        1161       Austin    TX     78723.0    complete
        2517   RUTHERFORD    CA     94573.0    complete
        4391        OMAHA    NE     68107.0    complete
City dummy columns: 33
Fin

In [5]:
# Remove the two missing-value indicator columns from the encoded table.
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because the columns have already been removed.
store_enc = store_enc.drop(columns=['city_missing', 'state_missing'], errors='ignore')

In [6]:
# Remove one store from the encoded table, identified by its STORE_NUMBER instead of
# by position: dropping label 22 from a frame that is re-indexed afterwards removes a
# different row each time the cell is re-run. store_enc was built row-for-row from df,
# so position 22 in df names the same store on the first run, and later runs are no-ops.
# NOTE(review): row 22 appears to be the store whose CITY was filled with 'Unknown' —
# confirm this is the row that should be excluded.
# Assumes STORE_NUMBER is unique per row, so exactly one row is removed.
excluded_store = df['STORE_NUMBER'].iloc[22]
store_enc = store_enc[store_enc['STORE_NUMBER'] != excluded_store].reset_index(drop=True)

In [7]:
# Display up to the first 40 rows of the encoded store table.
store_enc.iloc[:40]

Unnamed: 0,STORE_NUMBER,STATE,city_Apple Valley,city_Ardmore,city_Arlington,city_Atwater,city_Aurora,city_Austin,city_Brandon,city_Charlotte,...,city_Mililani,city_Nashville,city_Omaha,city_Oxnard,city_Rutherford,city_San Antonio,city_Scottsdale,city_Unknown,city_Walled Lake,city_Winter Park
0,2156,TX,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1419,NC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2249,FL,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2513,NV,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1754,OK,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,949,CA,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,820,TN,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,1161,TX,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2517,CA,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9,4391,NE,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [8]:
# Write the current state of the encoded table to OUTPUT_PATH, without the index column.
store_enc.to_csv(path_or_buf=OUTPUT_PATH, index=False)

In [9]:
# Display the first five rows of the working store table.
df.iloc[:5]

Unnamed: 0,STORE_NUMBER,CITY,STATE,POSTAL_CODE,fill_source,city_missing,state_missing,CITY_norm
0,2156,GRAPEVINE,TX,76051.0,complete,0,0,Grapevine
1,1419,HUNTERSVILLE,NC,28078.0,complete,0,0,Huntersville
2,2249,Winter Park,FL,32792.0,complete,0,0,Winter Park
3,2513,LAS VEGAS,NV,89129.0,complete,0,0,Las Vegas
4,1754,ARDMORE,OK,73401.0,complete,0,0,Ardmore


In [10]:
# Distinct CITY values in order of first appearance.
df.loc[:, 'CITY'].unique()

array(['GRAPEVINE', 'HUNTERSVILLE', 'Winter Park', 'LAS VEGAS', 'ARDMORE',
       'Oxnard', 'Nashville', 'Austin', 'RUTHERFORD', 'OMAHA', 'IRVING',
       'CHARLOTTE', 'ARLINGTON', 'Brandon', 'Dallas', 'Henderson',
       'WALLED LAKE', 'HOUSTON', 'APPLE VALLEY', 'SAN ANTONIO', 'LAVEEN',
       'MACOMB', 'Unknown', 'ATWATER', 'MIAMI', 'Elsa', 'AURORA',
       'Mililani', 'Linden', 'Cicero', 'EL PASO', 'SCOTTSDALE',
       'LITTLETON'], dtype=object)

In [11]:
# Number of distinct CITY values.
df['CITY'].unique().shape[0]

33

In [15]:
# Number of columns in the encoded store table.
store_enc.shape[1]

35