In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import pickle

# Encoding pipeline: read the cleaned dataset, derive city / cuisine features,
# one-hot encode them, then persist both the fitted encoders and the encoded
# table so the same transformation can be re-applied later.


def _parse_cuisines(text):
    """Return the non-empty, whitespace-trimmed labels of a comma-separated string."""
    # Dropping empty pieces keeps MultiLabelBinarizer from learning a "" class
    # for rows with a missing cuisine or with stray/trailing commas.
    return [label for label in (piece.strip() for piece in text.split(",")) if label]


# 1. Load cleaned dataset
df = pd.read_csv("cleaned_data.csv")

# 2. Split city into main_city and place_in_city
# The text before the first comma is the place, the remainder is the main city.
# str.partition always yields three columns (before, separator, after), so the
# assignment below cannot fail when no value contains a comma.
city_parts = df["city"].str.partition(",")
place = city_parts[0].str.strip()
main = city_parts[2].str.strip()
df["place_in_city"] = place
# A value without a comma (or with nothing after it) has an empty remainder:
# fall back to the whole value so main_city is not left blank for those rows.
# Missing city values stay missing in both columns.
df["main_city"] = main.where(main != "", place)

# 3. Process cuisine column (multi-label)
df["cuisine"] = df["cuisine"].fillna("").apply(_parse_cuisines)

# 4. Encode main_city and place_in_city
# handle_unknown="ignore" makes the saved encoder emit all-zero columns for
# categories it did not see during fit instead of raising.
city_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
city_encoded = city_encoder.fit_transform(df[["main_city", "place_in_city"]])
city_encoded_df = pd.DataFrame(
    city_encoded,
    columns=city_encoder.get_feature_names_out(["main_city", "place_in_city"]),
    index=df.index,
)

# 5. Encode cuisine (multi-label)
# Rows whose cuisine list is empty simply get zeros in every cuisine column.
cuisine_encoder = MultiLabelBinarizer()
cuisine_encoded = cuisine_encoder.fit_transform(df["cuisine"])
cuisine_encoded_df = pd.DataFrame(
    cuisine_encoded,
    columns=cuisine_encoder.classes_,
    index=df.index,
)

# 6. Final dataset (keep name, ratings, cost + encoded features)
# All three frames share df.index, so the column-wise concat aligns row by row.
final_df = pd.concat(
    [df[["name", "rating", "rating_count", "cost"]], city_encoded_df, cuisine_encoded_df],
    axis=1,
)

# 7. Save encoder (both city and cuisine inside one dict)
with open("encoder.pkl", "wb") as f:
    pickle.dump({"city_encoder": city_encoder, "cuisine_encoder": cuisine_encoder}, f)

# 8. Save final encoded dataset
final_df.to_csv("encoded_data.csv", index=False)

print("Encoding complete")
print("Final shape:", final_df.shape)


✅ Encoding complete
Final shape: (148398, 978)
