In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD
import pickle

# Load the cleaned dataset; this section reads its 'city' column.
df = pd.read_csv("cleaned_data.csv")

# -----------------------------
# 1. Split city into main_city and sub_city
# -----------------------------
# Split on the first comma only. reindex() guarantees that both result
# columns exist even when no value contains a comma: str.split(expand=True)
# would then return a single column and a two-column assignment would raise.
city_parts = (
    df['city']
    .str.split(',', n=1, expand=True)
    .reindex(columns=[0, 1])
)

# NOTE(review): assumes the text before the first comma is the main city and
# the remainder is the sub city -- confirm against the data.
# Strip padding so "A, B" and "A,B" produce the same category labels.
df['main_city'] = city_parts[0].str.strip()

# Fill before stripping: when no row has a second part the column holds no
# strings at all, and the .str accessor needs string values to operate on.
df['sub_city'] = city_parts[1].fillna('Unknown').str.strip()

# -----------------------------
# 2. Encode categorical features
# -----------------------------

# Cuisine (multi-label encoding)
# Each row's comma-separated string becomes a list of cuisine names.
# Names are stripped and blank tokens are dropped, so "A, B" matches "A,B"
# and a row with no cuisine yields an all-zero row instead of creating a
# spurious "cuisine_" column for the empty string.
df['cuisine'] = df['cuisine'].fillna("").apply(
    lambda raw: [name.strip() for name in raw.split(',') if name.strip()]
)
mlb = MultiLabelBinarizer()
cuisine_encoded = pd.DataFrame(
    mlb.fit_transform(df['cuisine']),
    columns=[f"cuisine_{c}" for c in mlb.classes_],
    index=df.index,  # keep the encoded rows aligned with df
)

# Main city + sub city (OneHotEncoder)
# The result stays sparse; handle_unknown='ignore' means categories not seen
# during fit are encoded as all zeros when the fitted encoder is reused.
ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
city_encoded = ohe.fit_transform(df[['main_city', 'sub_city']])

# -----------------------------
# 3. Numeric features
# -----------------------------
# These columns are taken as they are; missing entries are replaced with 0.
numeric_cols = [
    'rating',
    'rating_count',
    'cost',
]
numeric_df = df.loc[:, numeric_cols].fillna(value=0)

# -----------------------------
# 4. Combine all features
# -----------------------------
from scipy.sparse import csr_matrix, hstack

# hstack is documented to take sparse blocks, so the two dense frames are
# converted explicitly instead of relying on implicit coercion of DataFrames.
# The numeric block is cast to float64 so a non-numeric column fails here,
# at the conversion, rather than further down the pipeline.
cuisine_block = csr_matrix(cuisine_encoded.to_numpy())
numeric_block = csr_matrix(numeric_df.to_numpy(dtype="float64"))

# Column order: cuisines, then cities, then numeric columns.
# CSR output is requested because the scaling step works on CSR/CSC input.
X = hstack(
    [
        cuisine_block,   # multi-hot cuisines
        city_encoded,    # sparse city data
        numeric_block,   # numerical
    ],
    format="csr",
)

# -----------------------------
# 5. Scale with MaxAbsScaler (memory safe)
# -----------------------------
# Learn the per-column scaling from X, then apply it to that same matrix.
scaler = MaxAbsScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# -----------------------------
# 6. Dimensionality reduction (to save memory further)
# -----------------------------
# Project the scaled matrix onto 200 components; the fixed seed keeps the
# result reproducible between runs.
svd_settings = {"n_components": 200, "random_state": 42}
svd = TruncatedSVD(**svd_settings)
X_reduced = svd.fit_transform(X_scaled)

# -----------------------------
# 7. Save processed dataset
# -----------------------------
# Only the reduced matrix is written out; the fitted mlb / ohe / scaler / svd
# objects are not persisted by this step.
with open("encoded_scaled.pkl", "wb") as out_file:
    pickle.Pickler(out_file).dump(X_reduced)

final_shape = X_reduced.shape
print("✅ Encoding, scaling, and dimensionality reduction complete!")
print("Final shape:", final_shape)


✅ Encoding, scaling, and dimensionality reduction complete!
Final shape: (148398, 200)
