In [17]:
import pandas as pd
# Location of the raw Swiggy export (an absolute path on the author's machine).
raw_csv_path = "/Users/rajeshkumar/Desktop/Swiggy project/swiggy.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

In [18]:
# How many rows are exact repeats of an earlier row?
duplicate_count = df.duplicated().sum()
print("Duplicate rows:", duplicate_count)

# Keep only the first occurrence of every row.
df = df.drop_duplicates()


Duplicate rows: 0


In [19]:
import numpy as np

def parse_rating_count(x):
    """Convert a raw rating-count value into a number.

    Handled inputs:
      * missing values and "Too Few Ratings"   -> NaN
      * "50+ ratings", "200"                   -> 50, 200
      * "1K+ ratings", "1.5k+ ratings"         -> 1000, 1500
      * values that are already numeric        -> returned unchanged (as float)

    Anything that cannot be interpreted yields NaN.

    Parameters
    ----------
    x : str, number or missing value
        One raw entry of the rating-count column.

    Returns
    -------
    int or float
        The parsed count, or ``np.nan`` when no count can be extracted.
    """
    if pd.isna(x):
        return np.nan

    # Already numeric (e.g. the cell was re-run on a converted column):
    # keep the value instead of discarding it as "anything else".
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return float(x)
    if not isinstance(x, str):
        return np.nan

    text = x.strip().lower()
    if "too few" in text:             # Case: "Too Few Ratings"
        return np.nan
    if text.endswith("ratings"):      # Case: "50+ ratings" -> keep the leading token
        text = text.split()[0]
    token = text.replace("+", "").replace(",", "")

    # Case: "1K+" / "10k" -> thousands suffix
    multiplier = 1
    if token.endswith("k"):
        multiplier = 1000
        token = token[:-1]

    if token.isdigit():               # Case: "200", "50", "1" (+ optional K)
        return int(token) * multiplier
    # Case: fractional thousands such as "1.5k"
    if multiplier > 1 and token.replace(".", "", 1).isdigit():
        return int(round(float(token) * multiplier))
    return np.nan   # anything else becomes NaN

# Parse textual rating counts ("50+ ratings", "Too Few Ratings", ...) into floats.
df['rating_count'] = df['rating_count'].apply(parse_rating_count).astype(float)

# Strip the rupee sign (and any thousands separators) before converting cost.
# The string clean-up is skipped when the column is already numeric so this
# cell can be re-run without the .str accessor raising.
if not pd.api.types.is_numeric_dtype(df['cost']):
    df['cost'] = (
        df['cost']
        .str.replace("₹", "", regex=False)
        .str.replace(",", "", regex=False)
        .str.strip()
    )
df['cost'] = df['cost'].astype(float)

# Replace '--' with NaN and convert to float
df['rating'] = df['rating'].replace('--', np.nan).astype(float)



In [None]:
# Persist the cleaned frame; the row index is not written to the file.
cleaned_csv_name = "cleaned_data.csv"
df.to_csv(cleaned_csv_name, index=False)
print("✅ Cleaned dataset saved as cleaned_data.csv")


In [21]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
import pickle

# ---- Load cleaned dataset ----
df = pd.read_csv("cleaned_data.csv")

# ---- 1. Split city into sub_city and main_city ----
# "Area,City" -> sub_city="Area", main_city="City"; a value without a comma is
# used for both. reindex() guarantees two columns even when no value contains a
# comma (expand=True would otherwise yield one column and the assignment fails).
city_parts = (
    df['city'].str.split(",", n=1, expand=True)
    .reindex(columns=[0, 1])
    .astype(object)
)
df['sub_city'] = city_parts[0].str.strip()
df['main_city'] = city_parts[1].fillna(df['sub_city']).str.strip()

# ---- 2. Encode "name" using LabelEncoder ----
name_encoder = LabelEncoder()
df['name_encoded'] = name_encoder.fit_transform(df['name'])

# ---- 3. Encode "main_city" using OneHotEncoder ----
city_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
city_encoded = city_encoder.fit_transform(df[['main_city']])

city_df = pd.DataFrame(
    city_encoded,
    index=df.index,  # ✅ preserve same index as cleaned_data
    columns=city_encoder.get_feature_names_out(['main_city'])
)

# ---- 4. Encode "cuisine" (multi-label) ----
# Strip each label so "Indian" and " Indian" map to the same column, and drop
# empty labels so a missing cuisine does not create a column named "".
df['cuisine'] = (
    df['cuisine']
    .fillna("")
    .str.split(",")
    .apply(lambda labels: [label.strip() for label in labels if label.strip()])
)
mlb = MultiLabelBinarizer()
cuisine_encoded = mlb.fit_transform(df['cuisine'])

cuisine_df = pd.DataFrame(
    cuisine_encoded,
    index=df.index,  # ✅ preserve same index as cleaned_data
    columns=mlb.classes_
)

# ---- 5. Concatenate all features (indices aligned) ----
numerical_cols = ['rating', 'rating_count', 'cost']

# Downstream similarity code cannot consume NaN, so impute here:
# a missing rating / rating_count is treated as "no ratings" (0) and a missing
# cost falls back to the median cost (then 0 if the median itself is undefined).
numeric_df = (
    df[numerical_cols]
    .fillna({'rating': 0, 'rating_count': 0, 'cost': df['cost'].median()})
    .fillna(0)
)

final_df = pd.concat(
    [numeric_df, df[['name_encoded']], city_df, cuisine_df],
    axis=1
)

# ✅ Ensure index matches cleaned_data before saving
assert (final_df.index == df.index).all(), "Index mismatch detected!"

# ---- 6. Save Encoders ----
with open("encoder.pkl", "wb") as f:
    pickle.dump({
        "name_encoder": name_encoder,
        "city_encoder": city_encoder,
        "cuisine_encoder": mlb
    }, f)

# ---- 7. Save Preprocessed Dataset ----
final_df.to_csv("encoded_data.csv", index=False)

print("✅ Preprocessing complete: encoded_data.csv + encoder.pkl saved (indices preserved)")


✅ Preprocessing complete: encoded_data.csv + encoder.pkl saved (indices preserved)


In [None]:
import pandas as pd

# Reload both artefacts written by the earlier cells.
cleaned, encoded = (
    pd.read_csv(csv_name) for csv_name in ("cleaned_data.csv", "encoded_data.csv")
)

# Row counts must agree for positional lookups between the two frames.
lengths_match = len(cleaned) == len(encoded)
print("Same length?", lengths_match)

# Compare the row indices of the two frames.
indices_match = cleaned.index.equals(encoded.index)
print("Same indices?", indices_match)


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_restaurant_cosine(restaurant_name, top_n=5):
    """Recommend the restaurants most similar to ``restaurant_name``.

    Reads the module-level frames ``cleaned`` (human-readable rows) and
    ``encoded`` (numeric feature rows); the lookup is positional, so the two
    are assumed to be row-aligned.

    Parameters
    ----------
    restaurant_name : str
        Name to look up, matched case-insensitively against ``cleaned['name']``.
        When several rows match, the first one is used as the query.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``name``, ``city``, ``cuisine``, ``rating`` and ``cost`` columns of
        the most similar rows (best first), or an error message string when the
        restaurant is not found.
    """
    name_matches = (cleaned['name'].str.lower() == restaurant_name.lower()).fillna(False)
    match_positions = np.flatnonzero(name_matches.to_numpy(dtype=bool))
    if match_positions.size == 0:
        return f"❌ Restaurant '{restaurant_name}' not found!"

    # Work with row positions throughout so the lookup does not depend on the
    # index labels of `cleaned`.
    query_pos = match_positions[0]

    # cosine_similarity rejects NaN input, so missing features are treated as 0.
    features = encoded.fillna(0).to_numpy(dtype=float)

    # Compare only with that restaurant (not whole n x n)
    sim_scores = cosine_similarity(features[[query_pos]], features)[0]

    # Rank best-first with a stable sort, then drop the query row explicitly:
    # with tied scores the query is not guaranteed to be ranked first.
    ranked = np.argsort(-sim_scores, kind="stable")
    top_positions = ranked[ranked != query_pos][:max(top_n, 0)]

    return cleaned.iloc[top_positions][['name','city','cuisine','rating','cost']]


In [None]:
import pandas as pd

# Reload both artefacts written by the earlier cells.
cleaned, encoded = (
    pd.read_csv(csv_name) for csv_name in ("cleaned_data.csv", "encoded_data.csv")
)

# Row counts must agree for positional lookups between the two frames.
lengths_match = len(cleaned) == len(encoded)
print("Same length?", lengths_match)

# Compare the row indices of the two frames.
indices_match = cleaned.index.equals(encoded.index)
print("Same indices?", indices_match)
