In [11]:
#Steps:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [8]:
df=pd.read_csv('Dataset .csv')

In [9]:
# Display basic information about the dataset
print("Original Dataset:")
print(df.info())

Original Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Swi

In [13]:
# Handling Missing Values
# Identify categorical columns
categorical_cols = df.select_dtypes(include='object').columns

In [14]:
# Create separate imputers for numerical and categorical columns
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

In [15]:
# Apply imputers to respective columns
df_numeric = pd.DataFrame(numeric_imputer.fit_transform(df.select_dtypes(include='number')), columns=df.select_dtypes(include='number').columns)
df_categorical = pd.DataFrame(categorical_imputer.fit_transform(df[categorical_cols]), columns=categorical_cols)

In [16]:
# Combine the numeric and categorical dataframes
df_filled = pd.concat([df_numeric, df_categorical], axis=1)

In [17]:
# Display information after handling missing values
print("\nDataset after handling missing values:")
print(df_filled.info())


Dataset after handling missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   float64
 1   Country Code          9551 non-null   float64
 2   Longitude             9551 non-null   float64
 3   Latitude              9551 non-null   float64
 4   Average Cost for two  9551 non-null   float64
 5   Price range           9551 non-null   float64
 6   Aggregate rating      9551 non-null   float64
 7   Votes                 9551 non-null   float64
 8   Restaurant Name       9551 non-null   object 
 9   City                  9551 non-null   object 
 10  Address               9551 non-null   object 
 11  Locality              9551 non-null   object 
 12  Locality Verbose      9551 non-null   object 
 13  Cuisines              9551 non-null   object 
 14  Currency              9551 non-n

In [18]:
# Encoding Categorical Variables
# Assuming that categorical variables are identified by data type 'object'
label_encoder = LabelEncoder()
df_filled[categorical_cols] = df_filled[categorical_cols].apply(label_encoder.fit_transform)


In [19]:
# Display information after encoding categorical variables
print("\nDataset after encoding categorical variables:")
print(df_filled.info())


Dataset after encoding categorical variables:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   float64
 1   Country Code          9551 non-null   float64
 2   Longitude             9551 non-null   float64
 3   Latitude              9551 non-null   float64
 4   Average Cost for two  9551 non-null   float64
 5   Price range           9551 non-null   float64
 6   Aggregate rating      9551 non-null   float64
 7   Votes                 9551 non-null   float64
 8   Restaurant Name       9551 non-null   int32  
 9   City                  9551 non-null   int32  
 10  Address               9551 non-null   int32  
 11  Locality              9551 non-null   int32  
 12  Locality Verbose      9551 non-null   int32  
 13  Cuisines              9551 non-null   int32  
 14  Currency              955

In [20]:
# Save the preprocessed dataset
df_filled.to_csv('preprocessed_dataset.csv', index=False)
print("\nPreprocessed dataset saved as 'preprocessed_dataset.csv'")


Preprocessed dataset saved as 'preprocessed_dataset.csv'


In [21]:
# Step 2: Determine criteria for restaurant recommendations
# Example criteria: cuisine preference and price range
user_preferences = {'Cuisines': 'Italian', 'Price range': 2.0}  # Adjust based on your dataset


In [22]:
# Step 3: Implement content-based filtering
# Combine relevant columns into a single text column for TF-IDF vectorization
df['combined_features'] = df['Cuisines'].astype(str) + ' ' + df['Price range'].astype(str)

In [23]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

In [24]:
# Compute cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [26]:
# Get restaurant recommendations based on user preferences
indices = df[df['Cuisines'] == user_preferences['Cuisines']].index
indices = indices[indices.isin(df[df['Price range'] == user_preferences['Price range']].index)]
similarity_scores = list(enumerate(cosine_sim[indices[0]]))
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:6]  # Top 5 recommendations

recommended_restaurants = [df['Restaurant Name'][i[0]] for i in similarity_scores]

# Display recommended restaurants
print(f"\nUser Preferences: {user_preferences}")
print("Recommended Restaurants:")
for i, restaurant in enumerate(recommended_restaurants, 1):
    print(f"{i}. {restaurant}")


User Preferences: {'Cuisines': 'Italian', 'Price range': 2.0}
Recommended Restaurants:
1. Gero
2. D.O.C Ristorante
3. Terra�_o It��lia
4. La Dolce Vita Ristorante
5. Tony's Italian Restaurant & Pizza
