In [46]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [47]:

# Step 1: Load and print the head
df = pd.read_csv("google_hotel_data_clean_v2.csv")
print("Step 1: Dataset head:\n", df.head())


Step 1: Dataset head:
                 Hotel_Name  Hotel_Rating   City     Feature_1       Feature_2  \
0       Crowne Plaza Kochi           4.6  kochi  5-star hotel  Free breakfast   
1     Trident Hotel Cochin           4.5  kochi  5-star hotel  Free breakfast   
2        The Galaxy Suites           3.8  kochi     Apartment       Sleeps 10   
3         The Renai cochin           4.2  kochi  4-star hotel  Free breakfast   
4  Ramada by Wyndham Kochi           4.5  kochi  5-star hotel       Breakfast   

      Feature_3     Feature_4            Feature_5           Feature_6  \
0    Free Wi-Fi  Free parking                 Pool             Hot tub   
1         Wi-Fi  Free parking                 Pool    Air conditioning   
2  Free parking    Free Wi-Fi  No air conditioning  No airport shuttle   
3    Free Wi-Fi  Free parking                 Pool    Air conditioning   
4    Free Wi-Fi  Free parking                 Pool    Air conditioning   

          Feature_7       Feature_8     Featu

In [48]:

#Get user input for city and filter
user_city = input("Enter the city for recommendations: ").strip().lower()
df['City_lower'] = df['City'].str.lower()
df = df[df['City_lower'] == user_city]
if df.empty:
    print(f"No hotels found in city: {user_city}")
    exit()
df = df.drop(columns='City_lower')


In [49]:

# Step 2: Keep only the second occurrence of each hotel
df['row_number'] = df.groupby('Hotel_Name').cumcount()
df = df[df['row_number'] == 1].drop(columns='row_number')


In [50]:

# Step 3: Combine Feature_1 through Feature_9
feature_cols = [f'Feature_{i}' for i in range(1, 10)]
df['all_features'] = df[feature_cols].values.tolist()
df['all_features'] = df['all_features'].apply(lambda x: [i for i in x if pd.notna(i)])


In [51]:

# Step 4: Count feature frequencies
from collections import Counter
flat_features = [item for sublist in df['all_features'] for item in sublist]
feature_freq = Counter(flat_features)
sorted_feature_freq = dict(sorted(feature_freq.items(), key=lambda x: x[1], reverse=True))
print("\nStep 4: Feature frequencies sorted:\n", sorted_feature_freq)



Step 4: Feature frequencies sorted:
 {'5-star hotel': 1, 'Free breakfast': 1, 'Wi-Fi': 1, 'Free parking': 1, 'Pool': 1, 'Hot tub': 1, 'Air conditioning': 1, 'Fitness center': 1, 'Spa': 1}


In [52]:

# Step 5: Top 10 most repeated features
top_10_features = list(sorted_feature_freq.keys())[:10]
print("\nStep 5: Top 10 features:\n", top_10_features)



Step 5: Top 10 features:
 ['5-star hotel', 'Free breakfast', 'Wi-Fi', 'Free parking', 'Pool', 'Hot tub', 'Air conditioning', 'Fitness center', 'Spa']


In [53]:

# Step 6: Map feature names to readable format
feature_mapping = {
    top_10_features[0]: 'Free Breakfast',
    top_10_features[1]: 'Free WiFi',
    top_10_features[2]: 'Parking',
    top_10_features[3]: 'Pool',
    top_10_features[4]: 'Gym',
    top_10_features[5]: 'Restaurant',
    top_10_features[6]: 'Pet Friendly',
    top_10_features[7]: 'Spa',
    #top_10_features[8]: 'Bar',
    top_10_features[8]: 'Air Conditioning'
}
print("\nStep 6: Feature mapping:\n", feature_mapping)



Step 6: Feature mapping:
 {'5-star hotel': 'Free Breakfast', 'Free breakfast': 'Free WiFi', 'Wi-Fi': 'Parking', 'Free parking': 'Pool', 'Pool': 'Gym', 'Hot tub': 'Restaurant', 'Air conditioning': 'Pet Friendly', 'Fitness center': 'Spa', 'Spa': 'Air Conditioning'}


In [54]:

# Step 7: Binary encode presence of each top feature
for raw_feature, new_name in feature_mapping.items():
    df[new_name] = df['all_features'].apply(lambda x: 1 if raw_feature in x else 0)


In [55]:

# Step 8: Create list of selected features
selected_features = list(feature_mapping.values())
print("\nStep 8: Selected features:\n", selected_features)



Step 8: Selected features:
 ['Free Breakfast', 'Free WiFi', 'Parking', 'Pool', 'Gym', 'Restaurant', 'Pet Friendly', 'Spa', 'Air Conditioning']


In [56]:

# Step 9: Create final DataFrame
df['Hotel_Rating'] = pd.to_numeric(df['Hotel_Rating'], errors='coerce')
df['Hotel_Price'] = pd.to_numeric(df['Hotel_Price'], errors='coerce')
df_rec = df[['Hotel_Name', 'Hotel_Rating', 'Hotel_Price'] + selected_features].dropna()
print("\nStep 9: Final DataFrame:\n", df_rec.head())



Step 9: Final DataFrame:
                           Hotel_Name  Hotel_Rating  Hotel_Price  \
161  The LaLiT Golf & Spa Resort Goa           4.4      11855.0   

     Free Breakfast  Free WiFi  Parking  Pool  Gym  Restaurant  Pet Friendly  \
161               1          1        1     1    1           1             1   

     Spa  Air Conditioning  
161    1                 1  


In [57]:

# Step 10: User feature preferences (update this as needed)
user_input = [0, 1, 0, 1, 0, 0, 0, 1, 1]  # Matches selected_features order
similarity_scores = cosine_similarity([user_input], df_rec[selected_features])[0]


In [58]:

# Step 11: Add similarity scores
df_rec['similarity'] = similarity_scores


In [59]:

# Step 12: Sort by similarity
df_sorted = df_rec.sort_values(by='similarity', ascending=False)


In [60]:

# Step 13: Show top recommendations
print(f"\nStep 13: Top Recommended Hotels in {user_city.title()}:\n", df_sorted[['Hotel_Name', 'Hotel_Rating', 'Hotel_Price', 'similarity']].head(5))



Step 13: Top Recommended Hotels in Goa:
                           Hotel_Name  Hotel_Rating  Hotel_Price  similarity
161  The LaLiT Golf & Spa Resort Goa           4.4      11855.0    0.666667
