In [2]:
import pandas as pd

# Read the raw hotel listing into a DataFrame
df = pd.read_csv("Ethiopian_hotels.csv")

# List the available column names before any processing
print("Columns:", list(df.columns))




Columns: ['hotel_id', 'name', 'city', 'region', 'star_rating', 'review_score', 'price_per_night_etb', '24h Front Desk', 'ATM on site', 'Accessible Rooms', 'Air Conditioning', 'Airport Shuttle', 'Bar', 'Breakfast Included', 'Business Center', 'Concierge', 'Conference Room', 'Electric Kettle', 'Family Rooms', 'Fitness Center', 'Free Parking', 'Free WiFi', 'Laundry Service', 'Mini Fridge', 'Non-smoking Rooms', 'Pet Friendly', 'Restaurant', 'Room Service', 'Spa', 'Swimming Pool', 'TV']


In [3]:
# Build a single comma-separated 'matched_amenities' string per hotel from the
# 0/1 amenity flag columns, then persist the result for the recommender cells.

# Columns that describe the hotel itself. Everything else in the dataset is an
# amenity flag. These must match the real column names printed above; otherwise
# numeric metadata such as 'star_rating' is mistaken for a flag and a 1-star
# hotel ends up with "star_rating" listed as an amenity.
NON_AMENITY_COLUMNS = [
    'hotel_id', 'name', 'city', 'region',
    'star_rating', 'review_score', 'price_per_night_etb',
]

if 'amenities' not in df.columns:
    # Skip 'matched_amenities' as well so that re-running this cell is idempotent.
    amenity_columns = [
        col for col in df.columns
        if col not in NON_AMENITY_COLUMNS and col != 'matched_amenities'
    ]
    # Keep only the amenities marked as 1 / True, in column order.
    amenity_flags = df[amenity_columns].eq(1)
    df['matched_amenities'] = amenity_flags.apply(
        lambda row: ", ".join(col for col, present in row.items() if present),
        axis=1,
    )
else:
    # The dataset already has a single 'amenities' column: just rename it.
    df = df.rename(columns={'amenities': 'matched_amenities'})

# Save processed dataset for use in recommendation system
df.to_csv("processed_hotels.csv", index=False)



In [4]:
# Preview the first rows to confirm the 'matched_amenities' column was added.
df.head()

Unnamed: 0,hotel_id,name,city,region,star_rating,review_score,price_per_night_etb,24h Front Desk,ATM on site,Accessible Rooms,...,Laundry Service,Mini Fridge,Non-smoking Rooms,Pet Friendly,Restaurant,Room Service,Spa,Swimming Pool,TV,matched_amenities
0,ETH200000,Meskel Suites Court,Hosaena,SNNPR,4,4.3,6919,0,1,1,...,0,0,0,0,1,0,0,0,0,"ATM on site, Accessible Rooms, Bar, Breakfast ..."
1,ETH200001,Green Lodge Residence,Lalibela,Amhara,3,4.1,4456,1,0,0,...,0,1,0,1,0,0,1,0,1,"24h Front Desk, Air Conditioning, Airport Shut..."
2,ETH200002,Ras Lodge Village,Adama,Oromia,4,4.2,9116,0,1,0,...,1,0,1,1,0,1,0,1,0,"ATM on site, Family Rooms, Laundry Service, No..."
3,ETH200003,Ethio Boutique House,Yirgalem,SNNPR,3,4.2,5598,0,0,1,...,1,0,0,1,1,0,1,1,0,"Accessible Rooms, Bar, Free Parking, Laundry S..."
4,ETH200004,Walia Lodge Gardens,Shashamane,Oromia,3,3.5,4728,0,0,0,...,0,0,1,1,1,0,1,0,1,"Breakfast Included, Concierge, Electric Kettle..."


In [7]:


# 1. Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 2. Load processed dataset
df = pd.read_csv("processed_hotels.csv")
print("Dataset loaded. Total hotels:", len(df))

# Content-based similarity on amenities.
# A hotel with no amenities is written to CSV as an empty field and read back
# as NaN, which TfidfVectorizer rejects, so restore the empty string first.
# NOTE(review): the default tokenizer splits multi-word amenity names
# (e.g. "Free Parking") into separate words; confirm that is intended.
df['matched_amenities'] = df['matched_amenities'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['matched_amenities'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Review-score proximity: min-max normalise the scores to [0, 1], then take
# 1 - |difference| so hotels with close review scores get a similarity near 1.
score_min = df['review_score'].min()
score_range = df['review_score'].max() - score_min
if score_range > 0:
    df['review_score_norm'] = (df['review_score'] - score_min) / score_range
else:
    # All scores identical (or missing): avoid 0/0 and treat every hotel as equal.
    df['review_score_norm'] = 0.0
collab_sim = 1 - np.abs(df['review_score_norm'].values[:, None] - df['review_score_norm'].values[None, :])

# Hybrid similarity: 60% content + 40% review
hybrid_sim = 0.6 * cosine_sim + 0.4 * collab_sim


def recommend_hotels_partial(
    hotel_name,
    df=None,
    sim_matrix=None,
    top_n=5,
    min_star=0,
    max_price=np.inf,
    city=None,
    region=None
):
    """Recommend hotels similar to the first hotel whose name contains `hotel_name`.

    NOTE: a later cell defines a function with the same name (adding an
    amenities filter); once that cell runs it shadows this definition.

    Parameters
    ----------
    hotel_name : str
        Literal, case-insensitive substring of the hotel name to look up.
    df : pandas.DataFrame, optional
        Hotel table with 'name', 'city', 'region', 'star_rating' and
        'price_per_night_etb' columns. Defaults to the notebook-level `df`,
        looked up at call time.
    sim_matrix : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        `df`. Defaults to the notebook-level `hybrid_sim`, looked up at call time.
    top_n : int
        Maximum number of recommendations to return.
    min_star : number
        Minimum star rating (inclusive).
    max_price : number
        Maximum nightly price in ETB (inclusive).
    city, region : str, optional
        Case-insensitive exact filters; None or '' disables the filter.

    Returns
    -------
    pandas.DataFrame or str
        The recommendations ordered by decreasing similarity, or a message
        string when no hotel matches the name or nothing passes the filters.
    """
    # Resolve defaults at call time so re-running the data cell is picked up
    # without having to re-run this definition.
    if df is None:
        df = globals()["df"]
    if sim_matrix is None:
        sim_matrix = globals()["hybrid_sim"]

    hotel_name = hotel_name.strip()

    # Literal (non-regex) match so names containing '(' or '+' do not raise;
    # rows with a missing name never match.
    name_hits = df['name'].str.contains(hotel_name, case=False, regex=False, na=False)
    hit_positions = np.flatnonzero(name_hits.to_numpy())
    if hit_positions.size == 0:
        return f"No hotel found matching '{hotel_name}'"

    # Use the first match. Work with row positions throughout because the
    # similarity matrix is positional, whatever index labels `df` carries.
    query_pos = int(hit_positions[0])

    # Rank all hotels by decreasing similarity (stable, so ties keep row order)
    # and drop the query hotel explicitly: with tied scores it is not
    # guaranteed to be ranked first.
    scores = np.asarray(sim_matrix[query_pos], dtype=float)
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != query_pos]
    candidates = df.iloc[ranked]

    keep = (
        (candidates['star_rating'] >= min_star)
        & (candidates['price_per_night_etb'] <= max_price)
    ).to_numpy()
    if city:
        keep &= (candidates['city'].str.lower() == city.lower()).to_numpy()
    if region:
        keep &= (candidates['region'].str.lower() == region.lower()).to_numpy()

    recommendations = candidates[keep].head(top_n)
    if recommendations.empty:
        return "No hotels found matching the filters."

    return recommendations[['name', 'city', 'region', 'star_rating', 'price_per_night_etb']]

# ---------------------------
# Example query
# ---------------------------

# Search inputs: edit these values to try a different query.
hotel_to_search = "Axum"   # partial hotel name to look up
preferred_city = None      # None disables the city filter
preferred_region = None    # None disables the region filter
min_star_rating = 3        # lowest acceptable star rating
max_price_budget = 5000    # highest acceptable nightly price (ETB)

# Run the partial-name recommender with the filters above.
recommendations = recommend_hotels_partial(
    hotel_name=hotel_to_search,
    min_star=min_star_rating,
    max_price=max_price_budget,
    city=preferred_city,
    region=preferred_region,
    top_n=5,
)

# Header line followed by the recommendation table (or the message string).
print(f"\nTop hotel recommendations for '{hotel_to_search}':\n", recommendations, sep="\n")



Dataset loaded. Total hotels: 3500

Top hotel recommendations for 'Axum':

                          name          city             region  star_rating  \
700     Skylight Suites Deluxe    Arba Minch              SNNPR            3   
1764         Bora Resort Oasis        Assosa  Benishangul-Gumuz            3   
1599    Koka Guest House Court  Mizan Teferi              SNNPR            3   
636         Ayat Boutique View  Debre Birhan             Amhara            3   
2315  Blue Nile Resort Retreat       Hawassa             Sidama            3   

      price_per_night_etb  
700                  4345  
1764                 3645  
1599                 4908  
636                  3525  
2315                 3018  


In [None]:
# Keep only the rows whose city is Addis Ababa, ignoring letter case
addis_hotels = df.loc[df['city'].str.lower().eq('addis ababa')]

# Report how many were found, then display the key columns
print(f"Total hotels in Addis Ababa: {addis_hotels.shape[0]}")
addis_hotels[['name', 'star_rating', 'review_score', 'price_per_night_etb', 'region']]

Total hotels in Addis Ababa: 119


Unnamed: 0,name,star_rating,review_score,price_per_night_etb,region
34,Ayat Guest House Gardens,2,3.2,2694,Addis Ababa
54,Ras Boutique Residence,4,4.7,8198,Addis Ababa
82,Ayat Inn Deluxe,5,4.5,12379,Addis Ababa
115,Trinity Residences Center,3,4.1,5243,Addis Ababa
126,Axum Lodge Oasis,2,3.1,2391,Addis Ababa
...,...,...,...,...,...
3322,Cactus Resort Court,2,3.6,1998,Addis Ababa
3379,Nile View Suites Retreat,4,4.5,6051,Addis Ababa
3415,Anbessa Resort International,3,4.0,4270,Addis Ababa
3431,Simien Hotel Oasis,3,3.6,4301,Addis Ababa


In [None]:
# The user can choose which amenities the recommended hotels must have

In [8]:
def recommend_hotels_partial(
    hotel_name,
    df=None,
    sim_matrix=None,
    top_n=5,
    min_star=0,
    max_price=np.inf,
    city=None,
    region=None,
    required_amenities=None
):
    """Recommend hotels similar to the first hotel whose name contains `hotel_name`.

    Hybrid hotel recommendation with:
    - Partial hotel name matching
    - Filters: min_star, max_price, city, region
    - Amenities filter (list of strings)

    Parameters
    ----------
    hotel_name : str
        Literal, case-insensitive substring of the hotel name to look up.
    df : pandas.DataFrame, optional
        Hotel table with 'name', 'city', 'region', 'star_rating',
        'price_per_night_etb' and 'matched_amenities' columns. Defaults to
        the notebook-level `df`, looked up at call time.
    sim_matrix : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        `df`. Defaults to the notebook-level `hybrid_sim`, looked up at call time.
    top_n : int
        Maximum number of recommendations to return.
    min_star : number
        Minimum star rating (inclusive).
    max_price : number
        Maximum nightly price in ETB (inclusive).
    city, region : str, optional
        Case-insensitive exact filters; None or '' disables the filter.
    required_amenities : list of str or str, optional
        Every entry must occur (case-insensitive substring, so "Pool" matches
        "Swimming Pool") in the hotel's 'matched_amenities' text. A single
        string is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame or str
        The recommendations ordered by decreasing similarity, or a message
        string when no hotel matches the name or nothing passes the filters.
    """
    # Resolve defaults at call time so re-running the data cell is picked up
    # without having to re-run this definition.
    if df is None:
        df = globals()["df"]
    if sim_matrix is None:
        sim_matrix = globals()["hybrid_sim"]

    hotel_name = hotel_name.strip()  # remove extra spaces

    # Literal (non-regex) match so names containing '(' or '+' do not raise;
    # rows with a missing name never match.
    name_hits = df['name'].str.contains(hotel_name, case=False, regex=False, na=False)
    hit_positions = np.flatnonzero(name_hits.to_numpy())
    if hit_positions.size == 0:
        return f"No hotel found matching '{hotel_name}'"

    # Use the first match. Work with row positions throughout because the
    # similarity matrix is positional, whatever index labels `df` carries.
    query_pos = int(hit_positions[0])

    # Rank all hotels by decreasing similarity (stable, so ties keep row order)
    # and drop the query hotel explicitly: with tied scores it is not
    # guaranteed to be ranked first.
    scores = np.asarray(sim_matrix[query_pos], dtype=float)
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != query_pos]
    candidates = df.iloc[ranked]

    # Star rating and price filters
    keep = (
        (candidates['star_rating'] >= min_star)
        & (candidates['price_per_night_etb'] <= max_price)
    ).to_numpy()

    # City / region filters
    if city:
        keep &= (candidates['city'].str.lower() == city.lower()).to_numpy()
    if region:
        keep &= (candidates['region'].str.lower() == region.lower()).to_numpy()

    # Amenities filter
    if required_amenities:
        if isinstance(required_amenities, str):
            # Avoid iterating a bare string character by character.
            required_amenities = [required_amenities]
        # A hotel without amenities comes back from CSV as NaN; treat it as ''.
        amenity_text = candidates['matched_amenities'].fillna('').astype(str).str.lower()
        for wanted in required_amenities:
            keep &= amenity_text.str.contains(wanted.lower(), regex=False).to_numpy()

    recommendations = candidates[keep].head(top_n)
    if recommendations.empty:
        return "No hotels found matching the filters."

    return recommendations[
        ['name', 'city', 'region', 'star_rating', 'price_per_night_etb', 'matched_amenities']
    ]


In [10]:
# Search inputs: edit these values to try a different query.
hotel_to_search = "Axum"
amenities_filter = ["Pool", "Breakfast", "Free Parking"]  # every entry is required
preferred_city = None
preferred_region = None
min_star_rating = 3
max_price_budget = 8000

# Run the recommender with the star, price, location and amenity filters above.
recommendations = recommend_hotels_partial(
    hotel_name=hotel_to_search,
    min_star=min_star_rating,
    max_price=max_price_budget,
    city=preferred_city,
    region=preferred_region,
    required_amenities=amenities_filter,
    top_n=5,
)

# Header line followed by the recommendation table (or the message string).
print(
    f"\nTop hotel recommendations for '{hotel_to_search}' with amenities {amenities_filter}:\n",
    recommendations,
    sep="\n",
)



Top hotel recommendations for 'Axum' with amenities ['Pool', 'Breakfast', 'Free Parking']:

                             name          city       region  star_rating  \
272       Skylight Suites Retreat   Addis Ababa  Addis Ababa            3   
1912  Afar Oasis Boutique Retreat          Axum       Tigray            4   
1291   Emerald Guest House Center      Bishoftu       Oromia            3   
209          Trinity Suites Haven  Debre Birhan       Amhara            3   
1802          Ethio Suites Deluxe  Debre Markos       Amhara            4   

      price_per_night_etb                                  matched_amenities  
272                  6407  24h Front Desk, Accessible Rooms, Airport Shut...  
1912                 6009  24h Front Desk, Breakfast Included, Concierge,...  
1291                 5398  24h Front Desk, Accessible Rooms, Bar, Breakfa...  
209                  5642  Breakfast Included, Fitness Center, Free Parki...  
1802                 7152  24h Front Desk, ATM on