In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv('Dataset_Cleaned.csv')

# Inspect
print(df.head())

# Example cleanup (adjust to your actual columns)
df['Amenities'] = df['Amenities'].str.lower()
df['Location'] = df['Location'].str.lower()
df.dropna(inplace=True)


  Boarding_House_ID       Location Room_Type  Price (LKR) Availability  \
0               BD1            NaN    Single        11337    Available   
1               BD2      Colombo 3    Shared        10263    Available   
2               BD3   CINEC campus    Shared        11764    Available   
3               BD4           NSBM    Double        13806     Occupied   
4               BD5          SLIIT    Single        10548     Occupied   

                            Amenities  Ratings  Proximity_to_Landmark  \
0   Air Conditioning, Study/Workspace      2.3                    1.8   
1                                 NaN      4.0                    2.2   
2                                 NaN      3.5                    2.6   
3                                 NaN      3.2                    4.3   
4                                 NaN      2.8                    3.8   

                                               Query  
0  Find a boarding house with Air Conditioning an...  
1     

In [4]:
# Create a list of searchable amenities, locations, etc.
all_amenities = set()
for amens in df['Amenities']:
    all_amenities.update([a.strip() for a in amens.split(',')])

print(all_amenities)  # Can be used to match tokens from user input


{'', 'air conditioning', 'pet-friendly', 'wi-fi', 'attach bathroom.', 'pool', 'laundary', 'gym', 'kitchen', 'air condition', 'parking', 'wifi', 'study/workspace', 'laundry', 'attach bathroom'}


In [6]:
# Initialize sets
all_amenities = set()
all_locations = set()

# Extract all amenities
for amens in df['Amenities']:
    all_amenities.update([a.strip().lower() for a in amens.split(',')])

# Extract all locations
for loc in df['Location']:
    all_locations.add(loc.strip().lower())

# Combine into one set (optional, for global search matching)
all_searchable_terms = all_amenities.union(all_locations)

# Print results
print("All Amenities:", all_amenities)
print("All Locations:", all_locations)
print("All Searchable Terms:", all_searchable_terms)


All Amenities: {'', 'air conditioning', 'pet-friendly', 'wi-fi', 'attach bathroom.', 'pool', 'laundary', 'gym', 'kitchen', 'air condition', 'parking', 'wifi', 'study/workspace', 'laundry', 'attach bathroom'}
All Locations: {'thimbirigasyaya', 'wellawatte', 'nugegoda', 'collombo 2', 'colombo 1', 'angoda', 'biyagama', 'horizon campus', 'pelawatte', 'kaduwela', 'colombo 10', 'colombo 9', 'thalangama', 'colombo 2', 'rajagiriya', 'colombo 5', 'kohuwala', 'piliyandala', 'maharagama', 'colombo 8', 'colombo 4', 'pannipitiya', 'narahenpita', 'kotte', 'kesbewa', 'moratuwa', 'pittugala', 'homagama', 'colombo 3', 'kiribathgoda', 'ethul kotte', 'thalawathugoda', 'wattala', 'dehiwala', 'mount lavinia', 'kirulapone', 'colombo 6', 'borella'}
All Searchable Terms: {'', 'thimbirigasyaya', 'air conditioning', 'attach bathroom.', 'laundary', 'wellawatte', 'nugegoda', 'collombo 2', 'colombo 1', 'attach bathroom', 'angoda', 'biyagama', 'horizon campus', 'pelawatte', 'laundry', 'kaduwela', 'colombo 10', 'bor

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Example query
query = "Boarding House Malabe with Wifi under 20000"

# Run NLP
doc = nlp(query)

# Extract named entities
for ent in doc.ents:
    print(ent.text, ent.label_)

# Simple custom matcher (you can expand this)
matched_amenities = [a for a in all_amenities if a in query.lower()]
print("Matched amenities:", matched_amenities)


Boarding House Malabe ORG
Wifi PERSON
Matched amenities: ['', 'wifi']


In [15]:
def parse_query(query):
    filters = {
        "location": None,
        "amenities": [],
        "price_max": None,
        "Boarding_House_ID": None
    }

    doc = nlp(query.lower())

    # Match amenities
    filters["amenities"] = [a for a in all_amenities if a in query.lower()]

    # Simple location detection (you can use spaCy NER)
    for token in doc:
        if token.ent_type_ == "GPE":
            filters["location"] = token.text

    # Price extraction
    for token in doc:
        if token.like_num and "under" in query.lower():
            filters["price_max"] = int(token.text)

    # Optional name match
    for word in query.split():
        if word.lower() in df['Boarding_House_ID'].str.lower().values:
            filters["Boarding_House_ID"] = word.lower()

    return filters

print(parse_query("Boarding House with Wifi and Kitchen under 20000"))


{'location': None, 'amenities': ['', 'kitchen', 'wifi'], 'price_max': 20000, 'Boarding_House_ID': None}
