In [1]:
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('apartment.csv')

df.shape

(1415, 11)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1415 entries, 0 to 1414
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   SocietyName     971 non-null    object
 1   BHK             1415 non-null   object
 2   Furnishing      1415 non-null   object
 3   BuiltUpArea     1415 non-null   object
 4   Locality        1415 non-null   object
 5   NearbyPlace_1   1413 non-null   object
 6   DistanceAway_1  1413 non-null   object
 7   NearbyPlace_2   1413 non-null   object
 8   DistanceAway_2  1413 non-null   object
 9   Description     1415 non-null   object
 10  Highlights      593 non-null    object
dtypes: object(11)
memory usage: 121.7+ KB


In [4]:
df.duplicated().sum()

23

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.shape

(1392, 11)

In [7]:
df.isna().sum()

SocietyName       440
BHK                 0
Furnishing          0
BuiltUpArea         0
Locality            0
NearbyPlace_1       2
DistanceAway_1      2
NearbyPlace_2       2
DistanceAway_2      2
Description         0
Highlights        817
dtype: int64

In [8]:
df = df[~(df['SocietyName'].isna())]

df.reset_index(drop=True, inplace=True)

In [9]:
df.shape

(952, 11)

In [10]:
df.drop_duplicates(subset=['SocietyName'], inplace=True)

In [11]:
df.shape

df.reset_index(drop=True, inplace=True)

In [12]:
df.isna().sum()

SocietyName         0
BHK                 0
Furnishing          0
BuiltUpArea         0
Locality            0
NearbyPlace_1       1
DistanceAway_1      1
NearbyPlace_2       1
DistanceAway_2      1
Description         0
Highlights        207
dtype: int64

In [13]:
df.shape

(391, 11)

In [14]:
df.sample(10)

Unnamed: 0,SocietyName,BHK,Furnishing,BuiltUpArea,Locality,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description,Highlights
37,Anand Ilyf,"3 BHK Flat for rent in Vaishno Devi Circle, Ah...",Fully Furnished,1800 sq.ft,"3 BHK Flat for rent in Vaishno Devi Circle, Ah...",H3 World School,is 1.6 km away,Ashirvad Hospital - Best Gynecologist | Orthop...,is 3.3 km away,Anand eyelife it's good apartment it's fully f...,Property HighlightsAerobics RoomAmphitheatre24...
19,Shatrunjay Apartment,"3 BHK Flat for rent in Ramdev Nagar, Ahmedabad",Fully Furnished,1700 sq.ft,"3 BHK Flat for rent in Ramdev Nagar, Ahmedabad",Anand Niketan Group of Schools,is 0.5 km away,Shalby Multi-Specialty Hospitals,is 0.9 km away,Your search for a spacious home at affordable ...,Property HighlightsChildren Play AreaIndoor Ga...
311,Safal Parisar II,"3 BHK Flat for rent in South Bopal, Ahmedabad",Fully Furnished,1905 sq.ft,"3 BHK Flat for rent in South Bopal, Ahmedabad",eduMETA THE i-SCHOOL South Bopal Ahmedabad Guj...,is 0.1 km away,Sannidhya Multi Speciality Hospital-Gynecologi...,is 0.2 km away,This property is good and good locality 3bhk f...,
103,Orchid Heaven,"3 BHK Flat for rent in Shela, Ahmedabad",Semi Furnished,1985 sq.ft,"3 BHK Flat for rent in Shela, Ahmedabad",Lakshaya International School,is 3.3 km away,Suryam Children Hospital South Bopal,is 2.7 km away,A residential flat available for rent in the p...,Property Highlights24 Hours Concierge24x7 Secu...
17,Shripad Residency,"3 BHK Flat for rent in Gota, Ahmedabad",Fully Furnished,1800 sq.ft,"3 BHK Flat for rent in Gota, Ahmedabad",SMT K N Patel International School,is 1 km away,Mansi Women's Hospital - Best Gynec Hospital -...,is 1 km away,Sripad residency it's good apartment in gota n...,Property HighlightsATMAnti Skid TitlesAmphithe...
254,Safal Seventy,"4 BHK Flat for rent in Iscon Ambli Road, Ahmed...",Fully Furnished,5400 sq.ft,"4 BHK Flat for rent in Iscon Ambli Road, Ahmed...",Anand Niketan Group of Schools,is 0.1 km away,Epic Hospital,is 1.9 km away,This Apartment can be a comfortable and afford...,
158,Sun Rising Homes,"2 BHK Flat for rent in Gota, Ahmedabad",Semi Furnished,800 sq.ft,"2 BHK Flat for rent in Gota, Ahmedabad",SMT K N Patel International School,is 1.7 km away,Mansi Women's Hospital - Best Gynec Hospital -...,is 1.1 km away,Are you looking for an affordable Apartment fo...,Property Highlights24x7 SecurityATMChildren Pl...
148,Devkinandan Parmeshwar 4,"2 BHK Flat for rent in Chandkheda, Ahmedabad",Fully Furnished,1200 sq.ft,"2 BHK Flat for rent in Chandkheda, Ahmedabad",Global Indian International School In Ahmedaba...,is 0.9 km away,Jenshiv Hospital - Physician - Diabetologist i...,is 2.1 km away,Check out this Apartment available for rent in...,Property Highlights24x7 SecurityATMChildren Pl...
213,Sheladia Sarva,"3 BHK Flat for rent in Shela, Ahmedabad",Semi Furnished,1500 sq.ft,"3 BHK Flat for rent in Shela, Ahmedabad",LDR International School - South Bopal Branch,is 1.6 km away,Aalpa's children's hospital,is 2.1 km away,This Apartment can be a comfortable and afford...,
183,Malabar Exotica,"3 BHK Flat for rent in Tragad, Ahmedabad",Unfurnished,1875 sq.ft,"3 BHK Flat for rent in Tragad, Ahmedabad",H3 World School,is 0.5 km away,Ashirvad Hospital - Best Gynecologist | Orthop...,is 4.2 km away,"3.5bhk flat on rent in malabar exotica, new fl...",


In [15]:
df.drop(columns=['Highlights','Locality'], inplace=True)

In [16]:
df.isna().sum()

SocietyName       0
BHK               0
Furnishing        0
BuiltUpArea       0
NearbyPlace_1     1
DistanceAway_1    1
NearbyPlace_2     1
DistanceAway_2    1
Description       0
dtype: int64

In [17]:
df.dropna(inplace=True)

In [18]:
df.sample(1)

Unnamed: 0,SocietyName,BHK,Furnishing,BuiltUpArea,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description
94,Sparsh Residency,"3 BHK Flat for rent in Science City, Ahmedabad",Semi Furnished,1800 sq.ft,S S Divine School,is 2.5 km away,Vasundhara Children Hospital . Breastfeeding C...,is 3.3 km away,Here is an excellent 3 BHK Apartment available...


# Recommendation using Nearby Location

In [19]:
df['Bhk'] = df['BHK'].str.split(" ").str.get(0)
df['Description'] = df['Description'].str.split('.').apply(lambda x: '.'.join(x[:-4]) if len(x) > 4 else '')

df.drop('BHK', axis=1, inplace=True)

In [20]:
df.head(1)

Unnamed: 0,SocietyName,Furnishing,BuiltUpArea,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description,Bhk
0,Ganesh Malabar Retreat,Fully Furnished,1200 sq.ft,Nirma Vidyavihar - Chharodi,is 0.5 km away,SGVP Holistic Hospital,is 2.2 km away,Malabar retreat it's totally new apartment nea...,2


In [21]:
df['NearbyPlaces'] = df.apply(lambda row: {
    row['NearbyPlace_1']: row['DistanceAway_1'],
    row['NearbyPlace_2']: row['DistanceAway_2']
}, axis=1)

df.drop(columns=['NearbyPlace_1', 'DistanceAway_1', 'NearbyPlace_2', 'DistanceAway_2'], inplace=True)

df['NearbyPlaces'] = df['NearbyPlaces'].apply(
    lambda places: str(places) if isinstance(places, dict) else ''
)

In [22]:
df['NearbyPlaces'][0]

"{'Nirma Vidyavihar - Chharodi': 'is 0.5 km away', 'SGVP Holistic Hospital': 'is 2.2 km away'}"

In [23]:
places_df = df[['SocietyName', 'NearbyPlaces']]

In [24]:
import ast

def distance_to_meters(distance_str):
    try:
        if 'km' in distance_str:
            return float(distance_str.split()[1]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

location_matrix = {}
for index, row in places_df.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['NearbyPlaces']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances

# Convert the dictionary to a dataframe
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

# Display the first few rows
location_df.head()

Unnamed: 0,Nirma Vidyavihar - Chharodi,SGVP Holistic Hospital,Puna International School,Ashirvad Hospital - Best Gynecologist | Orthopaedic | Psychiatrist | Neurologist | Pediatrician Doctor Hospital,Taksh Education,Balaji Horizon Women's Hospital,Lakshaya International School,Suryam Children Hospital South Bopal,BHIMANI Children's Hospital & Vaccination Centre,"Sumati Vidhya Vihar, Ghatlodiya,",...,Sai Women's Care Hospital,Adani Vidya Mandir,"ZAHRA MEDIPLUS HOSPITAL (Multispecialty) - || Best Hospital, ICU And Emergency, Multispeciality Hospital",Gurukrupa Children Hospital,The Lighthouse Montessori School,Zebar School for Children,Global Longlife Hospital & Research Limited,"Doon School of Excellence, New Maninagar",Jivan Gastro And Gynec Hospital | Liver Hospital | Maternity Hospital | Gastroenterologist | Pregnancy Care in vastral,Shree Sahajanand Gurukul - Koteshwar
0,500.0,2200.0,,,,,,,,,...,,,,,,,,,,
11,1600.0,,,1900.0,,,,,,,...,,,,,,,,,,
70,400.0,,,,,,,,,,...,,,,,,,,,,
80,600.0,,,,,,,,,,...,,,,,,,,,,
102,400.0,,,,,,,,,,...,,,,,,,,,,


In [25]:
location_df.index = df.SocietyName

In [26]:
location_df.head()

Unnamed: 0_level_0,Nirma Vidyavihar - Chharodi,SGVP Holistic Hospital,Puna International School,Ashirvad Hospital - Best Gynecologist | Orthopaedic | Psychiatrist | Neurologist | Pediatrician Doctor Hospital,Taksh Education,Balaji Horizon Women's Hospital,Lakshaya International School,Suryam Children Hospital South Bopal,BHIMANI Children's Hospital & Vaccination Centre,"Sumati Vidhya Vihar, Ghatlodiya,",...,Sai Women's Care Hospital,Adani Vidya Mandir,"ZAHRA MEDIPLUS HOSPITAL (Multispecialty) - || Best Hospital, ICU And Emergency, Multispeciality Hospital",Gurukrupa Children Hospital,The Lighthouse Montessori School,Zebar School for Children,Global Longlife Hospital & Research Limited,"Doon School of Excellence, New Maninagar",Jivan Gastro And Gynec Hospital | Liver Hospital | Maternity Hospital | Gastroenterologist | Pregnancy Care in vastral,Shree Sahajanand Gurukul - Koteshwar
SocietyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ganesh Malabar Retreat,500.0,2200.0,,,,,,,,,...,,,,,,,,,,
Vivaan Eminence,1600.0,,,1900.0,,,,,,,...,,,,,,,,,,
Om Sky,400.0,,,,,,,,,,...,,,,,,,,,,
Orchid Legacy,600.0,,,,,,,,,,...,,,,,,,,,,
Shiv Sadhna Skywalk Manglaam,400.0,,,,,,,,,,...,,,,,,,,,,


In [27]:
location_df.fillna(6900, inplace=True)

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns=location_df.columns, index=location_df.index)

In [29]:
location_df_normalized

Unnamed: 0_level_0,Nirma Vidyavihar - Chharodi,SGVP Holistic Hospital,Puna International School,Ashirvad Hospital - Best Gynecologist | Orthopaedic | Psychiatrist | Neurologist | Pediatrician Doctor Hospital,Taksh Education,Balaji Horizon Women's Hospital,Lakshaya International School,Suryam Children Hospital South Bopal,BHIMANI Children's Hospital & Vaccination Centre,"Sumati Vidhya Vihar, Ghatlodiya,",...,Sai Women's Care Hospital,Adani Vidya Mandir,"ZAHRA MEDIPLUS HOSPITAL (Multispecialty) - || Best Hospital, ICU And Emergency, Multispeciality Hospital",Gurukrupa Children Hospital,The Lighthouse Montessori School,Zebar School for Children,Global Longlife Hospital & Research Limited,"Doon School of Excellence, New Maninagar",Jivan Gastro And Gynec Hospital | Liver Hospital | Maternity Hospital | Gastroenterologist | Pregnancy Care in vastral,Shree Sahajanand Gurukul - Koteshwar
SocietyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ganesh Malabar Retreat,-8.287658,-11.039952,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Vivaan Eminence,-6.841784,0.088028,0.206435,-3.552815,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Om Sky,-8.419101,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Orchid Legacy,-8.156215,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Shiv Sadhna Skywalk Manglaam,-8.419101,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Savvy Solaris,0.124702,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Padmanabh Dwarkesh Antilia,0.124702,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Gala Luxuria,0.124702,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,-19.723083,-19.723083,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702,0.050702
Gala Swing,0.124702,0.088028,0.206435,0.269946,0.050702,0.050702,0.134474,0.195077,0.23552,0.050702,...,0.050702,0.050702,0.050702,0.050702,0.050702,-19.723083,-19.723083,0.050702,0.050702,0.050702


In [30]:
cosine_sim1 = cosine_similarity(location_df_normalized)

In [31]:
cosine_sim1.shape

(390, 390)

In [32]:
def recommend_properties_with_scores(property_name, top_n=247):
    
    cosine_sim_matrix = cosine_sim1
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('Ganesh Malabar Retreat')

Unnamed: 0,PropertyName,SimilarityScore
0,Sun Shela One,0.698612
1,Gala Celestia,0.698612
2,Aaryan Euphoria,0.544175
3,Vivaan Eminence,0.521487
4,Om Sky,0.511558
...,...,...
242,Savvy Studioz,-0.017995
243,Omkar Earth Paradise,-0.018001
244,The Crest,-0.018093
245,Shivalik Sharda Park view 2,-0.018104


# Recommendation using Description

In [33]:
desc_df = df[['SocietyName', 'Description']]

desc_df = desc_df.set_index('SocietyName')

In [34]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [35]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

desc_df['Description'] = desc_df['Description'].apply(remove_stopwords)

In [36]:
lemmatizer = WordNetLemmatizer()

desc_df['Description'] = desc_df['Description'].apply(lambda x:lemmatizer.lemmatize(x, pos='v'))

In [37]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

tfidf_matrix = tfidf_vectorizer.fit_transform(desc_df['Description'])

In [38]:
tfidf_matrix.shape

(390, 3307)

In [39]:
cosine_sim2 = cosine_similarity(tfidf_matrix)

In [40]:
cosine_sim2.shape

(390, 390)

In [41]:
def recommend_properties_with_scores(property_name, top_n=190):
    
    cosine_sim_matrix = cosine_sim2
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[desc_df.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = desc_df.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('Ganesh Malabar Retreat')

Unnamed: 0,PropertyName,SimilarityScore
0,Shripad Residency,0.661176
1,Alaya Heights,0.643242
2,Himalaya and Mainland Pinnacle Block A To D,0.553928
3,Gala Haven,0.541043
4,Dwarkesh Opulence,0.524672
...,...,...
185,Flora Iris,0.044886
186,Samyaka,0.044315
187,The Crest,0.043673
188,Aamrakunj Gracia,0.043673


In [42]:
import pickle

with open('sim1.pkl', 'wb') as file:
    pickle.dump(cosine_sim1, file)

with open('sim2.pkl', 'wb') as file:
    pickle.dump(cosine_sim2, file)

with open('location_df.pkl', 'wb') as file:
    pickle.dump(location_df, file)