In [1]:
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('apartment.csv')

df.shape

(620, 11)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620 entries, 0 to 619
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   SocietyName     383 non-null    object
 1   BHK             620 non-null    object
 2   Furnishing      620 non-null    object
 3   BuiltUpArea     620 non-null    object
 4   Locality        620 non-null    object
 5   NearbyPlace_1   619 non-null    object
 6   DistanceAway_1  619 non-null    object
 7   NearbyPlace_2   619 non-null    object
 8   DistanceAway_2  619 non-null    object
 9   Description     620 non-null    object
 10  Highlights      137 non-null    object
dtypes: object(11)
memory usage: 53.4+ KB


In [4]:
df.duplicated().sum()

55

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.shape

(565, 11)

In [7]:
df.isna().sum()

SocietyName       214
BHK                 0
Furnishing          0
BuiltUpArea         0
Locality            0
NearbyPlace_1       1
DistanceAway_1      1
NearbyPlace_2       1
DistanceAway_2      1
Description         0
Highlights        437
dtype: int64

In [8]:
df = df[~(df['SocietyName'].isna())]

df.reset_index(drop=True, inplace=True)

In [9]:
df.shape

(351, 11)

In [10]:
df.drop_duplicates(subset=['SocietyName'], inplace=True)

In [11]:
df.shape

df.reset_index(drop=True, inplace=True)

In [12]:
df.isna().sum()

SocietyName         0
BHK                 0
Furnishing          0
BuiltUpArea         0
Locality            0
NearbyPlace_1       0
DistanceAway_1      0
NearbyPlace_2       0
DistanceAway_2      0
Description         0
Highlights        132
dtype: int64

In [13]:
df.sample(10)

Unnamed: 0,SocietyName,BHK,Furnishing,BuiltUpArea,Locality,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description,Highlights
179,Mahaveer Heights,"3 BHK Flat for rent in Bhimrad, Surat",Fully Furnished,2200 sq.ft,"3 BHK Flat for rent in Bhimrad, Surat",Bhagwan Mahavir International School,is 0.1 km away,Shraddha Clinic - Dr. Khushbu Patel Jani,is 1.1 km away,This Apartment can be a comfortable and afford...,
72,Rajhans Builder Surat Campus,"2 BHK Flat for rent in Limla, Surat",Fully Furnished,1300 sq.ft,"2 BHK Flat for rent in Limla, Surat",Sharda Vidhyalay School ( Ichchapore ),is 6.9 km away,SOCH- Institute Of Mental Health & Deaddiction,is 9.2 km away,Are you looking for an affordable Apartment fo...,Special Highlights24x7 SecurityLiftPower Backu...
163,Sai Jyoti Residency,"2 BHK Flat for rent in Udhna Zone, Surat",Unfurnished,1200 sq.ft,"2 BHK Flat for rent in Udhna Zone, Surat",Navaprajna Public School,is 3.9 km away,Kauvery Hospital Marathahalli Bengaluru,is 1.1 km away,This Apartment can be a comfortable and afford...,
169,JT Stuti Residency,"2 BHK Flat for rent in Adajan Gam, Surat",Fully Furnished,1175 sq.ft,"2 BHK Flat for rent in Adajan Gam, Surat",Sardar Patel Vidhyalaya,is 0.7 km away,Ami Eye Hospital,is 0.6 km away,Check this beautiful 2 BHK Apartment available...,
159,Shrungal Solitaire,"2 BHK Flat for rent in Udhna Zone, Surat",Unfurnished,1120 sq.ft,"2 BHK Flat for rent in Udhna Zone, Surat",Galaxy Army International School,is 0.1 km away,Aarogyam Hospital,is 0.1 km away,This Apartment can be a comfortable and afford...,
9,Happy Home Nakshatra Solitaire,"3 BHK Flat for rent in Palanpur, Surat",Semi Furnished,1518 sq.ft,"3 BHK Flat for rent in Palanpur, Surat",L P Savani School,is 0.7 km away,"Marigold hospital,ICU & Cardio-Diabetes center",is 0.6 km away,The project has 3 bhk multistorey apartments. ...,Special Highlights24x7 SecurityCycling & Joggi...
50,Raj Green Raj Green Hills,"5 BHK Flat for rent in Pal, Surat",Fully Furnished,7500 sq.ft,"5 BHK Flat for rent in Pal, Surat",Sardar Patel Vidhyalaya,is 0.1 km away,Drashti Eye Hospital,is 0.8 km away,This Apartment can be a comfortable and afford...,Special Highlights24x7 SecurityChildren Play A...
35,Monarch Monarch Residency,"3 BHK Flat for rent in Palanpur, Surat",Semi Furnished,2000 sq.ft,"3 BHK Flat for rent in Palanpur, Surat",Shree Jalaram International School ( SJIS Scho...,is 0.2 km away,Dhwani Hospital & ICCU,is 0.3 km away,This Apartment can be a comfortable and afford...,
76,Happy Home Nakshatra Heights,"3 BHK Flat for rent in Pal Gam, Surat",Fully Furnished,1458 sq.ft,"3 BHK Flat for rent in Pal Gam, Surat",L.P. Savani International School,is 0.8 km away,Dr Nirav Soni | Joint Replacement Surgeon| Art...,is 0.2 km away,Check this beautiful 3 BHK Apartment available...,Special Highlights24x7 SecurityChildren Play A...
107,Happy Home Nandini,"3 BHK Flat for rent in Vesu, Surat",Unfurnished,1750 sq.ft,"3 BHK Flat for rent in Vesu, Surat",Agarwal Vidya Vihar,is 0.8 km away,Amritam Multispeciality Hospital,is 1.1 km away,If you are looking for a modern house on rent ...,


In [14]:
df.drop(columns=['Highlights','Locality'], inplace=True)

In [15]:
df.isna().sum()

SocietyName       0
BHK               0
Furnishing        0
BuiltUpArea       0
NearbyPlace_1     0
DistanceAway_1    0
NearbyPlace_2     0
DistanceAway_2    0
Description       0
dtype: int64

In [16]:
df.sample(1)

Unnamed: 0,SocietyName,BHK,Furnishing,BuiltUpArea,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description
119,Rameshwaram Devbhoomi,"4 BHK Flat for rent in Bhimrad, Surat",Fully Furnished,3500 sq.ft,Bhagwan Mahavir Concept School | BMCS | Cambri...,is 0.1 km away,Ayushyam children hospital,is 1 km away,This is a modern and stylish home available fo...


# Recommendation using Nearby Location

In [17]:
df['Bhk'] = df['BHK'].str.split(" ").str.get(0)
df['Description'] = df['Description'].str.split('.').apply(lambda x: '.'.join(x[:-4]) if len(x) > 4 else '')

df.drop('BHK', axis=1, inplace=True)

In [18]:
df.head(1)

Unnamed: 0,SocietyName,Furnishing,BuiltUpArea,NearbyPlace_1,DistanceAway_1,NearbyPlace_2,DistanceAway_2,Description,Bhk
0,Aston Homes,Semi Furnished,2200 sq.ft,L P Savani School,is 0.2 km away,BACHPAN Children hospital,is 0.4 km away,Your search for a spacious home at affordable ...,3


In [19]:
df['NearbyPlaces'] = df.apply(lambda row: {
    row['NearbyPlace_1']: row['DistanceAway_1'],
    row['NearbyPlace_2']: row['DistanceAway_2']
}, axis=1)

df.drop(columns=['NearbyPlace_1', 'DistanceAway_1', 'NearbyPlace_2', 'DistanceAway_2'], inplace=True)

df['NearbyPlaces'] = df['NearbyPlaces'].apply(
    lambda places: str(places) if isinstance(places, dict) else ''
)

In [20]:
df['NearbyPlaces'][0]

"{'L P Savani School': 'is 0.2 km away', 'BACHPAN Children hospital': 'is 0.4 km away'}"

In [21]:
places_df = df[['SocietyName', 'NearbyPlaces']]

In [22]:
import ast

def distance_to_meters(distance_str):
    try:
        if 'km' in distance_str:
            return float(distance_str.split()[1]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None
    except:
        return None

location_matrix = {}
for index, row in places_df.iterrows():
    distances = {}
    for location, distance in ast.literal_eval(row['NearbyPlaces']).items():
        distances[location] = distance_to_meters(distance)
    location_matrix[index] = distances

# Convert the dictionary to a dataframe
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

# Display the first few rows
location_df.head()

Unnamed: 0,L P Savani School,BACHPAN Children hospital,Agarwal Vidya Vihar,Amritam Multispeciality Hospital,The Radiant International School,Ortho Plus Hospital,Bhagwan Mahavir International School,Shraddha Clinic - Dr. Khushbu Patel Jani,Shishukunj Vidya Vihar & L B Contractor School,Vesu Prathmik school,...,Nilkanth Aarogyadham Surat,Aashadeep Vidhyalay 1,Shiv Orthopaedic Hospital,Hetvi Eye Hospital,Ramkabir High School,Manavta Hospital (Medicare speciality centre),"Don Bosco High School , Sagaon.",Mahavir Hospital,L. P. Savani Academy,Om Women's and Children Hospital
0,200.0,400.0,,,,,,,,,...,,,,,,,,,,
8,500.0,700.0,,,,,,,,,...,,,,,,,,,,
9,700.0,,,,,,,,,,...,,,,,,,,,,
25,100.0,300.0,,,,,,,,,...,,,,,,,,,,
29,500.0,400.0,,,,,,,,,...,,,,,,,,,,


In [23]:
location_df.index = df.SocietyName

In [24]:
location_df.head()

Unnamed: 0_level_0,L P Savani School,BACHPAN Children hospital,Agarwal Vidya Vihar,Amritam Multispeciality Hospital,The Radiant International School,Ortho Plus Hospital,Bhagwan Mahavir International School,Shraddha Clinic - Dr. Khushbu Patel Jani,Shishukunj Vidya Vihar & L B Contractor School,Vesu Prathmik school,...,Nilkanth Aarogyadham Surat,Aashadeep Vidhyalay 1,Shiv Orthopaedic Hospital,Hetvi Eye Hospital,Ramkabir High School,Manavta Hospital (Medicare speciality centre),"Don Bosco High School , Sagaon.",Mahavir Hospital,L. P. Savani Academy,Om Women's and Children Hospital
SocietyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aston Homes,200.0,400.0,,,,,,,,,...,,,,,,,,,,
Raghuvir Spectrum,500.0,700.0,,,,,,,,,...,,,,,,,,,,
Ramaa Residency,700.0,,,,,,,,,,...,,,,,,,,,,
Swagat Clifton,100.0,300.0,,,,,,,,,...,,,,,,,,,,
Shyam Enclave Building A B C,500.0,400.0,,,,,,,,,...,,,,,,,,,,


In [25]:
location_df.fillna(6900, inplace=True)

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns=location_df.columns, index=location_df.index)

In [27]:
location_df_normalized

Unnamed: 0_level_0,L P Savani School,BACHPAN Children hospital,Agarwal Vidya Vihar,Amritam Multispeciality Hospital,The Radiant International School,Ortho Plus Hospital,Bhagwan Mahavir International School,Shraddha Clinic - Dr. Khushbu Patel Jani,Shishukunj Vidya Vihar & L B Contractor School,Vesu Prathmik school,...,Nilkanth Aarogyadham Surat,Aashadeep Vidhyalay 1,Shiv Orthopaedic Hospital,Hetvi Eye Hospital,Ramkabir High School,Manavta Hospital (Medicare speciality centre),"Don Bosco High School , Sagaon.",Mahavir Hospital,L. P. Savani Academy,Om Women's and Children Hospital
SocietyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aston Homes,-4.103137,-3.636068,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
Raghuvir Spectrum,-3.908319,-3.455279,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
Ramaa Residency,-3.778441,0.281015,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
Swagat Clifton,-4.168076,-3.696331,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
Shyam Enclave Building A B C,-3.908319,-3.636068,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Shaileshbhai Gonawala Sai Kg Heights,0.247795,0.281015,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,-13.747727,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
Flora Avenue,0.247795,0.281015,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,-13.747727,-13.747727,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739
M D Om Palace,0.247795,0.281015,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,-13.747727,-13.747727,0.072739,0.072739,0.072739,0.072739
Green Park Complex,0.247795,0.281015,0.247132,0.195379,0.163465,0.180262,0.319102,0.301671,0.208347,0.163876,...,0.072739,0.072739,0.072739,0.072739,0.072739,0.072739,-13.747727,-13.747727,0.072739,0.072739


In [28]:
cosine_sim1 = cosine_similarity(location_df_normalized)

In [29]:
cosine_sim1.shape

(190, 190)

In [30]:
def recommend_properties_with_scores(property_name, top_n=247):
    
    cosine_sim_matrix = cosine_sim1
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = location_df_normalized.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('Aston Homes')

Unnamed: 0,PropertyName,SimilarityScore
0,Swagat Clifton,0.999993
1,Avadh Copper Stone,0.999993
2,Happy Home Nakshatra Solitaire,0.999993
3,Raghuvir Spectrum,0.999926
4,Kush Crystal Heights,0.999878
...,...,...
184,Milestone 7 Heaven,-0.090523
185,Vaishnodevi Sky,-0.090653
186,Samarth Developers Surat Samarth Enclave,-0.091012
187,Shree Kalyan Residency,-0.091102


# Recommendation using Description

In [31]:
desc_df = df[['SocietyName', 'Description']]

desc_df = desc_df.set_index('SocietyName')

In [32]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [33]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

desc_df['Description'] = desc_df['Description'].apply(remove_stopwords)

In [34]:
lemmatizer = WordNetLemmatizer()

desc_df['Description'] = desc_df['Description'].apply(lambda x:lemmatizer.lemmatize(x, pos='v'))

In [35]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

tfidf_matrix = tfidf_vectorizer.fit_transform(desc_df['Description'])

In [36]:
tfidf_matrix.shape

(190, 1727)

In [37]:
cosine_sim2 = cosine_similarity(tfidf_matrix)

In [38]:
cosine_sim2.shape

(190, 190)

In [39]:
def recommend_properties_with_scores(property_name, top_n=190):
    
    cosine_sim_matrix = cosine_sim2
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[desc_df.index.get_loc(property_name)]))
    
    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]
    
    # Retrieve the names of the top properties using the indices
    top_properties = desc_df.index[top_indices].tolist()
    
    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })
    
    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('Aston Homes')

Unnamed: 0,PropertyName,SimilarityScore
0,Western Villa,0.936161
1,Vastu Shilp,0.936161
2,Meera Heights,0.906823
3,Aakash Earrth,0.780104
4,Aagam Prestige,0.636216
...,...,...
184,Sangini Swapna Sangini,0.005470
185,SNS Splendid,0.005365
186,Mangalam Heights,0.004678
187,Rajhans Platinum Residency,0.002132


In [40]:
import pickle

with open('sim1.pkl', 'wb') as file:
    pickle.dump(cosine_sim1, file)

with open('sim2.pkl', 'wb') as file:
    pickle.dump(cosine_sim2, file)

with open('location_df.pkl', 'wb') as file:
    pickle.dump(location_df, file)