# SMART HOME LOCATION SELECTOR

### Import Required Libraries

In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

### Dataset Loading and Initial Inspection

In [17]:
# Read the raw locality dataset; the CSV path is relative, not absolute.
df = pd.read_csv(filepath_or_buffer="../data/smart_home_location_dataset_20000.csv")

In [19]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(19998, 23)

In [21]:
# Preview the first rows to sanity-check the columns and value formats.
df.head()

Unnamed: 0,city,locality,zone,latitude,longitude,price_per_sqft,monthly_rent,maintenance_cost,safety_score,crime_rate,...,school_distance_km,college_distance_km,metro_distance_km,bus_stop_distance_km,road_connectivity_score,green_space_score,shopping_mall_distance_km,internet_speed_mbps,water_availability_score,power_backup
0,Bengaluru,Indiranagar,North,24.5907,74.5612,6786,31434,5967,6.8,2.9,...,2.83,4.37,3.99,1.08,7.5,6.5,2.03,282,6.0,Yes
1,Bengaluru,BTM Layout,East,17.2182,74.3364,10014,31396,2259,7.6,2.6,...,3.36,4.22,5.36,0.63,7.9,8.2,3.9,280,7.3,Yes
2,Bengaluru,Indiranagar,North,23.3141,78.5753,5807,48512,2327,7.6,3.0,...,3.11,5.21,3.82,0.2,8.2,6.9,1.85,178,7.3,No
3,Bengaluru,Electronic City,South,23.6854,79.1057,5416,48021,1762,8.9,2.8,...,3.98,2.89,3.67,0.65,9.2,6.6,1.96,123,8.6,No
4,Bengaluru,Yelahanka,Central,19.6121,77.1349,10430,46746,2633,8.0,1.4,...,0.97,5.08,3.36,0.64,8.1,6.6,1.88,201,9.0,Yes


In [23]:
# List every column label available in the dataset.
df.columns

Index(['city', 'locality', 'zone', 'latitude', 'longitude', 'price_per_sqft',
       'monthly_rent', 'maintenance_cost', 'safety_score', 'crime_rate',
       'pollution_index', 'noise_level', 'hospital_distance_km',
       'school_distance_km', 'college_distance_km', 'metro_distance_km',
       'bus_stop_distance_km', 'road_connectivity_score', 'green_space_score',
       'shopping_mall_distance_km', 'internet_speed_mbps',
       'water_availability_score', 'power_backup'],
      dtype='object')

In [25]:
# Print per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19998 entries, 0 to 19997
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   city                       19998 non-null  object 
 1   locality                   19998 non-null  object 
 2   zone                       19998 non-null  object 
 3   latitude                   19998 non-null  float64
 4   longitude                  19998 non-null  float64
 5   price_per_sqft             19998 non-null  int64  
 6   monthly_rent               19998 non-null  int64  
 7   maintenance_cost           19998 non-null  int64  
 8   safety_score               19998 non-null  float64
 9   crime_rate                 19998 non-null  float64
 10  pollution_index            19998 non-null  int64  
 11  noise_level                19998 non-null  float64
 12  hospital_distance_km       19998 non-null  float64
 13  school_distance_km         19998 non-null  flo

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,latitude,longitude,price_per_sqft,monthly_rent,maintenance_cost,safety_score,crime_rate,pollution_index,noise_level,hospital_distance_km,school_distance_km,college_distance_km,metro_distance_km,bus_stop_distance_km,road_connectivity_score,green_space_score,shopping_mall_distance_km,internet_speed_mbps,water_availability_score
count,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0,19998.0
mean,20.71713,76.498997,10230.729773,53883.361736,3999.261726,8.005631,3.2503,189.742574,5.257081,3.250701,2.396819,3.998107,2.844005,0.795592,7.734813,6.741164,3.239897,180.011951,7.7452
std,4.608973,2.197002,3319.311399,20826.662049,1446.207306,0.862387,1.304463,40.676972,1.300032,1.586452,1.206816,1.730123,1.531752,0.404469,1.010922,1.298034,1.592542,69.575453,1.015206
min,12.8006,72.7002,4500.0,18001.0,1500.0,6.5,1.0,120.0,3.0,0.5,0.3,1.0,0.2,0.1,6.0,4.5,0.5,60.0,6.0
25%,16.72655,74.5865,7341.25,35877.5,2734.0,7.3,2.1,154.0,4.1,1.88,1.36,2.51,1.52,0.45,6.9,5.6,1.84,120.0,6.9
50%,20.6857,76.49765,10228.0,53551.0,3993.0,8.0,3.2,190.0,5.3,3.27,2.41,4.0,2.85,0.79,7.7,6.7,3.23,180.0,7.7
75%,24.692075,78.40855,13080.0,71946.25,5251.75,8.7,4.4,225.0,6.4,4.61,3.43,5.5,4.18,1.14,8.6,7.9,4.62,240.0,8.6
max,28.6991,80.3,15999.0,89999.0,6500.0,9.5,5.5,260.0,7.5,6.0,4.5,7.0,5.5,1.5,9.5,9.0,6.0,300.0,9.5


### Feature Understanding and Classification

In [30]:
# Columns that hold labels (text or Yes/No values) rather than measurements.
categorical_features = ["city", "locality", "zone", "power_backup"]

categorical_features

['city', 'locality', 'zone', 'power_backup']

In [32]:
# Every numeric column, listed in the same order as the dataset's columns.
numerical_features = [
    # geographic coordinates
    "latitude", "longitude",
    # costs
    "price_per_sqft", "monthly_rent", "maintenance_cost",
    # safety and environment
    "safety_score", "crime_rate", "pollution_index", "noise_level",
    # distances to amenities and transit
    "hospital_distance_km", "school_distance_km", "college_distance_km",
    "metro_distance_km", "bus_stop_distance_km",
    # infrastructure and surroundings
    "road_connectivity_score", "green_space_score",
    "shopping_mall_distance_km", "internet_speed_mbps",
    "water_availability_score",
]

numerical_features

['latitude',
 'longitude',
 'price_per_sqft',
 'monthly_rent',
 'maintenance_cost',
 'safety_score',
 'crime_rate',
 'pollution_index',
 'noise_level',
 'hospital_distance_km',
 'school_distance_km',
 'college_distance_km',
 'metro_distance_km',
 'bus_stop_distance_km',
 'road_connectivity_score',
 'green_space_score',
 'shopping_mall_distance_km',
 'internet_speed_mbps',
 'water_availability_score']

In [34]:
# Features grouped as "positive" -- presumably those where a larger value is
# more desirable; confirm against the scoring logic that consumes this list.
positive_features = [
    "safety_score", "road_connectivity_score", "green_space_score",
    "internet_speed_mbps", "water_availability_score",
]

In [36]:
# Features grouped as "negative" -- presumably those where a smaller value is
# more desirable; confirm against the scoring logic that consumes this list.
negative_features = [
    # costs
    "price_per_sqft", "monthly_rent", "maintenance_cost",
    # environment and crime
    "pollution_index", "noise_level", "crime_rate",
    # distances to amenities and transit
    "hospital_distance_km", "school_distance_km", "college_distance_km",
    "metro_distance_km", "bus_stop_distance_km", "shopping_mall_distance_km",
]

In [38]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

city                         0
locality                     0
zone                         0
latitude                     0
longitude                    0
price_per_sqft               0
monthly_rent                 0
maintenance_cost             0
safety_score                 0
crime_rate                   0
pollution_index              0
noise_level                  0
hospital_distance_km         0
school_distance_km           0
college_distance_km          0
metro_distance_km            0
bus_stop_distance_km         0
road_connectivity_score      0
green_space_score            0
shopping_mall_distance_km    0
internet_speed_mbps          0
water_availability_score     0
power_backup                 0
dtype: int64

### Data Preprocessing

In [41]:
# Encode the Yes/No flag as 1/0. The identity entries (1 -> 1, 0 -> 0) make this
# cell safe to re-run: Series.map turns any value missing from the mapping into
# NaN, so mapping only "Yes"/"No" would blank the column on a second execution.
df["power_backup"] = df["power_backup"].map({"Yes": 1, "No": 0, 1: 1, 0: 0})

In [43]:
# One-hot encode city and zone into indicator columns; drop_first omits the
# first level of each so it serves as the implicit baseline. Other columns,
# including the text column "locality", pass through unchanged.
df_encoded = pd.get_dummies(data=df, columns=["city", "zone"], drop_first=True)

In [45]:
# Rescale every numeric column to the [0, 1] range.
# The scaler is fitted on the raw values in `df` instead of on df_encoded:
# get_dummies left those columns and the row order untouched, so the result is
# the same on a first run, but re-running this cell no longer refits the scaler
# on already-scaled data (which would discard the true min/max it learned).
scaler = MinMaxScaler()
df_encoded[numerical_features] = scaler.fit_transform(df[numerical_features])

In [47]:
# Preview the frame after one-hot encoding and min-max scaling.
df_encoded.head()

Unnamed: 0,locality,latitude,longitude,price_per_sqft,monthly_rent,maintenance_cost,safety_score,crime_rate,pollution_index,noise_level,...,power_backup,city_Chennai,city_Delhi NCR,city_Hyderabad,city_Mumbai,city_Pune,zone_East,zone_North,zone_South,zone_West
0,Indiranagar,0.741586,0.244875,0.1988,0.186575,0.8934,0.1,0.422222,0.05,0.088889,...,1,False,False,False,False,False,False,True,False,False
1,BTM Layout,0.277863,0.215295,0.47952,0.186047,0.1518,0.366667,0.355556,0.628571,0.6,...,1,False,False,False,False,False,True,False,False,False
2,Indiranagar,0.661289,0.77306,0.113662,0.423776,0.1654,0.366667,0.444444,0.664286,0.155556,...,0,False,False,False,False,False,False,True,False,False
3,Electronic City,0.684643,0.842851,0.079659,0.416956,0.0524,0.8,0.4,0.114286,0.2,...,0,False,False,False,False,False,False,False,True,False
4,Yelahanka,0.428437,0.583529,0.515697,0.399247,0.2266,0.5,0.088889,0.085714,0.866667,...,1,False,False,False,False,False,False,False,False,False


In [49]:
# Keep an independent (deep) copy under its own name so later changes to
# df_processed do not write back into df_encoded.
df_processed = df_encoded.copy(deep=True)