## Importing the dataset

In [9]:
from pathlib import Path
import pandas as pd
import tomli  

# All paths are resolved against the parent of the current working directory,
# which is where config.toml is looked up.
project_root = Path.cwd().parent
config_path = project_root / "config.toml"

# tomli.load() requires a file opened in binary mode.
with config_path.open("rb") as f:
    config = tomli.load(f)

# The dataset location comes from the [paths] table of the config and is
# joined onto the same parent directory.
dataset_path = config["paths"]["dataset"]
full_dataset_path = project_root / dataset_path

# Load the CSV and preview the first rows (last expression = cell output).
df = pd.read_csv(full_dataset_path)
df.head()


Unnamed: 0.1,Unnamed: 0,Price,Area,Location,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
0,0,4850000,720,Kharghar,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0
1,1,4500000,600,Kharghar,1,0,1,1,1,1,1,0,1,0,0,0,0,1,1
2,2,6700000,650,Kharghar,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1
3,3,4500000,650,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0
4,4,5000000,665,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0


In [10]:
# Capture the column labels of the freshly loaded frame and display them.
# NOTE(review): at this point the Index still contains the "Unnamed: 0" column.
features = df.columns
features

Index(['Unnamed: 0', 'Price', 'Area', 'Location', 'No. of Bedrooms',
       'New/Resale', 'Gymnasium', 'Lift Available', 'Car Parking',
       'Maintenance Staff', '24x7 Security', 'Children's Play Area',
       'Clubhouse', 'Intercom', 'Landscaped Gardens', 'Indoor Games',
       'Gas Connection', 'Jogging Track', 'Swimming Pool'],
      dtype='object')

In [11]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# Re-binding `df` with errors="ignore" (instead of an in-place drop) keeps this
# cell idempotent: running it a second time no longer raises KeyError once the
# column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [12]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6347 entries, 0 to 6346
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Price                 6347 non-null   int64 
 1   Area                  6347 non-null   int64 
 2   Location              6347 non-null   object
 3   No. of Bedrooms       6347 non-null   int64 
 4   New/Resale            6347 non-null   int64 
 5   Gymnasium             6347 non-null   int64 
 6   Lift Available        6347 non-null   int64 
 7   Car Parking           6347 non-null   int64 
 8   Maintenance Staff     6347 non-null   int64 
 9   24x7 Security         6347 non-null   int64 
 10  Children's Play Area  6347 non-null   int64 
 11  Clubhouse             6347 non-null   int64 
 12  Intercom              6347 non-null   int64 
 13  Landscaped Gardens    6347 non-null   int64 
 14  Indoor Games          6347 non-null   int64 
 15  Gas Connection        6347 non-null   

- The dataset is already clean and does not contain any missing values.

In [43]:
# For every column print: its name, how many distinct values it holds, and the
# first five distinct values (in order of appearance).
features = df.columns
for feature in features:
    feature_unique = df[feature].unique()
    feature_values_count = len(feature_unique)
    # print() joins its arguments with single spaces, one line per column.
    print(feature, feature_values_count, feature_unique[:5])


Price 979 [ 4850000  4500000  6700000  5000000 17000000]
Area 1140 [ 720  600  650  665 2000]
Location 413 ['Kharghar' 'Sector-13 Kharghar' 'Sector 18 Kharghar' 'Sector 20 Kharghar'
 'Sector 15 Kharghar']
No. of Bedrooms 7 [1 4 3 2 5]
New/Resale 2 [0 1]
Gymnasium 2 [0 1]
Lift Available 2 [1 0]
Car Parking 2 [1 0]
Maintenance Staff 2 [1 0]
24x7 Security 2 [1 0]
Children's Play Area 2 [0 1]
Clubhouse 2 [0 1]
Intercom 2 [0 1]
Landscaped Gardens 2 [0 1]
Indoor Games 2 [0 1]
Gas Connection 2 [0 1]
Jogging Track 2 [0 1]
Swimming Pool 2 [0 1]


- From the above output we can see that, besides Price, all features seem to be categorical.
- We will consider all features with values 0/1 as categorical.
- Price, Area and No. of Bedrooms will be treated as continuous.
- Location needs to be explored more.

In [46]:
# List every distinct Location string to inspect how locality names are written.
df['Location'].unique()

array(['Kharghar', 'Sector-13 Kharghar', 'Sector 18 Kharghar',
       'Sector 20 Kharghar', 'Sector 15 Kharghar', 'Dombivali',
       'Churchgate', 'Prabhadevi', 'Jogeshwari West', 'Kalyan East',
       'Malad East', 'Virar East', 'Virar', 'Malad West', 'Borivali East',
       'Mira Road East', 'Goregaon West', 'Kandivali West',
       'Borivali West', 'Kandivali East', 'Andheri East', 'Goregaon East',
       'Wadala', 'Ulwe', 'Dahisar', 'kandivali', 'Goregaon',
       'Bhandup West', 'thakur village kandivali east', 'Santacruz West',
       'Kanjurmarg', 'I C Colony', 'Dahisar W', 'Marol', 'Parel',
       'Lower Parel', 'Worli', 'Jogeshwari East', 'Chembur Shell Colony',
       'Central Avenue', 'Chembur East', 'Diamond Market Road', 'Mulund',
       'Nalasopara West', 'raheja vihar', 'Powai Lake', 'MHADA Colony 20',
       'Tolaram Colony', 'Taloja', 'Thane West', 'Vangani',
       'Sector 5 Ulwe', 'Sector12 New Panvel', 'Sector 17 Ulwe',
       'Sector9 Kamothe', 'Sector 19 Kharghar

In [50]:
# Rows whose Location contains "west" in any letter case; missing values count
# as non-matches. This is a plain substring match, so names such as
# 'Western Express Highway Kandivali East' are selected as well.
west_mask = df["Location"].str.contains("West", case=False, na=False)
west_locations = df.loc[west_mask]

# Distinct location names among the matching rows.
west_locations["Location"].unique()

array(['Jogeshwari West', 'Malad West', 'Goregaon West', 'Kandivali West',
       'Borivali West', 'Bhandup West', 'Santacruz West',
       'Nalasopara West', 'Thane West', 'Virar West', 'Ambernath West',
       'Bandra West', 'Badlapur West', 'West Amardeep Colony',
       'vasant vihar thane west', 'Kalyan West', 'Ghatkopar West',
       'Dahisar West', 'Mulund West', 'Andheri West', 'Dombivli (West)',
       'Vasai West', 'Western Express Highway Kandivali East',
       'link road borivali west', 'Khar West', 'Bhayandar West',
       'Grant Road West', 'Dadar West', 'Anand Nagar Thane West',
       'vile parle west', 'Kurla West', 'Ville Parle West',
       'Vikhroli West'], dtype=object)