In [1]:
# Uncomment to change the width of the page
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Import
import pandas as pd
import numpy as np

In [3]:
# Define folder locations
raw_data_folder="."
cleaned_data_folder="."

In [4]:
df_pbazaar = pd.read_csv(f"{raw_data_folder}/sarangali_pbazzar.csv")

In [5]:
df_pbazaar.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17083,17084,17085,17086,17087,17088,17089,17090,17091,17092
area_sft,200.0,10.0,150.0,111.0,250.0,180.0,175.0,10.0,700.0,100.0,...,1280.0,1075.0,1350.0,1100.0,800.0,2150.0,1000.0,1445.0,600.0,800.0
attach_bathrooms,,,,,,,,,,,...,2.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0
balcony,,,,,,,,,,,...,2.0,2.0,2.0,2.0,,3.0,,3.0,,1.0
bedrooms,,,,,,,,,,,...,3.0,2.0,3.0,3.0,2.0,4.0,2.0,3.0,3.0,2.0
common_bathrooms,,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0
dining,,,,,,,,,,,...,Space,Room,Room,Room,Space,Room,Space,Room,Room,Room
floor,,,,,,,,,,,...,,,,,,,,,,
floor_type,,,,,,,,,,,...,Mosaic,Tiled,Tiled,Tiled,Tiled,Tiled,Tiled,Tiled,Tiled,Tiled
garage_number,,,,,,,,,,,...,,,,,,,,,,
living,,,,,,,,,,,...,Space,Space,Space,Space,Space,Space,Space,Space,Space,Space


**How much data do we have in the raw data file**

In [6]:
df_pbazaar.shape

(17093, 21)

**Print a list of all column names and their the type of values in each column**

In [7]:
print(df_pbazaar.columns)

Index(['area_sft', 'attach_bathrooms', 'balcony', 'bedrooms',
       'common_bathrooms', 'dining', 'floor', 'floor_type', 'garage_number',
       'living', 'location', 'parking_space', 'price', 'price_per_month',
       'price_per_sft', 'property_type', 'property_url', 'road_width_ft',
       'size_in_katha', 'total_floor', 'view'],
      dtype='object')


In [8]:
print(df_pbazaar.dtypes)

area_sft            float64
attach_bathrooms    float64
balcony             float64
bedrooms            float64
common_bathrooms    float64
dining               object
floor               float64
floor_type           object
garage_number       float64
living               object
location             object
parking_space       float64
price                object
price_per_month     float64
price_per_sft       float64
property_type        object
property_url         object
road_width_ft       float64
size_in_katha        object
total_floor         float64
view                 object
dtype: object


**For reference**<br>
**Filer structure requirements in #task-2-data-preprocessing:** <br>
The sample from files (screenshots) provided by @Ekoue LOGOSU-TEKO
<img src="CSV_sample-1.png" alt="Alternative text" />
<img src="CSV_sample-2.png" alt="Alternative text" />


## Construct a cleaned version of the raw CSV


In [9]:
# Creat a new DataFrame for storing the cleaned data
df_pbazaar_new = pd.DataFrame()

In [10]:
df_pbazaar_new["area"] = pd.DataFrame(df_pbazaar["area_sft"])

In [11]:
# The full range of unique values in 'property_type'
print(df_pbazaar["property_type"].unique())

['Commercial Space Rent,Shop' 'Commercial Space Rent,Apartment for Office'
 'Commercial Space Rent,Commercial Building' 'Furnished Apartment Rent'
 'Shop Buy' 'Apartment Buy' 'Land/Plot Buy' 'Garage Rent'
 'Independent House Rent' 'Commercial Space Buy' 'Land/Plot Developer'
 'Residential Apartment Rent' 'Apartment Developer'
 'Independent House Buy']


In [12]:
# The cell creates the following mapping: 
# <new_value>: <old_value(s)>
# 
# "Apartment": 'Commercial Space Rent,Apartment for Office', 'Furnished Apartment Rent', 'Apartment Buy', 'Residential Apartment Rent', 'Apartment Developer'
# "Shop": 'Commercial Space Rent,Shop', 'Shop Buy'
# "House": 'Independent House Rent', 'Independent House Buy'
# "Land": 'Land/Plot Buy', 'Land/Plot Developer'
# "Building": 'Commercial Space Rent,Commercial Building'
# "Garage": 'Garage Rent'
# "Commercial Space": 'Commercial Space Buy'

df_pbazaar_new["building_type"] = np.where(df_pbazaar["property_type"].str.contains("Apartment"), "Apartment",
                                           np.where(df_pbazaar["property_type"].str.contains("Shop"), "Shop",
                                                    np.where(df_pbazaar["property_type"].str.contains("House"), "House",
                                                             np.where(df_pbazaar["property_type"].str.contains("Garage"), "Garage",
                                                                      np.where(df_pbazaar["property_type"].str.contains("Land"), "Land",
                                                                               np.where(df_pbazaar["property_type"].str.contains("Building"), "Building",
                                                                                        np.where(df_pbazaar["property_type"].str.contains("Commercial Space Buy"), "Commercial Space",
                                                                                                 "NA")))))))



In [13]:
print(df_pbazaar_new["building_type"].unique())

['Shop' 'Apartment' 'Building' 'Land' 'Garage' 'House' 'Commercial Space']


In [14]:
df_pbazaar_new["building_type"].value_counts()

Apartment           13121
Land                 1342
Building             1226
Shop                  477
Garage                457
Commercial Space      249
House                 221
Name: building_type, dtype: int64

In [15]:
df_pbazaar_new["building_nature"] = np.where(df_pbazaar["property_type"].str.contains("Commercial"), "Commercial", "Residential")

In [16]:
df_pbazaar_new["building_nature"].value_counts()

Residential    14353
Commercial      2740
Name: building_nature, dtype: int64

In [17]:
# This column is now removed from the standard CSV file
# # There is no other info available about the zone area
# df_pbazaar_new["otherZoneArea"] = 'NA'
# 

df_pbazaar_new["num_bath_rooms"] = df_pbazaar["attach_bathrooms"].add(df_pbazaar["common_bathrooms"], fill_value=0)
df_pbazaar_new["num_bed_rooms"] = df_pbazaar["bedrooms"]

In [18]:
# Print the no. of NaNs for the number of bathrooms and bedrooms
print(len(df_pbazaar_new[df_pbazaar_new["num_bath_rooms"].isna()]))
print(len(df_pbazaar_new[df_pbazaar_new["num_bed_rooms"].isna()]))

3835
3970


In [19]:
# Replace the NaNs in the number of bathrooms and bedrooms with zeros
df_pbazaar_new["num_bath_rooms"].fillna(0, inplace=True)
df_pbazaar_new["num_bed_rooms"].fillna(0, inplace=True)

In [20]:
# Sanity check: are the NaNs replaced?
print(len(df_pbazaar_new[df_pbazaar_new["num_bath_rooms"].isna()]))
print(len(df_pbazaar_new[df_pbazaar_new["num_bed_rooms"].isna()]))

0
0


In [21]:
# The 'price' column has the currency symbol and a string following it
df_pbazaar_new[["price", "price_info"]] = df_pbazaar["price"].str.split("৳", expand=True)

In [22]:
# Format the 'price column'

# 1. remove commas
df_pbazaar_new["price"] = df_pbazaar_new["price"].str.replace(',', '')

# 2. Convert the 'price' to numeric and replace entries with string (instead of number) by 'NaN'
# Note: sometimes instead of a number there is only text, e.g. 'Negotiable';
df_pbazaar_new["price"] = pd.to_numeric(df_pbazaar_new['price'], errors='coerce')

In [23]:
df_pbazaar_new.dtypes

area               float64
building_type       object
building_nature     object
num_bath_rooms     float64
num_bed_rooms      float64
price              float64
price_info          object
dtype: object

In [24]:
# Take a look at non-numeric info captured in the 'price_info' column
df_pbazaar_new['price_info'].value_counts()

Per Month      11427
Per sft         2046
 Per Sft         963
Per katha        812
 Per Month       626
Per Decimal      239
                  53
Per Bigha         43
Per Acres         19
Per 100            2
Name: price_info, dtype: int64

In [25]:
# Remove preceding blank spaces from a value in a column
df_pbazaar_new['price_info'] = df_pbazaar_new['price_info'].str.lstrip()

In [26]:
# All entries where the price is 'per square feet' (the 'price_info' is 'Per sft' or 'Per Sft') 
# change the value in 'price' column to price * areas

df_pbazaar_new["price"] = np.where(df_pbazaar_new['price_info'].isin(set(['Per sft', 'Per Sft'])), 
                                   df_pbazaar_new["price"] * df_pbazaar_new["area"], 
                                   df_pbazaar_new["price"])

In [27]:
# There is no other info available for property description and property overview
df_pbazaar_new["property_description"] = 'NaN'

df_pbazaar_new["property_overview"] = 'NaN'

In [28]:
# Add the URL column as it is
df_pbazaar_new["property_url"] = df_pbazaar["property_url"]

In [29]:
# Generate a 'purpose' column with required values ("Rent" or "Sale") based info in the 'property_type' column
df_pbazaar_new["purpose"] = np.where(df_pbazaar["property_type"].str.contains("Rent"), "Rent", 
                                     np.where(df_pbazaar["property_type"].str.contains("Buy"), "Sale", "NA"))

In [30]:
# Sanity checks
print(df_pbazaar_new["purpose"].unique())
print()
print(df_pbazaar_new["purpose"].value_counts())

['Rent' 'Sale' 'NA']

Rent    13293
Sale     3788
NA         12
Name: purpose, dtype: int64


**New convention for Address**

In [31]:
# 
# @Ekoue LOGOSU-TEKO:
# 
# Hello @channel.
# @Shariar Hossain Omee
#  has created a function to split location into relevant parts. 
# It can be found here: 
# https://github.com/OmdenaAI/dhaka-bangladesh-real-estate-recommendation/blob/main/src/tasks/task-2-data-preprocessing/functions/address_extractor.py
# 
# The code return a dictionary having the following keys: City, Area, Address. They are to become the following columns in the cleaned dataset:
# City -> city
# Area -> locality
# Address -> address


# NOTE: since the "location" column in the pbazaar data is already has the format of 'locality, city, country' (see sample entries below), here we
# don't need to use the function written by @Shariar Hossain Omee.
# 
# df_pbazaar["location"][1:4]
# 1          Mirpur, Dhaka, Bangladesh
# 2          Mirpur, Dhaka, Bangladesh
# 3    Shahjahanpur, Dhaka, Bangladesh

# Split the 'location' into separate individual entries
df_TMP = pd.DataFrame()
df_TMP[["locality", "city", "address"]] = pd.DataFrame(df_pbazaar["location"].str.split(", ", expand=True))


In [32]:
# Add the info to the new dataframe
df_pbazaar_new['city'] = df_TMP['city']
df_pbazaar_new['locality'] = df_TMP['locality']
df_pbazaar_new['address'] = df_TMP['address']


In [33]:
# df_pbazaar["location"][1:4]
# detailed_address

df_pbazaar_new.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17083,17084,17085,17086,17087,17088,17089,17090,17091,17092
area,200.0,10.0,150.0,111.0,250.0,180.0,175.0,10.0,700.0,100.0,...,1280.0,1075.0,1350.0,1100.0,800.0,2150.0,1000.0,1445.0,600.0,800.0
building_type,Shop,Shop,Shop,Shop,Shop,Shop,Shop,Shop,Shop,Shop,...,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment
building_nature,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,...,Residential,Residential,Residential,Residential,Residential,Residential,Residential,Residential,Residential,Residential
num_bath_rooms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,4.0,2.0,3.0,1.0,2.0
num_bed_rooms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,3.0,3.0,2.0,4.0,2.0,3.0,3.0,2.0
price,8000.0,833.3,13999.5,15000.54,25000.0,7020.0,119999.25,1000.0,25000.0,15000.0,...,30000.0,15000.0,,13000.0,11000.0,24000.0,30000.0,22000.0,8300.0,13000.0
price_info,Per Sft,Per Sft,Per Sft,Per Sft,Per Month,Per Sft,Per Sft,Per Sft,Per Month,Per Month,...,Per Month,Per Month,,Per Month,Per Month,Per Month,Per Month,Per Month,Per Month,Per Month
property_description,,,,,,,,,,,...,,,,,,,,,,
property_overview,,,,,,,,,,,...,,,,,,,,,,
property_url,https://pbazaar.com//en/200-sft-commercial-spa...,https://pbazaar.com//en/120-sft-shop-rent-at-m...,https://pbazaar.com//en/150-sft-shop-rent-at-m...,https://pbazaar.com//en/111-sft-shop-rent-at-s...,https://pbazaar.com//en/250sft-commecial-shop-...,https://pbazaar.com//en/180sft-commecial-shop-...,https://pbazaar.com//en/175-sft-shop-rent-at-b...,https://pbazaar.com//en/150-sft-shop-rent-at-m...,https://pbazaar.com//en/700sft-commecial-shop-...,https://pbazaar.com//en/100sft-commecial-shop-...,...,https://pbazaar.com//en/1280-sft-flat-4th-floor,https://pbazaar.com//en/small-flat-for-rent-191,https://pbazaar.com//en/flat-rent-90,https://pbazaar.com//en/back-side-flat-rent-11,https://pbazaar.com//en/2-bed-for-rent-32,https://pbazaar.com//en/nice-flat-for-rent-937,https://pbazaar.com//en/2-bed-for-rent-33,https://pbazaar.com//en/semi-furnished-flat-re...,https://pbazaar.com//en/nice-flat-for-rent-1011,https://pbazaar.com//en/nice-flat-for-rent-1012


In [34]:
# Delete the temporary DataFrame
del df_TMP

In [35]:
# _extraInfo:
# df_pbazaar['dining'].unique(): array([nan, 'Space', 'Room'], dtype=object)
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['floor'].unique(): (float64) floor number.. (1st, 2nd, third etc.)
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['floor_type'].unique(): a mixed collection of floor type and floor number
#         [nan, 'Tiled', 'Mosaic', ' Other', 'Ready', 'Marble', 'Ground',
#            'Ground Floor', '12', '10', '1st', '2nd', '3rd', '13', '15', '3',
#            '5', '9', '14', '4', '8', '4th', '5th', '1', '7', '2', '6', '18',
#            '16', '20', 'Raw', '1627', '7th', '19', '8th', '11', '22']
#  ==> append to the end as with '_extraInfo'
# 
# df_pbazaar['garage_number'].unique(): NaN
#  ==> DELETE (redundant if all values are NaN)
# 
# df_pbazaar['living'].unique(): array([nan, 'Space', 'Room'], dtype=object)
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['parking_space'].unique(): no. of parking spaces
#  ==> append as 'parking-spaces-amenity'
# 
# df_pbazaar['road_width_ft'].unique(): (float64) number with width in feet
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['size_in_katha'].unique(): sometimes the size is given in the "katha" unit; 
#                                      'size_in_katha' column had price in Bangladesh currency (but then the 'price' column has that price in it)
#                                      (but then the 'price' column has that price in it)
#  ==> DELETE (redundant info)
#
# df_pbazaar['total_floor'].unique(): (float64) number with possibly total no. of floors in the building
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['view'].unique(): a string possibly describing the view from the apartment
#             [nan, ' Land View', 'South facing', 'East facing', 'Corner plot',
#                'West facing', 'North facing', 'Park view house or plot',
#                'Lake view house or plot']
#  ==> append to the end as '_extraInfo'


df_pbazaar_new['parking-spaces-amenity'] = df_pbazaar['parking_space']



In [36]:
# Delete and copy the 'price_info' and add as '_extraInfo'
col_price_info = df_pbazaar_new.pop('price_info') # remove the column from the dataframe and save it to variable


In [37]:
df_pbazaar_new.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17083,17084,17085,17086,17087,17088,17089,17090,17091,17092
area,200.0,10.0,150.0,111.0,250.0,180.0,175.0,10.0,700.0,100.0,...,1280.0,1075.0,1350.0,1100.0,800.0,2150.0,1000.0,1445.0,600.0,800.0
building_type,Shop,Shop,Shop,Shop,Shop,Shop,Shop,Shop,Shop,Shop,...,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment
building_nature,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,Commercial,...,Residential,Residential,Residential,Residential,Residential,Residential,Residential,Residential,Residential,Residential
num_bath_rooms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,2.0,2.0,4.0,2.0,3.0,1.0,2.0
num_bed_rooms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,3.0,3.0,2.0,4.0,2.0,3.0,3.0,2.0
price,8000.0,833.3,13999.5,15000.54,25000.0,7020.0,119999.25,1000.0,25000.0,15000.0,...,30000.0,15000.0,,13000.0,11000.0,24000.0,30000.0,22000.0,8300.0,13000.0
property_description,,,,,,,,,,,...,,,,,,,,,,
property_overview,,,,,,,,,,,...,,,,,,,,,,
property_url,https://pbazaar.com//en/200-sft-commercial-spa...,https://pbazaar.com//en/120-sft-shop-rent-at-m...,https://pbazaar.com//en/150-sft-shop-rent-at-m...,https://pbazaar.com//en/111-sft-shop-rent-at-s...,https://pbazaar.com//en/250sft-commecial-shop-...,https://pbazaar.com//en/180sft-commecial-shop-...,https://pbazaar.com//en/175-sft-shop-rent-at-b...,https://pbazaar.com//en/150-sft-shop-rent-at-m...,https://pbazaar.com//en/700sft-commecial-shop-...,https://pbazaar.com//en/100sft-commecial-shop-...,...,https://pbazaar.com//en/1280-sft-flat-4th-floor,https://pbazaar.com//en/small-flat-for-rent-191,https://pbazaar.com//en/flat-rent-90,https://pbazaar.com//en/back-side-flat-rent-11,https://pbazaar.com//en/2-bed-for-rent-32,https://pbazaar.com//en/nice-flat-for-rent-937,https://pbazaar.com//en/2-bed-for-rent-33,https://pbazaar.com//en/semi-furnished-flat-re...,https://pbazaar.com//en/nice-flat-for-rent-1011,https://pbazaar.com//en/nice-flat-for-rent-1012
purpose,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,...,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent


In [38]:
# df_pbazaar.describe(include='all').T

df_pbazaar_new.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area,14910.0,,,,1797.250838,4026.719658,1.0,950.0,1340.0,2100.0,296600.0
building_type,17093.0,7.0,Apartment,13121.0,,,,,,,
building_nature,17093.0,2.0,Residential,14353.0,,,,,,,
num_bath_rooms,17093.0,,,,2.193764,1.553784,0.0,1.0,2.0,3.0,36.0
num_bed_rooms,17093.0,,,,2.189434,1.492289,0.0,1.0,3.0,3.0,36.0
price,16025.0,,,,2249090321.351024,191152217993.0036,1.0,17000.0,35000.0,180000.0,24000000000000.0
property_description,17093.0,1.0,,17093.0,,,,,,,
property_overview,17093.0,1.0,,17093.0,,,,,,,
property_url,17093.0,17093.0,https://pbazaar.com//en/200-sft-commercial-spa...,1.0,,,,,,,
purpose,17093.0,3.0,Rent,13293.0,,,,,,,


In [39]:
# Save cleaned dataset to csv
df_pbazaar_new.to_csv(f"{cleaned_data_folder}/pbazaar-cleaned-Umesh.csv", index=False)

<br>

## Create second CSV with other info

The reason behind having these extra columns (even though they are not particularly amenities of the property) is that it seems that some of them could influence the price. Moreover, this is extra information that might be useful in future; instead of throwing it away, we just keep it for the time being (it's easy to delete these columns if they don't turn out to be useful).
### (i) Columns from the raw data file as '_extraInfo' columns

In [40]:
# _extraInfo:
# 
# df_pbazaar['dining'].unique(): array([nan, 'Space', 'Room'], dtype=object)
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['floor'].unique(): (float64) floor number.. (1st, 2nd, third etc.)
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['floor_type'].unique(): a mixed collection of floor type and floor number
#         [nan, 'Tiled', 'Mosaic', ' Other', 'Ready', 'Marble', 'Ground',
#            'Ground Floor', '12', '10', '1st', '2nd', '3rd', '13', '15', '3',
#            '5', '9', '14', '4', '8', '4th', '5th', '1', '7', '2', '6', '18',
#            '16', '20', 'Raw', '1627', '7th', '19', '8th', '11', '22']
#  ==> append to the end as with '_extraInfo'
# 
# df_pbazaar['living'].unique(): array([nan, 'Space', 'Room'], dtype=object)
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['road_width_ft'].unique(): (float64) number with width in feet
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['total_floor'].unique(): (float64) number with possibly total no. of floors in the building
#  ==> append to the end as '_extraInfo'
# 
# df_pbazaar['view'].unique(): a string possibly describing the view from the apartment
#             [nan, ' Land View', 'South facing', 'East facing', 'Corner plot',
#                'West facing', 'North facing', 'Park view house or plot',
#                'Lake view house or plot']
#  ==> append to the end as '_extraInfo'


# Add the 'price_info' column that was deleted earlier
df_pbazaar_new['price_info_extraInfo'] = col_price_info

df_pbazaar_new['dining_extraInfo'] = df_pbazaar['dining']

df_pbazaar_new['floor_extraInfo'] = df_pbazaar['floor']

df_pbazaar_new['floor_type_extraInfo'] = df_pbazaar['floor_type']

df_pbazaar_new['living_extraInfo'] = df_pbazaar['living']

df_pbazaar_new['road_width_ft_extraInfo'] = df_pbazaar['road_width_ft']

df_pbazaar_new['total_floor_extraInfo'] = df_pbazaar['total_floor']

df_pbazaar_new['view_extraInfo'] = df_pbazaar['view']

In [41]:
df_pbazaar_new.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area,14910.0,,,,1797.250838,4026.719658,1.0,950.0,1340.0,2100.0,296600.0
building_type,17093.0,7.0,Apartment,13121.0,,,,,,,
building_nature,17093.0,2.0,Residential,14353.0,,,,,,,
num_bath_rooms,17093.0,,,,2.193764,1.553784,0.0,1.0,2.0,3.0,36.0
num_bed_rooms,17093.0,,,,2.189434,1.492289,0.0,1.0,3.0,3.0,36.0
price,16025.0,,,,2249090321.351024,191152217993.0036,1.0,17000.0,35000.0,180000.0,24000000000000.0
property_description,17093.0,1.0,,17093.0,,,,,,,
property_overview,17093.0,1.0,,17093.0,,,,,,,
property_url,17093.0,17093.0,https://pbazaar.com//en/200-sft-commercial-spa...,1.0,,,,,,,
purpose,17093.0,3.0,Rent,13293.0,,,,,,,


In [42]:
# Save cleaned dataset to csv
df_pbazaar_new.to_csv(f"{cleaned_data_folder}/pbazaar-cleaned_WithSomeExtraInfo-Umesh.csv", index=False)

In [43]:
# Open the saved CSVs and check the data
df_test = pd.read_csv(f"{cleaned_data_folder}/pbazaar-cleaned-Umesh.csv")
df_test.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area,14910.0,,,,1797.250838,4026.719658,1.0,950.0,1340.0,2100.0,296600.0
building_type,17093.0,7.0,Apartment,13121.0,,,,,,,
building_nature,17093.0,2.0,Residential,14353.0,,,,,,,
num_bath_rooms,17093.0,,,,2.193764,1.553784,0.0,1.0,2.0,3.0,36.0
num_bed_rooms,17093.0,,,,2.189434,1.492289,0.0,1.0,3.0,3.0,36.0
price,16025.0,,,,2249090321.351024,191152217993.0036,1.0,17000.0,35000.0,180000.0,24000000000000.0
property_description,0.0,,,,,,,,,,
property_overview,0.0,,,,,,,,,,
property_url,17093.0,17093.0,https://pbazaar.com//en/200-sft-commercial-spa...,1.0,,,,,,,
purpose,17081.0,2.0,Rent,13293.0,,,,,,,


In [44]:
# Open the saved CSVs and check the data
df_test = pd.read_csv(f"{cleaned_data_folder}/pbazaar-cleaned_WithSomeExtraInfo-Umesh.csv")
df_test.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area,14910.0,,,,1797.250838,4026.719658,1.0,950.0,1340.0,2100.0,296600.0
building_type,17093.0,7.0,Apartment,13121.0,,,,,,,
building_nature,17093.0,2.0,Residential,14353.0,,,,,,,
num_bath_rooms,17093.0,,,,2.193764,1.553784,0.0,1.0,2.0,3.0,36.0
num_bed_rooms,17093.0,,,,2.189434,1.492289,0.0,1.0,3.0,3.0,36.0
price,16025.0,,,,2249090321.351024,191152217993.0036,1.0,17000.0,35000.0,180000.0,24000000000000.0
property_description,0.0,,,,,,,,,,
property_overview,0.0,,,,,,,,,,
property_url,17093.0,17093.0,https://pbazaar.com//en/200-sft-commercial-spa...,1.0,,,,,,,
purpose,17081.0,2.0,Rent,13293.0,,,,,,,
