This notebook goal is to merge all cleaned csv files.

## TOC
1. [Importing cleaned csv files](#importing-cleaned-csvs)
1. [Merging cleaned csvs in one master dataset](#merging-cleaned-csvs)
    1. [Merging `dbstall`](#merging-bdstall)
    1. [Merging `toleter`](#merging-toleter)
    1. [Merging `rentalhomebd`](#merging-rentalhomebd)
    1. [Merging `pbazaar`](#merging-pbazaar)
    1. [Merging `flatforsale`](#merging-flatforsale)
    1. [Merging `iqibd`](#merging-iqibd)
    1. [Merging `btib_df`](#merging-btib_df)
1. [Tidying merged csvs](#tidying-merged-csvs)
1. [Creating csv without amenities](#creating-csv-without-amenities)
1. [Creating csv with amenities](#creating-csv-with-amenities)

In [1]:
import pandas as pd
import numpy as np

from googletrans import Translator, constants # https://www.thepythoncode.com/article/translate-text-in-python

import os
import sys

sys.path.append("../functions")
from amenity_identifier import *



In [2]:
is_maintenance_or_cleaning_amenity("emergency exist")

False

In [3]:
# CSV folders

cleaned_data_folder="../../../data/CLeaned_Data"
merged_data_folder="../../../data/Merged_Data"

bproperty_folder= f"{cleaned_data_folder}/bproperty"
bdstall_folder= f"{cleaned_data_folder}/bdstall"
toleter_folder= f"{cleaned_data_folder}/toleter"
rentalhomebd_folder= f"{cleaned_data_folder}/rentalhomebd"
pbazaar_folder= f"{cleaned_data_folder}/pbazaar"
flatforsale_folder= f"{cleaned_data_folder}/flatforsaleindhaka_spider" 
iqibd_folder= f"{cleaned_data_folder}/iqibd"
btibrokeragebd_folder= f"{cleaned_data_folder}/btibrokeragebd"

<span id="importing-cleaned-csvs" ></span>

## Importing the cleaned csv files

In [4]:
# Importing bproperty cleaned csv
bproperty_df = pd.read_csv(f"{bproperty_folder}/cleaned_bproperty.csv")
bproperty_df.head(3).T

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,0,1,2
area,1185.0,2464.0,1140.0
building_type,Apartment,Apartment,Apartment
building_nature,Residential,Residential,Residential
image_url,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...
num_bath_rooms,0,4,0
num_bed_rooms,3,3,3
price,6100000.0,28900000.0,7500000.0
property_description,Grab This 1185 Sq Ft Beautiful Flat Is Vacant ...,A Vibrant 2464 Sq Ft Residential Flat For Sale...,1140 Sq Ft Nicely Planned Apartment Is Availab...
property_overview,This flat consists of facilities you can think...,Ready to move in somewhere with everything nea...,A spacious 1140 Square Feet apartment in Mirp...
property_url,https://www.bproperty.com/en/property/details-...,https://www.bproperty.com/en/property/details-...,https://www.bproperty.com/en/property/details-...


In [5]:
bproperty_df.shape

(17329, 51)

In [6]:
bproperty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17329 entries, 0 to 17328
Data columns (total 51 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   area                              17329 non-null  float64
 1   building_type                     17329 non-null  object 
 2   building_nature                   17329 non-null  object 
 3   image_url                         17312 non-null  object 
 4   num_bath_rooms                    17329 non-null  int64  
 5   num_bed_rooms                     17329 non-null  int64  
 6   price                             17329 non-null  float64
 7   property_description              17329 non-null  object 
 8   property_overview                 17329 non-null  object 
 9   property_url                      17329 non-null  object 
 10  purpose                           17329 non-null  object 
 11  city                              17329 non-null  object 
 12  loca

In [7]:
# Creating an id column for the dataframe
bproperty_df["id"] = bproperty_df.index.values
bproperty_df["id"] = bproperty_df["id"].apply(lambda x: "bproperty-"+str(x) )
bproperty_df["id"]

0            bproperty-0
1            bproperty-1
2            bproperty-2
3            bproperty-3
4            bproperty-4
              ...       
17324    bproperty-17324
17325    bproperty-17325
17326    bproperty-17326
17327    bproperty-17327
17328    bproperty-17328
Name: id, Length: 17329, dtype: object

In [8]:
# Importing bdstall cleaned csv
bdstall_df = pd.read_csv(f"{bdstall_folder}/Farjana_bdstall-cleaned-Umesh.csv")
bdstall_df.head(3).T

Unnamed: 0,0,1,2
area,1350.0,1250.0,1252.0
building_type,Apartment,Apartment,Apartment
building_nature,,,
num_bath_rooms,3.0,3.0,3.0
num_bed_rooms,3.0,3.0,3.0
price,1500000,2500000,5200000
property_description,Shares of 1350 Sqft land share in Banasree M B...,Shares of 1250 Sqft land will be sold on the o...,A 1252 Sqft flat will be sold in Arshi Nagar a...
property_overview,,,
property_url,https://www.bdstall.com/details/near-hatirjhee...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/near-mohammadp...
purpose,Sale,Sale,Sale


In [9]:
bdstall_df.shape

(58, 13)

In [10]:
bdstall_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   area                  58 non-null     float64
 1   building_type         58 non-null     object 
 2   building_nature       0 non-null      float64
 3   num_bath_rooms        58 non-null     float64
 4   num_bed_rooms         58 non-null     float64
 5   price                 58 non-null     int64  
 6   property_description  58 non-null     object 
 7   property_overview     0 non-null      float64
 8   property_url          58 non-null     object 
 9   purpose               56 non-null     object 
 10  city                  20 non-null     object 
 11  locality              26 non-null     object 
 12  address               49 non-null     object 
dtypes: float64(5), int64(1), object(7)
memory usage: 6.0+ KB


In [11]:
# Creating an id column for the dataframe
bdstall_df["id"] = bdstall_df.index.values
bdstall_df["id"] = bdstall_df["id"].apply(lambda x: "bdstall-"+str(x) )
bdstall_df["id"]

0      bdstall-0
1      bdstall-1
2      bdstall-2
3      bdstall-3
4      bdstall-4
5      bdstall-5
6      bdstall-6
7      bdstall-7
8      bdstall-8
9      bdstall-9
10    bdstall-10
11    bdstall-11
12    bdstall-12
13    bdstall-13
14    bdstall-14
15    bdstall-15
16    bdstall-16
17    bdstall-17
18    bdstall-18
19    bdstall-19
20    bdstall-20
21    bdstall-21
22    bdstall-22
23    bdstall-23
24    bdstall-24
25    bdstall-25
26    bdstall-26
27    bdstall-27
28    bdstall-28
29    bdstall-29
30    bdstall-30
31    bdstall-31
32    bdstall-32
33    bdstall-33
34    bdstall-34
35    bdstall-35
36    bdstall-36
37    bdstall-37
38    bdstall-38
39    bdstall-39
40    bdstall-40
41    bdstall-41
42    bdstall-42
43    bdstall-43
44    bdstall-44
45    bdstall-45
46    bdstall-46
47    bdstall-47
48    bdstall-48
49    bdstall-49
50    bdstall-50
51    bdstall-51
52    bdstall-52
53    bdstall-53
54    bdstall-54
55    bdstall-55
56    bdstall-56
57    bdstall-57
Name: id, dtyp

In [12]:
# Importing toleter cleaned csv
toleter_df = pd.read_csv(f"{toleter_folder}/Farjana_toleter-cleaned-Umesh.csv")
toleter_df.head(3).T

Unnamed: 0,0,1,2
area,,,4035.0
building_type,Apartment,Apartment,Apartment
building_nature,,,
num_bath_rooms,1.0,1.0,1.0
num_bed_rooms,1.0,1.0,1.0
price,31000.0,36000.0,36000.0
property_description,"Short-Term Rentals are available with Kitchen,...","Short-Term Rentals with Kitchen, TV, Refrigera...",Two Room Furnished Serviced Apartment RENT in ...
property_overview,"Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 375 Sq. Meter,Floors..."
property_url,https://www.toleter.com/property/bd1683/,https://www.toleter.com/property/bd1635/,https://www.toleter.com/property/bd1650/
purpose,Rent,Rent,Rent


In [13]:
toleter_df.shape

(258, 16)

In [14]:
toleter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   area                             169 non-null    float64
 1   building_type                    258 non-null    object 
 2   building_nature                  0 non-null      float64
 3   num_bath_rooms                   258 non-null    float64
 4   num_bed_rooms                    258 non-null    float64
 5   price                            258 non-null    float64
 6   property_description             251 non-null    object 
 7   property_overview                224 non-null    object 
 8   property_url                     258 non-null    object 
 9   purpose                          258 non-null    object 
 10  city                             40 non-null     object 
 11  locality                         161 non-null    object 
 12  address               

In [15]:
# Creating an id column for the dataframe
toleter_df["id"] = toleter_df.index.values
toleter_df["id"] = toleter_df["id"].apply(lambda x: "toleter-"+str(x) )
toleter_df["id"]

0        toleter-0
1        toleter-1
2        toleter-2
3        toleter-3
4        toleter-4
          ...     
253    toleter-253
254    toleter-254
255    toleter-255
256    toleter-256
257    toleter-257
Name: id, Length: 258, dtype: object

In [16]:
# Importing rentalhomebd cleaned csv
rentalhomebd_df = pd.read_csv(f"{rentalhomebd_folder}/cleaned_rentalhomebd.csv")
rentalhomebd_df.head(3).T

Unnamed: 0,0,1,2
title,A FULLY FURNISHED APARTMENT @Gulshan,A furnished apartment of 3800 SQFT @Gulshan,A furnished apartment of 2300SQFT @Gulshan area
num_bed_rooms,3 bd,3 bd,3 bd
num_bath_rooms,4 ba,3 ba,3 ba
area,2800.0,3800.0,2300.0
building_type,Apartment/Flats,Apartment/Flats,Apartment/Flats
purpose,Rent,Rent,Rent
price,210000.0,190000.0,170000.0
property_description,The furnished apartment is now up for rent tha...,The furnished apartment is now up for rent tha...,A 2300 SQFT nicely viewed apartment at Gulshan...
property_url,http://www.rentalhomebd.com/properties/a-fully...,http://www.rentalhomebd.com/properties/a-furni...,http://www.rentalhomebd.com/properties/a-furni...
wifi-amenity,No,No,No


In [17]:
rentalhomebd_df.shape

(441, 52)

In [18]:
rentalhomebd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 52 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          441 non-null    object 
 1   num_bed_rooms                  441 non-null    object 
 2   num_bath_rooms                 441 non-null    object 
 3   area                           441 non-null    float64
 4   building_type                  441 non-null    object 
 5   purpose                        441 non-null    object 
 6   price                          441 non-null    float64
 7   property_description           441 non-null    object 
 8   property_url                   441 non-null    object 
 9   wifi-amenity                   441 non-null    object 
 10  landscape-garden-amenity       441 non-null    object 
 11  wasa-amenity                   441 non-null    object 
 12  internet-amenity               441 non-null    obj

In [19]:
rentalhomebd_df["num_bath_rooms"].unique()

array([' 4 ba', ' 3 ba', ' 5 ba', ' 0 ba', ' 2 ba', ' 7 ba', ' 6 ba',
       ' 9 ba', ' 1 ba'], dtype=object)

In [20]:
rentalhomebd_df["num_bed_rooms"].unique()

array([' 3 bd', ' 5 bd', ' 4 bd', ' 0 bd', ' 2 bd', ' 1 bd', ' 8 bd'],
      dtype=object)

`num_bath_rooms` and `num_bath_rooms` are string. They should be converted to int.

In [21]:
# Removing the units (ba, bd, ...) in num_bed_rooms and num_bath_rooms
rentalhomebd_df["num_bed_rooms"] = rentalhomebd_df["num_bed_rooms"].apply(lambda x: x.split(" ")[1] )
rentalhomebd_df["num_bath_rooms"] = rentalhomebd_df["num_bath_rooms"].apply(lambda x: x.split(" ")[1] )

# Converting num_bed_rooms and num_bath_rooms to integer
rentalhomebd_df["num_bed_rooms"] = rentalhomebd_df["num_bed_rooms"].astype(int)
rentalhomebd_df["num_bath_rooms"] = rentalhomebd_df["num_bath_rooms"].astype(int)

# Checking type conversion was succesful
rentalhomebd_df["num_bed_rooms"].dtype, rentalhomebd_df["num_bath_rooms"].dtype

(dtype('int32'), dtype('int32'))

In [22]:
# Creating an id column for the dataframe
rentalhomebd_df["id"] = rentalhomebd_df.index.values
rentalhomebd_df["id"] = rentalhomebd_df["id"].apply(lambda x: "rentalhomebd-"+str(x) )
rentalhomebd_df["id"]

0        rentalhomebd-0
1        rentalhomebd-1
2        rentalhomebd-2
3        rentalhomebd-3
4        rentalhomebd-4
             ...       
436    rentalhomebd-436
437    rentalhomebd-437
438    rentalhomebd-438
439    rentalhomebd-439
440    rentalhomebd-440
Name: id, Length: 441, dtype: object

In [23]:
# Importing pbazaar cleaned csv
pbazaar_df = pd.read_csv(f"{pbazaar_folder}/pbazaar-cleaned_WithSomeExtraInfo-Umesh.csv")
pbazaar_df.head(3).T

Unnamed: 0,0,1,2
area,200.0,10.0,150.0
building_type,Shop,Shop,Shop
building_nature,Commercial,Commercial,Commercial
num_bath_rooms,0.0,0.0,0.0
num_bed_rooms,0.0,0.0,0.0
price,8000.0,833.3,13999.5
property_description,,,
property_overview,,,
property_url,https://pbazaar.com//en/200-sft-commercial-spa...,https://pbazaar.com//en/120-sft-shop-rent-at-m...,https://pbazaar.com//en/150-sft-shop-rent-at-m...
purpose,Rent,Rent,Rent


In [24]:
# Creating an id column for the dataframe
pbazaar_df["id"] = pbazaar_df.index.values
pbazaar_df["id"] = pbazaar_df["id"].apply(lambda x: "pbazaar-"+str(x) )
pbazaar_df["id"]

0            pbazaar-0
1            pbazaar-1
2            pbazaar-2
3            pbazaar-3
4            pbazaar-4
             ...      
17088    pbazaar-17088
17089    pbazaar-17089
17090    pbazaar-17090
17091    pbazaar-17091
17092    pbazaar-17092
Name: id, Length: 17093, dtype: object

In [25]:
# Importing flatforsale cleaned csv
flatforsale_df = pd.read_csv(f"{flatforsale_folder}/cleaned_flatforsaleindhaka_spider.csv")
flatforsale_df.head(3).T

Unnamed: 0,0,1,2
city,Dhaka,Dhaka,Dhaka
locality,Motijheel,,Baridhara
address,Old Town,Green Road,Kalachadpur
area,100.0,1150.0,850.0
building_type,Commerical - Other,Apartment,Apartment
building_nature,Commercial,Residential,Residential
num_bath_rooms,0.0,3.0,2.0
num_bed_rooms,0.0,3.0,2.0
price,2000000.0,6900000.0,3357500.0
purpose,Sale,Sale,Sale


In [26]:
flatforsale_df.shape

(25, 13)

In [27]:
flatforsale_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    25 non-null     object 
 1   locality                24 non-null     object 
 2   address                 12 non-null     object 
 3   area                    25 non-null     float64
 4   building_type           25 non-null     object 
 5   building_nature         25 non-null     object 
 6   num_bath_rooms          25 non-null     float64
 7   num_bed_rooms           25 non-null     float64
 8   price                   25 non-null     float64
 9   purpose                 25 non-null     object 
 10  parking-spaces-amenity  25 non-null     float64
 11  property_description    0 non-null      float64
 12  property_overview       0 non-null      float64
dtypes: float64(7), object(6)
memory usage: 2.7+ KB


In [28]:
# Creating an id column for the dataframe
flatforsale_df["id"] = flatforsale_df.index.values
flatforsale_df["id"] = flatforsale_df["id"].apply(lambda x: "flatforsale-"+str(x) )
flatforsale_df["id"]

0      flatforsale-0
1      flatforsale-1
2      flatforsale-2
3      flatforsale-3
4      flatforsale-4
5      flatforsale-5
6      flatforsale-6
7      flatforsale-7
8      flatforsale-8
9      flatforsale-9
10    flatforsale-10
11    flatforsale-11
12    flatforsale-12
13    flatforsale-13
14    flatforsale-14
15    flatforsale-15
16    flatforsale-16
17    flatforsale-17
18    flatforsale-18
19    flatforsale-19
20    flatforsale-20
21    flatforsale-21
22    flatforsale-22
23    flatforsale-23
24    flatforsale-24
Name: id, dtype: object

In [29]:
# Importing iqibd cleaned csv
iqibd_df = pd.read_csv(f"{iqibd_folder}/cleaned_iqibd_Sunitha.csv")
iqibd_df.head(3).T

Unnamed: 0,0,1,2
air_conditioning-amenity,,,
area,1380.0,1470.0,1500.0
balcony-or-terrace-amenity,,,
building_type,Apartment,Apartment,Apartment
cctv-security-amenity,yes,,yes
central-heating-amenity,,,
elevators-in-building-amenity,yes,yes,yes
emergency_stairs-amenity,yes,yes,yes
fitness_center-amenity,,,
floor,5th,9th,4th


In [30]:
iqibd_df.shape

(261, 37)

In [31]:
iqibd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 37 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   air_conditioning-amenity       48 non-null     object 
 1   area                           261 non-null    float64
 2   balcony-or-terrace-amenity     19 non-null     object 
 3   building_type                  261 non-null    object 
 4   cctv-security-amenity          214 non-null    object 
 5   central-heating-amenity        5 non-null      object 
 6   elevators-in-building-amenity  253 non-null    object 
 7   emergency_stairs-amenity       183 non-null    object 
 8   fitness_center-amenity         20 non-null     object 
 9   floor                          260 non-null    object 
 10  garage                         199 non-null    float64
 11  lawn-or-garden-amenity         8 non-null      object 
 12  generator-amenity              257 non-null    obj

In [32]:
# Creating an id column for the dataframe
iqibd_df["id"] = iqibd_df.index.values
iqibd_df["id"] = iqibd_df["id"].apply(lambda x: "iqibd-"+str(x) )
iqibd_df["id"]

0        iqibd-0
1        iqibd-1
2        iqibd-2
3        iqibd-3
4        iqibd-4
         ...    
256    iqibd-256
257    iqibd-257
258    iqibd-258
259    iqibd-259
260    iqibd-260
Name: id, Length: 261, dtype: object

In [33]:
# Importing btibrokeragebd cleaned csv
btib_df = pd.read_csv(f"{btibrokeragebd_folder}/cleaned_btibrokeragebd.csv")
btib_df.head(3).T

Unnamed: 0,0,1,2
area,1517.0,935.0,1636.0
building_type,,,
building_nature,Residential,Residential,Residential
image_url,,,
num_bath_rooms,3.0,2.0,4.0
num_bed_rooms,3.0,2.0,3.0
price,12000000.0,5700000.0,14000000.0
property_description,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...
property_overview,,,
property_url,https://btibrokeragebd.com/property/1517-sft-a...,https://btibrokeragebd.com/property/935-sft-ap...,https://btibrokeragebd.com/property/1636-sft-a...


In [34]:
# Creating an id column for the dataframe
btib_df["id"] = btib_df.index.values
btib_df["id"] = btib_df["id"].apply(lambda x: "btibrokeragebd-"+str(x) )
btib_df["id"]

0        btibrokeragebd-0
1        btibrokeragebd-1
2        btibrokeragebd-2
3        btibrokeragebd-3
4        btibrokeragebd-4
              ...        
176    btibrokeragebd-176
177    btibrokeragebd-177
178    btibrokeragebd-178
179    btibrokeragebd-179
180    btibrokeragebd-180
Name: id, Length: 181, dtype: object

<span id="merging-cleaned-csvs" ></span>

## Merging cleaned csv files

In [35]:
# First step: the merged dataframe is based on the overall structure of bproperty, which is the master dataset
merged_df = bproperty_df.copy()
merged_df.shape

(17329, 52)

<span id="merging-bdstall" ></span>

### Merging `bdstall_df` to master dataset

In [36]:
bdstall_df.shape

(58, 14)

In [37]:
bdstall_df.head(3).T

Unnamed: 0,0,1,2
area,1350.0,1250.0,1252.0
building_type,Apartment,Apartment,Apartment
building_nature,,,
num_bath_rooms,3.0,3.0,3.0
num_bed_rooms,3.0,3.0,3.0
price,1500000,2500000,5200000
property_description,Shares of 1350 Sqft land share in Banasree M B...,Shares of 1250 Sqft land will be sold on the o...,A 1252 Sqft flat will be sold in Arshi Nagar a...
property_overview,,,
property_url,https://www.bdstall.com/details/near-hatirjhee...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/near-mohammadp...
purpose,Sale,Sale,Sale


**Remark**: Looking at the variables in `bdstall`, we can confirm that it can be merged to the `merged_df` since the two datasets have similar variables.

In [38]:
merged_df.shape

(17329, 52)

In [39]:
bdstall_df.shape

(58, 14)

In [40]:
merged_df.shape[0] + bdstall_df.shape[0]

17387

In [41]:
merged_df = pd.concat([merged_df, bdstall_df],ignore_index=True)
merged_df.shape

(17387, 52)

<span id="merging-toleter" ></span>

### Merging `toleter_df` to master dataset

In [42]:
toleter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   area                             169 non-null    float64
 1   building_type                    258 non-null    object 
 2   building_nature                  0 non-null      float64
 3   num_bath_rooms                   258 non-null    float64
 4   num_bed_rooms                    258 non-null    float64
 5   price                            258 non-null    float64
 6   property_description             251 non-null    object 
 7   property_overview                224 non-null    object 
 8   property_url                     258 non-null    object 
 9   purpose                          258 non-null    object 
 10  city                             40 non-null     object 
 11  locality                         161 non-null    object 
 12  address               

**Remark**: The variables here are pretty similar to those of the master dataset. The merge should be able to proceed successfully.

In [43]:
toleter_df.shape

(258, 17)

In [44]:
merged_df.shape[0] + toleter_df.shape[0]

17645

In [45]:
merged_df = pd.concat([merged_df, toleter_df],ignore_index=True)
merged_df.shape

(17645, 53)

<span id="merging-rentalhomebd" ></span>

### Merging `rentalhomebd_df` to master dataset

In [46]:
rentalhomebd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 53 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          441 non-null    object 
 1   num_bed_rooms                  441 non-null    int32  
 2   num_bath_rooms                 441 non-null    int32  
 3   area                           441 non-null    float64
 4   building_type                  441 non-null    object 
 5   purpose                        441 non-null    object 
 6   price                          441 non-null    float64
 7   property_description           441 non-null    object 
 8   property_url                   441 non-null    object 
 9   wifi-amenity                   441 non-null    object 
 10  landscape-garden-amenity       441 non-null    object 
 11  wasa-amenity                   441 non-null    object 
 12  internet-amenity               441 non-null    obj

In [47]:
rentalhomebd_df.shape

(441, 53)

In [48]:
merged_df.shape[0] + rentalhomebd_df.shape[0]

18086

In [49]:
merged_df = pd.concat([merged_df, rentalhomebd_df],ignore_index=True)
merged_df.shape

(18086, 87)

<span id="merging-pbazaar" ></span>

### Merging `pbazaar_df` to master dataset

In [50]:
pbazaar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17093 entries, 0 to 17092
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   area                     14910 non-null  float64
 1   building_type            17093 non-null  object 
 2   building_nature          17093 non-null  object 
 3   num_bath_rooms           17093 non-null  float64
 4   num_bed_rooms            17093 non-null  float64
 5   price                    16025 non-null  float64
 6   property_description     0 non-null      float64
 7   property_overview        0 non-null      float64
 8   property_url             17093 non-null  object 
 9   purpose                  17081 non-null  object 
 10  city                     17046 non-null  object 
 11  locality                 17038 non-null  object 
 12  address                  17046 non-null  object 
 13  parking-spaces-amenity   8084 non-null   float64
 14  price_info_extraInfo  

In [51]:
pbazaar_df.shape

(17093, 23)

In [52]:
merged_df.shape[0] + pbazaar_df.shape[0]

35179

In [53]:
merged_df = pd.concat([merged_df, pbazaar_df],ignore_index=True)
merged_df.shape

(35179, 95)

<span id="merging-flatforsale" ></span>

### Merging `flatforsale_df` to master dataset

In [54]:
flatforsale_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    25 non-null     object 
 1   locality                24 non-null     object 
 2   address                 12 non-null     object 
 3   area                    25 non-null     float64
 4   building_type           25 non-null     object 
 5   building_nature         25 non-null     object 
 6   num_bath_rooms          25 non-null     float64
 7   num_bed_rooms           25 non-null     float64
 8   price                   25 non-null     float64
 9   purpose                 25 non-null     object 
 10  parking-spaces-amenity  25 non-null     float64
 11  property_description    0 non-null      float64
 12  property_overview       0 non-null      float64
 13  id                      25 non-null     object 
dtypes: float64(7), object(7)
memory usage: 2.9+ 

In [55]:
flatforsale_df.shape

(25, 14)

In [56]:
merged_df.shape[0] + flatforsale_df.shape[0]

35204

In [57]:
merged_df = pd.concat([merged_df, flatforsale_df],ignore_index=True)
merged_df.shape

(35204, 95)

<span id="merging-iqibd" ></span>

### Merging `iqibd_df` to master dataset

In [58]:
iqibd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   air_conditioning-amenity       48 non-null     object 
 1   area                           261 non-null    float64
 2   balcony-or-terrace-amenity     19 non-null     object 
 3   building_type                  261 non-null    object 
 4   cctv-security-amenity          214 non-null    object 
 5   central-heating-amenity        5 non-null      object 
 6   elevators-in-building-amenity  253 non-null    object 
 7   emergency_stairs-amenity       183 non-null    object 
 8   fitness_center-amenity         20 non-null     object 
 9   floor                          260 non-null    object 
 10  garage                         199 non-null    float64
 11  lawn-or-garden-amenity         8 non-null      object 
 12  generator-amenity              257 non-null    obj

In [59]:
iqibd_df["floor"].unique()

array(['5th', '9th', '4th', '6th', '8th', '3rd', '1st', '2nd',
       '8th & 9th', '7th', '6th & 7th', '1st,2nd,3rd,4th,5th,6th,7th,8th',
       '2nd & 5th', '4th,6th,8th & 10th', '3rd,7th & 9th', '1st & 9th',
       '5th & 6th', '1st,3rd,5th & 6th', '2nd,4th,6th,8th & 9th',
       '2nd & 6th', '5tt & 8th', '1st & 8th', '2nd,6th,7th',
       '2nd,4th,6th.8th', '3rd & 4th', '1st,7th', '1sr & 3rd', '3rd,5th',
       '3', '5th,6th,7th,9th', '1st,8th', '3rd,5th,7th', '1st,6th',
       '4th,5th,6th', '3rd,4th', '3rd,5th,6th', '1st & 7th', '12th',
       '2nd,4th,6th', '2nd,3rd,7th,8th', '5th,7th', '9', '8', '7',
       '1st, 5th, 7th', '2nd, 3rd, 5th', '3rd, 7th', '1st-6th',
       '2nd, 4th', '1st,2nd,3rd', '11th', '2nd & 3rd', nan, '7th,9th',
       '4th,5th', '1st & 6th', '2nd,3rd,6th,8th', '1st, 6th, 8th',
       '2nd,4th,6th,7th', '1st, 8th', '1st & 2nd'], dtype=object)

In [60]:
iqibd_df["unit"].unique()

array(['Single Unit', '9A & 9B', '4A', 'C6', nan, '8A', 'A4,B4', '4C',
       'A3', 'B1', 'G9', 'B6', '4B', '9A', '6c', '6A', '2C', '2B',
       'Single', 'c5', '5C', 'A6', 'D-2 & E-5',
       'A-4,B-4,A-10,B-10,C-8,C-6', '7B', 'J1', 'C7', 'E6',
       '3A,3B,7A,7B,9A,9B', 'A1', '5B', 'A-9', '9B', '5A', 'B-1', '8B',
       'A5 & C8', 'A1, A8, B8', 'E1', 'A8', 'D1', 'A7', '3B & 4B', '3B',
       '2A & 2B', 'B2', 'F8', 'C3,C5', 'A-3', 'B-5,B-6,B-7,B-8,B-9',
       'D1,D8', 'D3,D5,D7', '4A,5A,6A', '4B,5B,6B', '3B,5B,6B', 'A1,A7',
       '02', '3B,5A,5B,7A,7B', 'A4', 'A5', 'B12', 'B-12', '2B,4B,6B',
       '2B,3B,7B,8B', '2A,3A,7A,8A', '2A,4A,6A', '5A,5B,7A', 'B5', 'C1',
       'A9', '2E', '3D', 'A5 & C5', '9C', '7D', '3A', '7A', '7A & 7C',
       '7B & 7D', 'A1, A5, A7', '2B, 3A, 3B, 5A', '3C', '10A',
       '3A, 3B, 7A,7B', '6B', '2A', '2B,4A,4B', '7C', '11A', '6C',
       'A9,B7', 'C8', '12J', '2A & 6B',
       '2(A,B,C)3(A,B,C)6(A,B,C)8(A,B,C)', 'D9', 'A2', 'C1, C6, C8',
       '2B,4A,

In [61]:
merged_df["floor-level-amenity"].unique()

array(['yes', nan], dtype=object)

In [62]:
iqibd_df["balcony-or-terrace-amenity"].unique()

array([nan, 'yes'], dtype=object)

In [63]:
iqibd_df["num_balcony"].unique()

array([ 4.,  3.,  2., nan,  7.,  5.,  1.])

In [64]:
bproperty_df["balcony-or-terrace-amenity"].unique()

array(['yes', nan], dtype=object)

In [65]:
# iqibd_df.drop(["balcony-or-terrace-amenity"],inplace=True, axis=1)

In [66]:
# # Renaming some column
# iqibd_df.rename(columns={
#     "num_balcony":"balcony-or-terrace-amenity"
# })

In [67]:
# bproperty_df.columns.to_list()

In [68]:
iqibd_df.shape

(261, 38)

In [69]:
merged_df.shape[0] + iqibd_df.shape[0]

35465

In [70]:
merged_df = pd.concat([merged_df, iqibd_df],ignore_index=True)
merged_df.shape

(35465, 112)

<span id="merging-btib_df" ></span>

### Merging `btib_df` to master dataset

In [71]:
btib_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 32 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   area                              181 non-null    float64
 1   building_type                     0 non-null      float64
 2   building_nature                   181 non-null    object 
 3   image_url                         0 non-null      float64
 4   num_bath_rooms                    181 non-null    float64
 5   num_bed_rooms                     181 non-null    float64
 6   price                             181 non-null    float64
 7   property_description              180 non-null    object 
 8   property_overview                 0 non-null      float64
 9   property_url                      181 non-null    object 
 10  purpose                           181 non-null    object 
 11  city                              181 non-null    object 
 12  locality

In [72]:
btib_df.rename(columns={
    "Lift-amenity":"lift-amenity",
    "Generator-amenity":"generator-amenity",
    "Gas Supply-amenity":"gas_supply-amenity",
    "Built-in Kitchen Cabinet-amenity": "built-in-kitchen-and-cabinet-amenity",
    "Reception Desk & Lobby-amenity":"reception-desk-and-lobby-amenity",
    "Tiled Floor-amenity":"tiled-floor-amenity",
    "Dry Kitchen-amenity":"dry-kitchen-amenity",
    "Parking-amenity":"parking-spaces-amenity",
    "Prayer Room-amenity":"prayer-room-amenity",
    "Cable TV Provision-amenity ":"cable-tv-provision-amenity",
    "Intercom-amenity":"intercom-amenity",
    "Filtered Water-amenity":"filtered-water-amenity",
    "Open Terrace-amenity":"balcony-or-terrace-amenity",
    "Electricity Supply-amenity":"electricity-supply-amenity",
    "Air-Condition Provision-amenity":"air-condition-and-provision-amenity",
    "Fire Extinguisher-amenity":"fire-extinguisher-amenity"
},inplace=True)

In [73]:
btib_df.shape

(181, 32)

In [74]:
merged_df.shape[0] + btib_df.shape[0]

35646

In [75]:
merged_df = pd.concat([merged_df, btib_df],ignore_index=True)
merged_df.shape

(35646, 124)

<span id="tidying-merged-csvs" ></span>

## Tidying the merged csvs

In [76]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35646 entries, 0 to 35645
Columns: 124 entries, area to gas_supply-amenity
dtypes: float64(14), object(110)
memory usage: 33.7+ MB


The datatypes of the column are good. Now we are going to explore the content of the columns.

In [77]:
print(merged_df.columns.to_list())

['area', 'building_type', 'building_nature', 'image_url', 'num_bath_rooms', 'num_bed_rooms', 'price', 'property_description', 'property_overview', 'property_url', 'purpose', 'city', 'locality', 'address', 'balcony-or-terrace-amenity', 'flooring-amenity', 'parking-spaces-amenity', 'view-amenity', 'lobby-in-building-amenity', 'electricity-backup-amenity', 'elevators-in-building-amenity', 'floor-level-amenity', 'cctv-security-amenity', 'maintenance-staff-amenity', 'cleaning-services-amenity', 'service-elevators-amenity', 'intercom-amenity', 'atm-facility-amenity', 'freehold-amenity', 'broadband-internet-amenity', 'double-glazed-windows-amenity', 'storage-areas-amenity', '24-hours-concierge-amenity', 'waste-disposal-amenity', 'lawn-or-garden-amenity', 'prayer-room-amenity', 'facilities-for-disabled-amenity', 'conference-room-amenity', 'furnished-amenity', 'swimming-pool-amenity', 'steam-room-amenity', 'sauna-amenity', 'jacuzzi-amenity', 'barbeque-area-amenity', 'central-heating-amenity', '

In [78]:
merged_df.isna().sum()

area                                     2272
building_type                             181
building_nature                           316
image_url                               18334
num_bath_rooms                              0
                                        ...  
built-in-kitchen-and-cabinet-amenity    35465
filtered-water-amenity                  35465
dry-kitchen-amenity                     35465
electricity-supply-amenity              35465
gas_supply-amenity                      35465
Length: 124, dtype: int64

In [79]:
merged_df.iloc[:,:10].isna().sum()

area                     2272
building_type             181
building_nature           316
image_url               18334
num_bath_rooms              0
num_bed_rooms               0
price                    1068
property_description    17387
property_overview       18093
property_url               25
dtype: int64

In [80]:
merged_df["building_type"].unique()

array(['Apartment', 'Shop', 'Floor', 'Office', 'Building', 'Plot',
       'Duplex', 'Warehouse', 'Factory', 'Apartment/Flats',
       'Office space', 'Duplex Home', 'Showroom / Shop / Restaurant',
       'Independent House', 'Land', 'Garage', 'House', 'Commercial Space',
       'Commerical - Other', 'Apartment, Commercial',
       'Commercial property', nan], dtype=object)

In [81]:
merged_df["building_type"].value_counts()

Apartment                       26291
Office                           2156
Building                         1420
Land                             1342
Shop                             1060
Floor                             885
Plot                              809
Garage                            457
Apartment/Flats                   378
Commercial Space                  249
House                             221
Duplex                             77
Office space                       52
Warehouse                          30
Factory                            19
Duplex Home                         7
Commerical - Other                  4
Independent House                   2
Showroom / Shop / Restaurant        2
Apartment, Commercial               2
Commercial property                 2
Name: building_type, dtype: int64

**TODO**:
* define how to deal with the `building_type` whose value counts are less than 200 (drop them ? group them under the same category, for example `Commercial - other` or `Residence - Other` ?)

In [82]:
merged_df.loc[ merged_df["building_type"].isna() ]

Unnamed: 0,area,building_type,building_nature,image_url,num_bath_rooms,num_bed_rooms,price,property_description,property_overview,property_url,...,reception-desk-and-lobby-amenity,Cable TV Provision-amenity,lift-amenity,tiled-floor-amenity,Emergency Exit-amenity,built-in-kitchen-and-cabinet-amenity,filtered-water-amenity,dry-kitchen-amenity,electricity-supply-amenity,gas_supply-amenity
35465,1517.0,,Residential,,3.0,3.0,12000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1517-sft-a...,...,True,True,True,True,False,False,False,False,True,False
35466,935.0,,Residential,,2.0,2.0,5700000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/935-sft-ap...,...,False,True,True,True,True,False,False,False,True,True
35467,1636.0,,Residential,,4.0,3.0,14000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1636-sft-a...,...,True,True,True,True,True,False,False,False,True,True
35468,2520.0,,Residential,,3.0,3.0,43000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/2520-sft-a...,...,False,True,True,True,False,False,True,False,True,True
35469,1352.0,,Residential,,3.0,3.0,11000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1352-sft-a...,...,False,True,True,True,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35641,1650.0,,Residential,,4.0,3.0,17000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1650-sft-a...,...,True,True,True,True,False,False,False,True,True,False
35642,1240.0,,Residential,,2.0,2.0,13000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1240-sft-a...,...,False,True,True,True,True,False,False,True,True,False
35643,1250.0,,Residential,,2.0,3.0,9500000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1250sft-ap...,...,True,True,True,False,True,False,False,True,True,False
35644,1464.0,,Residential,,3.0,3.0,22000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1464-sft-a...,...,True,True,True,False,True,False,False,False,True,True


**TODO**:
* define which value to give as default to the `building_type` where the value is NaN, or how to deal with those NaN values.

In [83]:
merged_df["building_nature"].unique()

array(['Residential', 'Commercial', nan, 'Apartment'], dtype=object)

Some `building_nature` are `Apartment`. They should be replaced by `Residential`. 

In [84]:
# Giving the samples which building_nature is Apartment the value of Residential
merged_df.loc[ merged_df["building_nature"]=="Apartment", ["building_nature"] ] = "Residential"

# making sure Apartment were successfully replaced by 
merged_df["building_nature"].unique()

array(['Residential', 'Commercial', nan], dtype=object)

In [85]:
# Selecting the samples without building_nature
merged_df_without_nature = merged_df.loc[ merged_df["building_nature"].isna() ]
merged_df_without_nature

Unnamed: 0,area,building_type,building_nature,image_url,num_bath_rooms,num_bed_rooms,price,property_description,property_overview,property_url,...,reception-desk-and-lobby-amenity,Cable TV Provision-amenity,lift-amenity,tiled-floor-amenity,Emergency Exit-amenity,built-in-kitchen-and-cabinet-amenity,filtered-water-amenity,dry-kitchen-amenity,electricity-supply-amenity,gas_supply-amenity
17329,1350.0,Apartment,,,3.0,3.0,1500000.0,Shares of 1350 Sqft land share in Banasree M B...,,https://www.bdstall.com/details/near-hatirjhee...,...,,,,,,,,,,
17330,1250.0,Apartment,,,3.0,3.0,2500000.0,Shares of 1250 Sqft land will be sold on the o...,,https://www.bdstall.com/details/banasree-near-...,...,,,,,,,,,,
17331,1252.0,Apartment,,,3.0,3.0,5200000.0,A 1252 Sqft flat will be sold in Arshi Nagar a...,,https://www.bdstall.com/details/near-mohammadp...,...,,,,,,,,,,
17332,1200.0,Apartment,,,3.0,3.0,5200000.0,Shares of 1200 sqft apartment in South Keran...,,https://www.bdstall.com/details/south-keraniga...,...,,,,,,,,,,
17333,1500.0,Apartment,,,3.0,3.0,6500000.0,1500 Sqft flat land share will sale in importa...,,https://www.bdstall.com/details/paltan-vijay-n...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17640,,Apartment,,,1.0,2.0,7500.0,The pleasant environment will be rented on the...,Beautiful View,https://www.toleter.com/property/bd222/,...,,,,,,,,,,
17641,16140.0,Apartment,,,3.0,5.0,5000.0,Well-to-do living and easy movement! If you ar...,"Built in Year : 2015,Size 1500 Sq. Meter,Floor...",https://www.toleter.com/property/bd101/,...,,,,,,,,,,
17642,16140.0,Apartment,,,3.0,5.0,5000.0,Well-to-do living and easy movement! If you ar...,"Built in Year : 2015,Size 1500 Sq. Meter,Floor...",https://www.toleter.com/property/bd102/,...,,,,,,,,,,
17643,,Apartment,,,3.0,3.0,16000.0,"House # 03, Rod # 4/1, Ward No # 37, Negar, Se...","Parking Spaces,Balcony,Elevator,Others Main Fe...",https://www.toleter.com/property/bd244/,...,,,,,,,,,,


In [86]:
merged_df_without_nature["building_type"].unique()

array(['Apartment'], dtype=object)

All samples without `building_nature` are of type `Apartment`. So their `building_nature` should be set to `Residential`.

In [87]:
merged_df.loc[ (merged_df["building_nature"].isna()) & 
             ( merged_df["building_type"]=="Apartment") , ["building_nature"] ] = "Residential"

In [88]:
# Checking if their are still properties without nature
merged_df.loc[ merged_df["building_nature"].isna() ].T

area
building_type
building_nature
image_url
num_bath_rooms
...
built-in-kitchen-and-cabinet-amenity
filtered-water-amenity
dry-kitchen-amenity
electricity-supply-amenity
gas_supply-amenity


In [89]:
merged_df["building_nature"].unique()

array(['Residential', 'Commercial'], dtype=object)

In [90]:
# merged_df_without_nature.loc[ merged_df_without_nature["building_nature"].isna() & 
#               merged_df_without_nature["building_type"]=="Apartment", ["building_nature"] ] = "Residential"

In [91]:
del merged_df_without_nature

In [92]:
merged_df["garage"].fillna(0, inplace=True)

In [93]:
merged_df.reset_index(inplace=True)

<span id="creating-csv-without-amenities" ></span>

## Creating csv without amenities

In [94]:
no_amenity_vars = ['id','area', 'building_type', 'building_nature', 'image_url', 'num_bath_rooms', 'num_bed_rooms', 
                 'price', 'property_description', 'property_overview', 'property_url', 'purpose', 'city', 
                 'locality', 'address','garage','year_built']

In [95]:
no_amenity_df = merged_df[ no_amenity_vars ]
no_amenity_df.head(3).T

Unnamed: 0,0,1,2
id,bproperty-0,bproperty-1,bproperty-2
area,1185.0,2464.0,1140.0
building_type,Apartment,Apartment,Apartment
building_nature,Residential,Residential,Residential
image_url,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...
num_bath_rooms,0.0,4.0,0.0
num_bed_rooms,3.0,3.0,3.0
price,6100000.0,28900000.0,7500000.0
property_description,Grab This 1185 Sq Ft Beautiful Flat Is Vacant ...,A Vibrant 2464 Sq Ft Residential Flat For Sale...,1140 Sq Ft Nicely Planned Apartment Is Availab...
property_overview,This flat consists of facilities you can think...,Ready to move in somewhere with everything nea...,A spacious 1140 Square Feet apartment in Mirp...


In [96]:
no_amenity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35646 entries, 0 to 35645
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    35646 non-null  object 
 1   area                  33374 non-null  float64
 2   building_type         35465 non-null  object 
 3   building_nature       35646 non-null  object 
 4   image_url             17312 non-null  object 
 5   num_bath_rooms        35646 non-null  float64
 6   num_bed_rooms         35646 non-null  float64
 7   price                 34578 non-null  float64
 8   property_description  18259 non-null  object 
 9   property_overview     17553 non-null  object 
 10  property_url          35621 non-null  object 
 11  purpose               35632 non-null  object 
 12  city                  35110 non-null  object 
 13  locality              35046 non-null  object 
 14  address               30507 non-null  object 
 15  garage             

In [97]:
no_amenity_df["building_type"].unique()

array(['Apartment', 'Shop', 'Floor', 'Office', 'Building', 'Plot',
       'Duplex', 'Warehouse', 'Factory', 'Apartment/Flats',
       'Office space', 'Duplex Home', 'Showroom / Shop / Restaurant',
       'Independent House', 'Land', 'Garage', 'House', 'Commercial Space',
       'Commerical - Other', 'Apartment, Commercial',
       'Commercial property', nan], dtype=object)

In [98]:
no_amenity_df["building_nature"].unique()

array(['Residential', 'Commercial'], dtype=object)

In [99]:
no_amenity_df["building_nature"].isna().sum()

0

In [100]:
no_amenity_df.shape

(35646, 17)

In [101]:
# Create folder in which to save cleaned dataset
if not os.path.exists(merged_data_folder):
    os.makedirs(merged_data_folder)
    print(f"Create folder '{merged_data_folder}'")
else:
    print(f"Folder '{merged_data_folder}' already exists")

Folder '../../../data/Merged_Data' already exists


In [102]:
# Save merged datasets to csv
no_amenity_df.to_csv(f"{merged_data_folder}/no_amenity.csv", index=False)

In [103]:
# Load saved csv (to make sure it was successfully save)
saved_no_amenity_df = pd.read_csv(f"{merged_data_folder}/no_amenity.csv")
saved_no_amenity_df.head(3).T

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,0,1,2
id,bproperty-0,bproperty-1,bproperty-2
area,1185.0,2464.0,1140.0
building_type,Apartment,Apartment,Apartment
building_nature,Residential,Residential,Residential
image_url,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...
num_bath_rooms,0.0,4.0,0.0
num_bed_rooms,3.0,3.0,3.0
price,6100000.0,28900000.0,7500000.0
property_description,Grab This 1185 Sq Ft Beautiful Flat Is Vacant ...,A Vibrant 2464 Sq Ft Residential Flat For Sale...,1140 Sq Ft Nicely Planned Apartment Is Availab...
property_overview,This flat consists of facilities you can think...,Ready to move in somewhere with everything nea...,A spacious 1140 Square Feet apartment in Mirp...


In [104]:
# v=5
# type(v), type(v)==int

In [105]:
# # Initializing list
# test_list = [1, 6, 3, 5, 3, 4]
 
# # Checking if 4 exists in list
# # using in
# if (4 in test_list):
#     print("Element Exists")

<span id="creating-csv-with-amenities" ></span>

## Creating csv with amenities

Since there are so many amenities, I am going to categorize them, end have the count of how many such categories each sample has.

In [106]:
print(no_amenity_vars)

['id', 'area', 'building_type', 'building_nature', 'image_url', 'num_bath_rooms', 'num_bed_rooms', 'price', 'property_description', 'property_overview', 'property_url', 'purpose', 'city', 'locality', 'address', 'garage', 'year_built']


In [107]:
amenity_vars=[] # will contain the columns that hold amenity information
columns = merged_df.columns.to_list()

for col in columns:
    if col not in no_amenity_vars:
        amenity_vars.append(col)

print(amenity_vars)

['index', 'balcony-or-terrace-amenity', 'flooring-amenity', 'parking-spaces-amenity', 'view-amenity', 'lobby-in-building-amenity', 'electricity-backup-amenity', 'elevators-in-building-amenity', 'floor-level-amenity', 'cctv-security-amenity', 'maintenance-staff-amenity', 'cleaning-services-amenity', 'service-elevators-amenity', 'intercom-amenity', 'atm-facility-amenity', 'freehold-amenity', 'broadband-internet-amenity', 'double-glazed-windows-amenity', 'storage-areas-amenity', '24-hours-concierge-amenity', 'waste-disposal-amenity', 'lawn-or-garden-amenity', 'prayer-room-amenity', 'facilities-for-disabled-amenity', 'conference-room-amenity', 'furnished-amenity', 'swimming-pool-amenity', 'steam-room-amenity', 'sauna-amenity', 'jacuzzi-amenity', 'barbeque-area-amenity', 'central-heating-amenity', 'business-center-amenity', 'first-aid-medical-center-amenity', 'day-care-center-amenity', 'shared-kitchen-amenity', 'cafeteria-or-canteen-amenity', 'laundry-facility-amenity', 'security-staff-amen

In [108]:
merged_df[ amenity_vars[:50] ].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35646 entries, 0 to 35645
Data columns (total 50 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   index                             35646 non-null  int64 
 1   balcony-or-terrace-amenity        14576 non-null  object
 2   flooring-amenity                  16398 non-null  object
 3   parking-spaces-amenity            17774 non-null  object
 4   view-amenity                      16413 non-null  object
 5   lobby-in-building-amenity         9257 non-null   object
 6   electricity-backup-amenity        10127 non-null  object
 7   elevators-in-building-amenity     13443 non-null  object
 8   floor-level-amenity               15396 non-null  object
 9   cctv-security-amenity             10967 non-null  object
 10  maintenance-staff-amenity         9777 non-null   object
 11  cleaning-services-amenity         12445 non-null  object
 12  service-elevators-

I want to understand the type of values we have in the amenity columns.

In [109]:
merged_df[ amenity_vars[1] ].unique()

array(['yes', nan, 'no', False, True], dtype=object)

In [110]:
merged_df[ amenity_vars[2] ].unique()

array(['yes', nan], dtype=object)

In [111]:
amenity_vars[9]

'cctv-security-amenity'

In [112]:
merged_df[ amenity_vars[9] ].unique()

array(['yes', nan, 'Yes', 'No'], dtype=object)

I have the following groups of values:
* `numbers` : 1, 2, 3
* `strings` : " 1", "Yes", "No"
* `np.NaN`

I assume there may also be `True/False` values

In [113]:
# type(True)

In [114]:

# Creating amenity categories columns, and giving them default value
merged_df["relaxation_amenity_count"] = 0
merged_df["security_amenity_count"] = 0
merged_df["maintenance_or_cleaning_amenity_count"] = 0
merged_df["social_amenity_count"] = 0
merged_df["expendable_amenity_count"] = 0
merged_df["expendable_amenity_count"] = 0 
merged_df["service_staff_amenity_count"] = 0
merged_df["service_staff_amenity_count"] = 0
merged_df["unclassify_amenity_count"] = 0

In [115]:
"""
    Loop through each sample, while :
       - counting the number of each category amenity it has
       - translating property_description and property_overview to English (some text are in Benguali)
"""
# init the Google API translator
translator = Translator()

unclassified_vars=[] # will contain columns that are not in amenity_vars nor in no_amenity_vars (for debug purpose)


for index, row in merged_df.iterrows(): # loop through each sample
    
    # The code may take time, log in the console to keep track of things
    if index==0 or index%1000==0:
        print(f"Currently processing sample {index}...")
    
    # Count the number of amenity per category for the current sample
    for av in amenity_vars:
#         print(f"av : {av}")
        av_content = row[av] # merged_df.loc[index,av] #
        av_weight=None
        
        # if the content of the amenity variable is ['yes', a number greater than 0, or True] , 
        #      then av_weight take 1 as value
        if pd.isna(av_content): # If the content of that variable for the current sample is np.NaN, do nothing
            av_weight=0
        elif type(av_content)==str: # if it is a string
            if av_content.strip().lower()=="no" or av_content.strip().lower()=="0" \
                or av_content.strip().lower()=="0.0" or av_content.strip().lower()=="na":
                av_weight=0
            else:
                av_weight=1
        elif type(av_content)==int or type(av_content)==float:
            if av_content>0:
                av_weight=1
            else:
                av_weight=0
        elif type(av_content)==bool:
            if av_content==False:
                av_weight=0
            else:
                av_weight=1
        else:
            raise Exception(f"Amenity value '{av_content}' not taken into account for sample {index} and column {av}")
    
        # Updating the count of the corresponding amenity category
        if is_relaxation_amenity(av):
            merged_df.loc[index,"relaxation_amenity_count"] += av_weight
        elif is_security_amenity(av):
            merged_df.loc[index,"security_amenity_count"] += av_weight
        elif is_maintenance_or_cleaning_amenity(av):
            merged_df.loc[index,"maintenance_or_cleaning_amenity_count"] += av_weight
        elif is_social_amenity(av):
            merged_df.loc[index,"social_amenity_count"] += av_weight
        elif is_expendable_amenity(av):
            merged_df.loc[index,"expendable_amenity_count"] += av_weight
        elif is_service_staff_amenity(av):
            merged_df.loc[index,"service_staff_amenity_count"] += av_weight
        elif is_unclassify_amenity(av):
            merged_df.loc[index,"unclassify_amenity_count"] += av_weight
        else:
            unclassified_dic={
                "index":index,
                "column":av
            }
            unclassified_vars.append(unclassified_dic)
            
            
            
    # Translating text from Benguali to English
    
#     # retrieve the property_overview and property_description
#     overview = row["property_overview"]
#     description = row["property_description"] 
    
#     # translate text
#     translated_ov_obj = translator.translate(overview.strip(), src="bn", dest="en")
#     translated_desc_obj = translator.translate(description.strip(), src="bn", dest="en")
    
#     translated_ov = translated_ov_obj.text
#     translated_desc = translated_desc_obj.text
    
#     # updating the translated columns of the sample in the dataframe
#     merged_df.loc[index, "property_overview"] = translated_ov
#     merged_df.loc[index, "property_description"] = translated_desc
        
print("Processing has come to an end")
print("----------"*3)
print("unclassified_vars : ")
print(unclassified_vars)

Currently processing sample 0...
Currently processing sample 1000...
Currently processing sample 2000...
Currently processing sample 3000...
Currently processing sample 4000...
Currently processing sample 5000...
Currently processing sample 6000...
Currently processing sample 7000...
Currently processing sample 8000...
Currently processing sample 9000...
Currently processing sample 10000...
Currently processing sample 11000...
Currently processing sample 12000...
Currently processing sample 13000...
Currently processing sample 14000...
Currently processing sample 15000...
Currently processing sample 16000...
Currently processing sample 17000...
Currently processing sample 18000...
Currently processing sample 19000...
Currently processing sample 20000...
Currently processing sample 21000...
Currently processing sample 22000...
Currently processing sample 23000...
Currently processing sample 24000...
Currently processing sample 25000...
Currently processing sample 26000...
Currently proc

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [116]:
# merged_df.loc[0,"balcony-or-terrace-amenity"]

In [117]:
merged_df.shape

(35646, 132)

In [118]:
merged_df.iloc[:, -7:].sample(10)

Unnamed: 0,relaxation_amenity_count,security_amenity_count,maintenance_or_cleaning_amenity_count,social_amenity_count,expendable_amenity_count,service_staff_amenity_count,unclassify_amenity_count
10061,0,2,2,0,4,0,3
29122,0,0,0,0,0,0,0
11070,0,1,1,0,4,0,3
7870,0,1,3,0,2,0,3
5043,0,1,0,0,3,0,3
5403,0,2,2,1,5,0,3
9929,0,1,1,0,3,0,3
35602,0,0,0,0,0,0,0
3221,0,2,2,0,2,0,4
16016,0,2,1,0,2,0,3


In [119]:
merged_df.shape

(35646, 132)

In [120]:
# Dropping amenity variables
merged_df.drop(amenity_vars,axis=1,inplace=True)
merged_df.shape

(35646, 24)

In [121]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35646 entries, 0 to 35645
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   area                                   33374 non-null  float64
 1   building_type                          35465 non-null  object 
 2   building_nature                        35646 non-null  object 
 3   image_url                              17312 non-null  object 
 4   num_bath_rooms                         35646 non-null  float64
 5   num_bed_rooms                          35646 non-null  float64
 6   price                                  34578 non-null  float64
 7   property_description                   18259 non-null  object 
 8   property_overview                      17553 non-null  object 
 9   property_url                           35621 non-null  object 
 10  purpose                                35632 non-null  object 
 11  ci

In [122]:
merged_df.head(2).T

Unnamed: 0,0,1
area,1185.0,2464.0
building_type,Apartment,Apartment
building_nature,Residential,Residential
image_url,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...
num_bath_rooms,0.0,4.0
num_bed_rooms,3.0,3.0
price,6100000.0,28900000.0
property_description,Grab This 1185 Sq Ft Beautiful Flat Is Vacant ...,A Vibrant 2464 Sq Ft Residential Flat For Sale...
property_overview,This flat consists of facilities you can think...,Ready to move in somewhere with everything nea...
property_url,https://www.bproperty.com/en/property/details-...,https://www.bproperty.com/en/property/details-...


In [123]:
# Save merged datasets to csv
merged_df.to_csv(f"{merged_data_folder}/merged_datasets.csv", index=False)

In [124]:
# Load saved csv (to make sure it was successfully save)
saved_merged_df = pd.read_csv(f"{merged_data_folder}/merged_datasets.csv")
saved_merged_df.head(3).T

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,0,1,2
area,1185.0,2464.0,1140.0
building_type,Apartment,Apartment,Apartment
building_nature,Residential,Residential,Residential
image_url,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...,https://images-cdn.bproperty.com/thumbnails/15...
num_bath_rooms,0.0,4.0,0.0
num_bed_rooms,3.0,3.0,3.0
price,6100000.0,28900000.0,7500000.0
property_description,Grab This 1185 Sq Ft Beautiful Flat Is Vacant ...,A Vibrant 2464 Sq Ft Residential Flat For Sale...,1140 Sq Ft Nicely Planned Apartment Is Availab...
property_overview,This flat consists of facilities you can think...,Ready to move in somewhere with everything nea...,A spacious 1140 Square Feet apartment in Mirp...
property_url,https://www.bproperty.com/en/property/details-...,https://www.bproperty.com/en/property/details-...,https://www.bproperty.com/en/property/details-...
