In [134]:
# Scratch cell: prints a fixed greeting (presumably a kernel sanity check; safe to remove)
print("Hello World")

Hello World


In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from slugify import slugify
import os
import sys

sys.path.append("../functions")
from address_extractor import get_detailed_address

%matplotlib inline

In [136]:
# Root folders of the raw and the cleaned datasets, relative to this notebook
raw_data_folder = "../../../data/Raw_Data"
cleaned_data_folder = "../../../data/CLeaned_Data"

# Sub-folders dedicated to the btibrokeragebd source inside each root folder
btibrokeragebd_folder = raw_data_folder + "/btibrokeragebd"
cleaned_btibrokeragebd_folder = cleaned_data_folder + "/btibrokeragebd"

## Loading the raw CSV data

In [137]:
# Read the raw btibrokeragebd csv into a dataframe and preview its first rows
raw_csv_path = f"{btibrokeragebd_folder}/btibrokeragebd.csv"
btibrokeragebd_df = pd.read_csv(raw_csv_path)
btibrokeragebd_df.head()

Unnamed: 0,amenities,area,building_type,commercial_type,location,num_bath_rooms,num_bed_rooms,page,price,property_description,property_overview,property_url,purpose
0,"Air-Condition Provision,Cable TV Provision,Ele...",1517 sft,Residential,,"Uttara, Dhaka",3,3.0,,12000000,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1517-sft-a...,For Sale
1,"Air-Condition Provision,Cable TV Provision,Ele...",935 sft,Residential,,"Uttarkhan, Dhaka",2,2.0,,5700000,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/935-sft-ap...,For Sale
2,"Air-Condition Provision,Cable TV Provision,Ele...",1636 sft,Residential,,"Uttara, Dhaka",4,3.0,,14000000,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1636-sft-a...,For Sale
3,"Air-Condition Provision,Cable TV Provision,Ele...",2520 sft,Residential,,"Banani, Dhaka",3,3.0,,43000000,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/2520-sft-a...,For Sale
4,"Air-Condition Provision,Cable TV Provision,Ele...",1352 sft,Residential,,"Farmgate, Dhaka",3,3.0,,11000000,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1352-sft-a...,For Sale


In [138]:
# Dimensions of the raw dataframe as (rows, columns)
btibrokeragebd_df.shape

(181, 13)

In [139]:
# Dtypes and non-null counts of the raw columns, to see which ones need cleaning
btibrokeragebd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   amenities             181 non-null    object 
 1   area                  181 non-null    object 
 2   building_type         181 non-null    object 
 3   commercial_type       0 non-null      float64
 4   location              181 non-null    object 
 5   num_bath_rooms        181 non-null    int64  
 6   num_bed_rooms         154 non-null    float64
 7   page                  0 non-null      float64
 8   price                 181 non-null    object 
 9   property_description  180 non-null    object 
 10  property_overview     0 non-null      float64
 11  property_url          181 non-null    object 
 12  purpose               181 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 18.5+ KB


## Cleaning Data

### area

In [140]:
# Conversion factor to square feet for every unit we know how to handle
# (1 Katha = 720 sqft => Thanks @Kausthab Dutta Phukan)
sqft_per_unit = {"sft": 1, "katha": 720}

# Cleaned area of every sample, in row order. Building the whole column first means a
# problematic sample only affects itself: it becomes NaN instead of reusing values parsed
# for the previous sample, stopping the loop, or leaving a raw string in the column.
cleaned_areas = []

for index, sample_area in btibrokeragebd_df["area"].items():  # loop through each sample

    # The code may take time, log in the console to keep track of things
    if index == 0 or index % 1000 == 0:
        print(f"Currently processing sample {index}...")

    # Missing or already numeric values (e.g. when this cell is re-run) are kept as they are
    if not isinstance(sample_area, str):
        cleaned_areas.append(sample_area)
        continue

    # There was an error in the data: some area entries hold floor info instead of an area
    if 'Floor' in sample_area:
        cleaned_areas.append(np.nan)
        continue

    # Making sure there is exactly the value and the unit in sample_area
    splitted_sample_area = sample_area.split()
    if len(splitted_sample_area) != 2:
        print(f"Sample of index {index} has a suspicious value as area: {sample_area}")
        cleaned_areas.append(np.nan)
        continue

    raw_value = splitted_sample_area[0]           # eg: "1,345"
    area_unit = splitted_sample_area[1].lower()   # eg: "sft"

    # Making sure all units are taken into account
    if area_unit not in sqft_per_unit:
        print(f"Sample of index {index} has a unit not taken into account for its area: {sample_area}")
        cleaned_areas.append(np.nan)
        continue

    # The value may use "," as thousands separator; anything that is still not a number is suspicious
    try:
        area = float(raw_value.replace(",", ""))
    except ValueError:
        print(f"Sample of index {index} has a suspicious value as area: {sample_area}")
        cleaned_areas.append(np.nan)
        continue

    # Converting the area to sqft
    cleaned_areas.append(area * sqft_per_unit[area_unit])

print("Processing has come to an end")

# Updating the area column of the dataframe, stored as decimal
btibrokeragebd_df["area"] = pd.Series(cleaned_areas, index=btibrokeragebd_df.index, dtype=float)

Currently processing sample 0...
Processing has come to an end


In [141]:
# Check the dtype of the area column after the conversion
btibrokeragebd_df.area.dtype

dtype('float64')

### building_type

In [142]:
# Old column name -> new column name
column_renames = {"building_type": "building_nature"}
btibrokeragebd_df.rename(columns=column_renames, inplace=True)

In [143]:
# Transposed view of the first two samples, to check the renamed column
btibrokeragebd_df.head(2).T

Unnamed: 0,0,1
amenities,"Air-Condition Provision,Cable TV Provision,Ele...","Air-Condition Provision,Cable TV Provision,Ele..."
area,1517.0,935.0
building_nature,Residential,Residential
commercial_type,,
location,"Uttara, Dhaka","Uttarkhan, Dhaka"
num_bath_rooms,3,2
num_bed_rooms,3.0,2.0
page,,
price,12000000,5700000
property_description,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...


### num_bath_rooms & num_bed_rooms

In [144]:
# Samples without a room count are treated as having 0 rooms, and both counts are stored
# as decimals. The result is assigned back to the dataframe: calling
# `fillna(..., inplace=True)` on a column taken from the dataframe may only update a
# temporary copy (it no longer updates the dataframe under pandas Copy-on-Write).
room_columns = ["num_bath_rooms", "num_bed_rooms"]
btibrokeragebd_df[room_columns] = btibrokeragebd_df[room_columns].fillna(0).astype(float)

In [145]:
# Re-inspect dtypes and non-null counts after cleaning the room counts
btibrokeragebd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   amenities             181 non-null    object 
 1   area                  181 non-null    float64
 2   building_nature       181 non-null    object 
 3   commercial_type       0 non-null      float64
 4   location              181 non-null    object 
 5   num_bath_rooms        181 non-null    float64
 6   num_bed_rooms         181 non-null    float64
 7   page                  0 non-null      float64
 8   price                 181 non-null    object 
 9   property_description  180 non-null    object 
 10  property_overview     0 non-null      float64
 11  property_url          181 non-null    object 
 12  purpose               181 non-null    object 
dtypes: float64(6), object(7)
memory usage: 18.5+ KB


### price

In [146]:
def _price_to_number(sample_price):
    """Return the numeric part of a raw price, ready to be cast to float.

    Thousands separators (",") are removed first, so that "25,000 PerMonth" gives
    "25000" and not "25". For a price containing 'PerMonth' only the number is kept
    (NaN if there is none); other strings are returned without their commas.
    Values that are not strings (missing, or already numeric when the cell is
    re-run) are returned unchanged.
    """
    if not isinstance(sample_price, str):
        return sample_price

    price_without_commas = sample_price.replace(",", "")
    if 'PerMonth' in price_without_commas:
        amount = re.search(r'\d+(?:\.\d+)?', price_without_commas)
        return amount.group() if amount else np.nan
    return price_without_commas


# Clean every price, then store the column as decimal
btibrokeragebd_df['price'] = btibrokeragebd_df['price'].map(_price_to_number).astype(float)
print("Processing has come to an end")

Currently processing sample 0...
Processing has come to an end


In [147]:
# Re-inspect dtypes and non-null counts to confirm the price conversion
btibrokeragebd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   amenities             181 non-null    object 
 1   area                  181 non-null    float64
 2   building_nature       181 non-null    object 
 3   commercial_type       0 non-null      float64
 4   location              181 non-null    object 
 5   num_bath_rooms        181 non-null    float64
 6   num_bed_rooms         181 non-null    float64
 7   page                  0 non-null      float64
 8   price                 181 non-null    float64
 9   property_description  180 non-null    object 
 10  property_overview     0 non-null      float64
 11  property_url          181 non-null    object 
 12  purpose               181 non-null    object 
dtypes: float64(7), object(6)
memory usage: 18.5+ KB


In [148]:
def _purpose_keyword(purpose):
    """Return the second word of a purpose such as "For Sale" (-> "Sale").

    A purpose made of a single word is returned as is (stripped), so the cell can be
    re-run on already cleaned values; values that are not strings are returned unchanged.
    """
    if not isinstance(purpose, str):
        return purpose

    words = purpose.split()
    return words[1] if len(words) > 1 else purpose.strip()


btibrokeragebd_df["purpose"] = btibrokeragebd_df["purpose"].map(_purpose_keyword)

In [149]:
# Preview the first samples after cleaning the purpose column
btibrokeragebd_df.head(5)

Unnamed: 0,amenities,area,building_nature,commercial_type,location,num_bath_rooms,num_bed_rooms,page,price,property_description,property_overview,property_url,purpose
0,"Air-Condition Provision,Cable TV Provision,Ele...",1517.0,Residential,,"Uttara, Dhaka",3.0,3.0,,12000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1517-sft-a...,Sale
1,"Air-Condition Provision,Cable TV Provision,Ele...",935.0,Residential,,"Uttarkhan, Dhaka",2.0,2.0,,5700000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/935-sft-ap...,Sale
2,"Air-Condition Provision,Cable TV Provision,Ele...",1636.0,Residential,,"Uttara, Dhaka",4.0,3.0,,14000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1636-sft-a...,Sale
3,"Air-Condition Provision,Cable TV Provision,Ele...",2520.0,Residential,,"Banani, Dhaka",3.0,3.0,,43000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/2520-sft-a...,Sale
4,"Air-Condition Provision,Cable TV Provision,Ele...",1352.0,Residential,,"Farmgate, Dhaka",3.0,3.0,,11000000.0,Are you searching for a beautiful home to buy ...,,https://btibrokeragebd.com/property/1352-sft-a...,Sale


### location

In [150]:
# Trying get_detailed_address on the location of the first sample, to see the keys it returns
get_detailed_address(btibrokeragebd_df["location"][0])

{'city': 'Dhaka', 'area': 'Uttara', 'address': ''}

In [151]:
# Create the new columns, empty for now.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0, and the
# columns are given the object dtype because they are going to receive text.
for new_column in ("city", "locality", "address"):
    btibrokeragebd_df[new_column] = pd.Series(np.nan, index=btibrokeragebd_df.index, dtype=object)

In [152]:
# Split each `location` into city, locality and address, and store the parts in the
# relevant columns.

# One dictionary per sample, as returned by get_detailed_address
location_details = btibrokeragebd_df["location"].map(get_detailed_address)

# (column of the dataframe, key of the dictionary): get_detailed_address calls the
# locality "area". Each column is assigned as a whole, so its dtype follows the values it
# receives (no text written cell by cell into a float column), and a missing key gives
# NaN (`np.nan`: the `np.NaN` alias no longer exists in NumPy 2.0).
for column, key in (("city", "city"), ("locality", "area"), ("address", "address")):
    btibrokeragebd_df[column] = location_details.map(
        lambda details, key=key: details.get(key, np.nan)
    )

print("Processing has come to an end")

Currently processing sample 0...
Processing has come to an end


In [153]:
# Making sure the location was split correctly into the new columns
# NOTE(review): in the recorded output, sample 177 ("Kallayanpur, Dhaka") has its place name
# in `address` and no `locality` - confirm whether get_detailed_address should know this area
btibrokeragebd_df[ ["location","city","locality","address"] ]

Unnamed: 0,location,city,locality,address
0,"Uttara, Dhaka",Dhaka,Uttara,
1,"Uttarkhan, Dhaka",Dhaka,Uttarkhan,
2,"Uttara, Dhaka",Dhaka,Uttara,
3,"Banani, Dhaka",Dhaka,Banani,
4,"Farmgate, Dhaka",Dhaka,Farmgate,
...,...,...,...,...
176,"Dhanmondi, Dhaka",Dhaka,Dhanmondi,
177,"Kallayanpur, Dhaka",Dhaka,,Kallayanpur
178,"Dhanmondi, Dhaka",Dhaka,Dhanmondi,
179,"Lalmatia, Dhaka",Dhaka,Lalmatia,


### amenities

In [154]:
def _split_amenities(raw_amenities):
    """Return the set of amenity names found in a comma separated amenities value.

    A missing value gives an empty set. Whitespace around the names and empty
    entries are dropped.
    """
    if pd.isna(raw_amenities):
        return set()

    names = (name.strip() for name in str(raw_amenities).split(","))
    return {name for name in names if name}


# The set of amenities of each sample
amenity_sets = btibrokeragebd_df["amenities"].map(_split_amenities)

# All the unique amenities in the dataframe, sorted so that the new columns come in the
# same order on every run (the iteration order of a set of strings changes between runs)
amenities = sorted(set().union(*amenity_sets))
all_amenities = set(amenities)

# One True/False column per amenity. The amenity is looked up in the set of names of the
# sample and not in the raw text, so an amenity whose name is contained in the name of
# another one is not wrongly marked as present; samples without amenities get False.
amenity_flags = pd.DataFrame(
    {
        f"{amenity}-amenity": amenity_sets.map(lambda names, amenity=amenity: amenity in names)
        for amenity in amenities
    },
    index=btibrokeragebd_df.index,
)

# Add the new columns at the end and drop the original amenities column since we no longer need it
btibrokeragebd_df = pd.concat(
    [btibrokeragebd_df.drop(columns="amenities"), amenity_flags], axis=1
)


Currently processing sample 0...


### Final Touch

In [155]:
# Dropping the unnecessary columns
unused_columns = ['commercial_type', 'page', 'property_overview', 'location']
btibrokeragebd_df.drop(columns=unused_columns, inplace=True)

# Adding empty columns at fixed positions; they are inserted one after the other,
# so each position refers to the dataframe as it is after the previous insertion
empty_columns = [(1, 'building_type'), (3, 'image_url'), (8, 'property_overview')]
for position, column_name in empty_columns:
    btibrokeragebd_df.insert(position, column_name, value=np.nan)

In [156]:
# Transposed view of the first three samples, to check the final column layout
btibrokeragebd_df.head(3).T

Unnamed: 0,0,1,2
area,1517.0,935.0,1636.0
building_type,,,
building_nature,Residential,Residential,Residential
image_url,,,
num_bath_rooms,3.0,2.0,4.0
num_bed_rooms,3.0,2.0,3.0
price,12000000.0,5700000.0,14000000.0
property_description,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...
property_overview,,,
property_url,https://btibrokeragebd.com/property/1517-sft-a...,https://btibrokeragebd.com/property/935-sft-ap...,https://btibrokeragebd.com/property/1636-sft-a...


### Save clean dataset

In [157]:
# Make sure the folder receiving the cleaned dataset exists, and report what was done
if os.path.exists(cleaned_btibrokeragebd_folder):
    print(f"Folder '{cleaned_btibrokeragebd_folder}' already exists")
else:
    os.makedirs(cleaned_btibrokeragebd_folder)
    print(f"Create folder '{cleaned_btibrokeragebd_folder}'")

Folder '../../../data/CLeaned_Data/btibrokeragebd' already exists


In [158]:
# Write the cleaned dataframe to csv, without the index
cleaned_csv_path = f"{cleaned_btibrokeragebd_folder}/cleaned_btibrokeragebd.csv"
btibrokeragebd_df.to_csv(cleaned_csv_path, index=False)

In [159]:
# Read the saved csv back (to make sure it was successfully saved) and preview it
saved_csv_path = f"{cleaned_btibrokeragebd_folder}/cleaned_btibrokeragebd.csv"
clean_btibrokeragebd_df = pd.read_csv(saved_csv_path)
clean_btibrokeragebd_df.head(3).T

Unnamed: 0,0,1,2
area,1517.0,935.0,1636.0
building_type,,,
building_nature,Residential,Residential,Residential
image_url,,,
num_bath_rooms,3.0,2.0,4.0
num_bed_rooms,3.0,2.0,3.0
price,12000000.0,5700000.0,14000000.0
property_description,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...,Are you searching for a beautiful home to buy ...
property_overview,,,
property_url,https://btibrokeragebd.com/property/1517-sft-a...,https://btibrokeragebd.com/property/935-sft-ap...,https://btibrokeragebd.com/property/1636-sft-a...


In [160]:
# Final overview of the columns, dtypes and non-null counts of the cleaned dataframe
btibrokeragebd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 31 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   area                              181 non-null    float64
 1   building_type                     0 non-null      float64
 2   building_nature                   181 non-null    object 
 3   image_url                         0 non-null      float64
 4   num_bath_rooms                    181 non-null    float64
 5   num_bed_rooms                     181 non-null    float64
 6   price                             181 non-null    float64
 7   property_description              180 non-null    object 
 8   property_overview                 0 non-null      float64
 9   property_url                      181 non-null    object 
 10  purpose                           181 non-null    object 
 11  city                              181 non-null    object 
 12  locality