In [2]:
# Uncomment to change the width of the page

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
# Import
import pandas as pd
import numpy as np

In [4]:
"""
 The functions below are developed by @Shariar Hossain Omee

 get_detailed_address() :
    - Take a full comma separated address as input
    - Split the address into City, Area, Address
    - Return a dictionary containing City, Area (a.k.a. "Locality"), Address as keys

This function splits input address according to the commas, then it checks each separated string with the values in
the arrays which we pre-defined in the function. It will return one area name under the 'Area' key and one city name
under the 'City' key, if it could match them with the pre-defined areas and cities in the function , the rest of the
string or address will be under the 'Address' key.

For example,
    Input --> "Block M, South Banasree Project, Banasree, Dhaka"
    Output --> {"city": "Dhaka", "area": "Banasree", "address": "Block M, South Banasree Project"}

Note: The input has to be comma separated in order to get meaningful output like the example above. Otherwise, it
won't recognize the address properly.

please, see the commented notebook of bproperty (https://github.com/OmdenaAI/dhaka-bangladesh-real-estate-
recommendation/blob/main/src/tasks/task-2-data-preprocessing/bproperty%20--%20cleaning/bproperty%20--%20cleaning.ipynb)
for use.

Note: This function doesn't have all the areas and cities of Bangladesh yet. I am adding them periodically according
to the upload cleaned dataset.
"""


def get_detailed_address(address):
    try:
        # converting the initial letter of each word to a capital letter of input.
        address = address.title()

        # defining output dictionary
        address_dict = {"city": "", "area": "", "address": ""}

        # splitting the input according to commas
        splitted_address = address.split(',')

        # getting each splitted and checking them with pre-defined address and area names.
        for i in reversed(splitted_address):

            # calling get_city_name() and passing name from splitted address
            if get_city_name(i.strip().replace('.', '')):
                # assigning matched city name under the "city" key.
                address_dict["city"] = i.strip().replace('.', '')
                # removing the matched name from the splitted address list.
                splitted_address.remove(i)

            # calling get_area_name() and passing name from splitted address
            elif get_area_name(i.strip().replace('.', '')):
                # assigning matched area name under the "area" key.
                address_dict["area"] = i.strip().replace('.', '')
                # removing the matched name from the splitted address list.
                splitted_address.remove(i)

        # joining the rest of the input and assigning it under the "address" key.
        address_dict["address"] = ','.join(splitted_address)

        # returning the output dictionary
        return address_dict

    except:

        # if any exception occurs, it assigns the whole input under the "address" key and return the dictionary
        return {"city": "", "area": "", "address": address}


def get_city_name(name):

    # a list containing different cities of Bangladesh
    cities = ['Dhaka', 'Chattogram', 'Narayanganj City', 'Gazipur', 'Sylhet', 'Barishal', 'Bhairab', 'Bogura',
              'Brahmanbaria', 'Chandpur', 'Chittagong', 'Chowmuhani', 'Chuadanga', 'Coxs Bazar',
              'Cumilla', 'Cumilla Sadar Dakshin', 'Dinajpur', 'Faridpur', 'Feni', 'Gazipur', 'Jamalpur',
              'Jashore', 'Jhenaidah', 'Khulna', 'Kishoreganj', 'Kushtia', 'Maijdee', 'Mymensingh',
              'Naogaon', 'Narayanganj', 'Narsingdi', 'Nawabganj', 'Pabna', 'Rajshahi', 'Rangpur', 'Saidpur', 'Satkhira',
              'Savar', 'Siddhirganj', 'Sirajganj', 'Sreepur', 'Tangail', 'Tarabo', 'Tongi']

    try:

        # if it finds match with the input, it returns true.
        cities.index(name)
        return True

    except:

        # if it doesn't find any match with the input, it returns false.
        return False


def get_area_name(name):

    # a list containing different areas of Bangladesh
    areas = ['10 No. North Kattali Ward', '11 No. South Kattali Ward', '15 No. Bagmoniram Ward',
             '16 No. Chawk Bazaar Ward', '22 No. Enayet Bazaar Ward', '29 No. West Madarbari Ward',
             '30 No. East Madarbari Ward', '31 No. Alkoron Ward', '32 No. Andarkilla Ward',
             '33 No. Firingee Bazaar Ward', '36 Goshail Danga Ward', '4 No Chandgaon Ward',
             '7 No. West Sholoshohor Ward', '9 No. North Pahartali Ward', 'Adabor', 'Aftab Nagar', 'Aftabnagar',
             'Agargaon', 'Airport', 'Akkelpur', 'Ambarkhana', 'Araihazar', 'Badda', 'Bagerhat Sadar', 'Bagha',
             'Bakalia', 'Banani', 'Banani Dohs', 'Banashree', 'Banasree', 'Bandar', 'Bandarban Sadar',
             'Banglamotor', 'Banglamotors', 'Bangshal', 'Banshkhali', 'Barguna Sadar', 'Baridhara',
             'Baridhara Dohs', 'Barishal City', 'Basabo', 'Bashabo', 'Bashundhara', 'Bashundhara R-A',
             'Bashundhara R/A', 'Bashundhara RA', 'Bashundhara Riverview', 'Bayazid', 'Belabo', 'Bhairab',
             'Bhaluka', 'Bhandaria', 'Bhashantek ', 'Bhola Sadar', 'Birampur', 'Boalkhali', 'Boalmari',
             'Bogura Sadar', 'Bosila', 'Botiaghata', 'Brahmanbaria Sadar', 'Cantonment', 'Chack Bazar',
             'Chandanpur', 'Chandpur Sadar', 'Chandra', 'Chapainawabganj Sadar', 'Chattogram City',
             'Chauddagram', 'Chawkbazar', 'Chhagalnaiya', 'Chunarughat', "Cox's Bazar Sadar", 'Cumilla City',
             'DOHS Banani', 'DOHS Baridhara', 'DOHS Mirpur', 'DOHS Mohakhali', 'Dakshin Khan', 'Dakshinsurma',
             'Daskhinkhan', 'Debidwar', 'Demra', 'Dhamrai', 'Dhanmondi', 'Digholia', 'Dinajpur Sadar', 'Dohar ',
             'Double Mooring', 'Dumni', 'East Nasirabad', 'East Rampura', 'Eskaton', 'Fakirhat',
             'Faridpur Sadar', 'Farmgate', 'Fatulla', 'Fenchuganj', 'Feni Sadar', 'Firojshah Colony',
             'Fulbaria', 'Gaibandha Sadar', 'Gajaria', 'Gandaria ', 'Gazipur Sadar', 'Gazipur Sadar Upazila',
             'Ghatail', 'Gopalganj Sadar', 'Gulistan', 'Gulshan', 'Gulshan 1', 'Gulshan 2', 'Habiganj Sadar',
             'Halishahar', 'Hathazari', 'Hatirpool', 'Hazaribag', 'Hazaribag ', 'Ibrahimpur', 'Jaintiapur',
             'Jalalabad Housing Society', 'Jamal Khan', 'Jamalpur Sadar', 'Jashore Sadar', 'Jatra Bari',
             'Jatrabari', 'Jhalakathi Sadar', 'Jhenaidah Sadar', 'Joar Sahara', 'Joypurhat Sadar', 'Kachukhet',
             'Kadamtali', 'Kafrul', 'Kakrail', 'Kalabagan', 'Kalachandpur', 'Kalapara', 'Kaliakair', 'Kaliganj',
             'Kalkini', 'Kallaynpur', 'Kamarkhand', 'Kamrangir Char', 'Kamrangirchar', 'Karnafuli',
             'Karwan Bazar', 'Kathalbagan', 'Kazir Dewri', 'Keraniganj', 'Khilgaon', 'Khilkhet', 'Khulna City',
             'Khulshi', 'Kotwali', 'Kuril', 'Kushtia Sadar', 'Lakshmipur Sadar', 'Lal Khan Bazaar', 'Lalbag',
             'Lalbagh', 'Lalmatia', 'Lalpur', 'Madaripur Sadar', 'Maghbazar', 'Magura Sadar', 'Malibagh',
             'Manikganj Sadar', 'Maniknagar', 'Mirpur', 'Mirsharai', 'Modhubag', 'Moghbazar', 'Mohakhali',
             'Mohakhali Dohs', 'Mohammadpur', 'Mohammadpur ', 'Mongla', 'Motijheel', 'Moulvibazar Sadar',
             'Mugda', 'Mugda Para', 'Mugdapara', 'Muktagacha', 'Munshiganj Sadar', 'Muradpur',
             'Mymensingh City', 'Nadda', 'Nandipara', 'Nangalkot', 'Naogaon Sadar', 'Narayanganj',
             'Narsingdi Sadar', 'Natore Sadar', 'Netrokona Sadar', 'New Market', 'Niketan', 'Nikunja',
             'Nilphamari Sadar', 'Noakhali Sadar', 'North  Nandipara', 'North Shahjahanpur', 'Pabna Sadar',
             'Pakundia', 'Pallabi ', 'Paltan', 'Panchagarh Sadar', 'Panchlaish', 'Paribagh', 'Patenga',
             'Patuakhali Sadar', 'Purbachal', 'Puthia', 'Railway Colony', 'Rajasthali', 'Rajbari Sadar',
             'Rajoir', 'Rajshahi City', 'Ramna', 'Rampura', 'Rangpur City', 'Ranisankail', 'Riaj Uddin Bazar',
             'Rupganj', 'Rupnagar', 'Rupsha', 'Sabujbag', 'Sagorika Bscic Industrial Area', 'Sakhipur',
             'Sarishabari', 'Satkhira Sadar', 'Savar', 'Senpara Porbota', 'Shah Ali', 'Shahbag ', 'Shahbagh',
             'Shahjahanpur', 'Shajahanpur', 'Shantinagar', 'Shariatpur Sadar', 'Shegunbagicha',
             'Sher E Bangla Nagar ', 'Sherpur Sadar', 'Shibpur', 'Shiddheswari', 'Shiddhirganj', 'Sholokbahar',
             'Shyamoli', 'Shyampur', 'Shyampur ', 'Siddeshwari', 'Singiar', 'Sirajganj Sadar', 'Sitakunda',
             'Sonargaon', 'South Banasree', 'Sreemangal', 'Sreepur', 'Sunamganj Sadar', 'Sutrapur',
             'Sylhet City', 'Taltali', 'Taltola', 'Tangail Sadar', 'Tarakanda', 'Tejgaon', 'Tejgaon I/A',
             'Tetulia', 'Thakurgaon Sadar', 'Tongi', 'Turag', 'Ullapara', 'Uttar Khan', 'Uttar Lalkhan',
             'Uttara', 'Uttara East', 'Uttara West', 'Uttarkhan', 'Vatara ', 'Wari', 'West Khulshi',
             'West Rampura', 'Zafrabad', 'Zindabazar', 'Akhaura', 'Akkelpur', 'Alamdanga', 'Badarganj', 'Bagerhat',
             'Bagha', 'Bajitpur', 'Bandarban', 'Banshkhali', 'Baraigram', 'Barguna', 'Barlekha', 'Barura', 'Basurhat',
             'Beani Bazar', 'Belkuchi', 'Benapole', 'Bera', 'Bhairab', 'Bhaluka', 'Bhanga', 'Bhangura', 'Bheramara',
             'Bhola', 'Bhuapur', 'Birampur', 'Birganj', 'Boalmari', 'Chakaria', 'Chandanaish', 'Chandina', 'Chandpur',
             'Char Fasson', 'Charghat', 'Chatkhil', 'Chauddagram', 'Chaugachha', 'Chaumohoni', 'Chhagalnaiya',
             'Chhatak', 'Chhengarchar', 'Chuadanga', 'Chunarughat', 'Coxs Bazar', 'Daganbhuiyan', 'Darshana',
             'Daudkandi', 'Debidwar', 'Derai', 'Dewanganj', 'Dhamrai', 'Dhanbari', 'Dohar', 'Dupchanchia', 'Durgapur',
             'Durgapur', 'Faridganj', 'Faridpur', 'Feni', 'Fulbaria', 'Gabtali', 'Gaffargaon', 'Gaibandha', 'Galachipa',
             'Gangni', 'Gaurnadi', 'Ghatail', 'Ghoraghat', 'Ghorashal', 'Goalunda Ghat', 'Gobindaganj', 'Godagari',
             'Golapganj', 'Gopalganj', 'Gopalpur', 'Gopalpur', 'Gouripur', 'Gurudaspur', 'Habiganj', 'Hajiganj',
             'Hakimpur', 'Haragacha', 'Harinakundu', 'Hatiya', 'Homna', 'Hossainpur', 'Ishwardi', 'Ishwarganj',
             'Islampur', 'Jagannathpur', 'Jaipurhat', 'Jajira', 'Jaldhaka', 'Jhalakati', 'Jhenaidah', 'Jhikargacha',
             'Jibannagar', 'Kachua', 'Kalaroa', 'Kalia', 'Kaliakair', 'Kaliganj', 'Kaliganj', 'Kalihati', 'Kalkini',
             'Kanaighat', 'Kanchan', 'Karimganj', 'Kasba', 'Katakhali', 'Katiadi', 'Kendua', 'Keshabpur', 'Kesharhat',
             'Khagrachhari', 'Kishoreganj', 'Kotchandpur', 'Kulaura', 'Kuliarchar', 'Kumarkhali', 'Kurigram', 'Kushtia',
             'Laksham', 'Lakshmipur', 'Lalmohan', 'Lalmonirhat', 'Lama', 'Lohagara', 'Madarganj', 'Madaripur',
             'Madhabdi', 'Madhabpur', 'Madhupur', 'Magura', 'Maheshkhali', 'Maheshpur', 'Manikganj', 'Manirampur',
             'Matiranga', 'Matlab', 'Maulvi Bazar', 'Mehendiganj', 'Meherpur', 'Melandaha', 'Mirkadim', 'Mirpur',
             'Mirzapur', 'Mohanganj', 'Mongla', 'Morrelganj', 'Muksudpur', 'Muktagachha', 'Muladi', 'Mundumala',
             'Munshiganj', 'Nabiganj', 'Nabinagar', 'Nageshwari', 'Nakla', 'Nalchiti', 'Nalitabari', 'Nandail',
             'Nangalkot', 'Naohata', 'Narail', 'Naria', 'Narsingdi', 'Natore', 'Nazipur', 'Netrakona', 'Nilphamari',
             'Noakhali', 'Noapara', 'Pakundia', 'Panchagarh', 'Panchbibi', 'Pangsha', 'Parbatipur', 'Parshuram',
             'Patgram', 'Patiya', 'Patuakhali', 'Phulbari', 'Phulpur', 'Pirganj', 'Pirojpur', 'Puthia', 'Rahanpur',
             'Raipur', 'Raipura', 'Rajbari', 'Ramganj', 'Ramgarh', 'Ramgati', 'Rangamati', 'Rangunia', 'Raozan',
             'Saidpur', 'Sakhipur', 'Sandwip', 'Santahar', 'Santhia', 'Sarishabari', 'Satkania', 'Satkhira', 'Savar',
             'Senbagh', 'Setabganj', 'Shahjadpur', 'Shahrasti', 'Shailkupa', 'Shaistaganj', 'Shariatpur', 'Sherpur',
             'Sherpur', 'Shibchar', 'Shibganj', 'Shibganj', 'Shibpur', 'Singair', 'Singra', 'Sirajganj', 'Sitakunda',
             'Sonagazi', 'Sonaimuri', 'Sonargaon', 'Sonatala', 'Sreebardi', 'Sreemangal', 'Sreepur', 'Sujanagar',
             'Sunamganj', 'Swarupkati', 'Tanore', 'Tarabo', 'Teknaf', 'Thakurgaon', 'Trishal', 'Ulipur', 'Ullahpara']

    try:

        # if it finds match with the input, it returns true.
        areas.index(name)
        return True

    except:

        # if it doesn't find any match with the input, it returns false.
        return False

In [5]:
# Define folder locations
raw_data_folder="."
cleaned_data_folder="."

In [6]:
df_bdstall = pd.read_csv(f"{raw_data_folder}/Farjana_bdstall.csv")

In [7]:
# A quick look at the raw data
df_bdstall.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
Amenities,"4 months ago,Bangladesh,Land Share,Drawing,Din...","4 months ago,Bangladesh,Land Share,1 Drawing,1...","5 months ago,Bangladesh,New Apartment,1 Drawin...","5 months ago,Bangladesh,New Apartment, Drawing...","5 months ago,Bangladesh,Land Share,Drawing, Di...","5 months ago,Bangladesh,Land Share, Drawing , ...","5 months ago,Bangladesh,New Apartment,2 Verand...","4 months ago,Bangladesh,Land Share,Drawing,Din...","4 months ago,Bangladesh,New Apartment,Drawing,...","4 months ago,Bangladesh,Land Share,Drawing,Din...",...,"11 months ago,Bangladesh,New Apartment,1 Drawi...","11 months ago,Bangladesh,Used Apartment,1 Draw...","11 months ago,Bangladesh,New Apartment,1 Drawi...","11 months ago,Bangladesh,New Apartment,1 Drawi...","11 months ago,Bangladesh,Land Share,1 Drawing,...","11 months ago,Bangladesh,New Apartment,1 Drawi...","11 months ago,Bangladesh,New Apartment,1 Drawi...","11 months ago,Bangladesh,Land Share,1 Drawing,...","11 months ago,Bangladesh,Used Apartment,1 Draw...","11 months ago,Bangladesh,New Apartment,1 Drawi..."
Bathroom,3 Bathroom,3 Bathroom,3 Bathroom,3 Bathroom,3 Bathroom,3 Bathroom,3 Bathroom,3 Bathroom,2 Bathroom,2 Bathroom,...,3 Bathroom,2 Bathroom,2 Bathroom,3 Bathroom,3 Bathroom,4 Bathroom,3 Bathroom,3 Bathroom,4 Bathroom,3 Bathroom
Bed,3 Bed,3 Bed,3 Bed,3 Bed,3 Bed,,,3 Bed,3 Bed,3 Bed,...,3 Bed,2 Bed,3 Bed,3 Bed,3 Bed,4 Bed,,3 Bed,4 Bed,3 Bed
Description,Shares of 1350 Sqft land share in Banasree M B...,Shares of 1250 Sqft land will be sold on the o...,A 1252 Sqft flat will be sold in Arshi Nagar a...,Shares of 1200 sqft apartment in South Keran...,1500 Sqft flat land share will sale in importa...,Shares of 1300 Sqft land share in Aftabnagar ...,1350 sqft apartments will be sold next to Ramp...,Shares of 1200 sqft land share in Rampura Bana...,1200 Sqft apartment will be sold in a noise fr...,Shares of 1050 Sqft land share in Trimohoni M...,...,Almost Ready flat of 1200 Sqft will be sold on...,Near Faraji Hospital in Banasree G block a 950...,One 1450 Sqft flat will be sold on the 4th flo...,Ready flats of 1300 Sqft will be sold in the C...,Shares of 1400 sqft flats and 100 sqft shops w...,Ready flats 2150 Sqft will be sold in the F-b...,A ready flat of 1430 Sqft will be sold in Chit...,Shares of 1350 sqft of land will be sold at Gr...,"Near Faraji Hospital in Banasree G Block, an 1...",A ready flat on the 7th floor of 1650 Sqft wit...
Location,Banasree M Block.,"Opposite Banasree Police Park, Rampura",Arshi Nagar,"Bashundhara Riverview , Block - A.",Bijoy Nagar,"Aftabnagar M Block , Sector - 4.",Banasree E Block.,Banasree F Block.,North Nandipara,Trimohoni,...,"Ulon Road, West Rampura","G Block , Banasree",South Banasree B Block,chawkbazar,"Banasree M Block, Rampura",F Block,Chandanpur,"Green Model Town, Mugda",G Block,Askardighi.
PricePerMonth,1500000,2500000,5200000,5200000,6500000,3500000,8500000,3500000,5200000,1700000,...,6000000,5000000,6500000,6500000,1800000,16500000,7000000,1550000,10000000,10200000
PropertyType,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,...,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment
Size,1350 Sqft,1250 Sqft,1252 Sqft,1200 Sqft,1500 Sqft,1300 Sqft,1350 Sqft,1200 Sqft,1200 Sqft,1050 Sqft,...,1200 Sqft,950 Sqft,1450 Sqft,1300 Sqft,1400 Sqft Apartment &100 Sqft Shop,2150 Sqft,1430 Sqft,1350 Sqft,1800 Sqft,1650 Sqft
Status,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,...,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock,Out of Stock
url,https://www.bdstall.com/details/near-hatirjhee...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/near-mohammadp...,https://www.bdstall.com/details/south-keraniga...,https://www.bdstall.com/details/paltan-vijay-n...,https://www.bdstall.com/details/aftabnagar-m-b...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/banasree-f-blo...,https://www.bdstall.com/details/south-banasree...,https://www.bdstall.com/details/trimohoni-mast...,...,https://www.bdstall.com/details/west-rampura-u...,https://www.bdstall.com/details/banasree-g-blo...,https://www.bdstall.com/details/south-banasree...,https://www.bdstall.com/details/chittagong-cha...,https://www.bdstall.com/details/banasree-comme...,https://www.bdstall.com/details/bashundhara-f-...,https://www.bdstall.com/details/chittagong-cha...,https://www.bdstall.com/details/near-motijheel...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/chittagong-ask...


**How much data do we have in the raw data file**

In [8]:
df_bdstall.shape

(58, 10)

**Print a list of all column names and their the type of values in each column**

In [9]:
print(df_bdstall.columns)

Index(['Amenities', 'Bathroom', 'Bed', 'Description', 'Location',
       'PricePerMonth', 'PropertyType', 'Size', 'Status', 'url'],
      dtype='object')


In [10]:
print(df_bdstall.dtypes)

Amenities        object
Bathroom         object
Bed              object
Description      object
Location         object
PricePerMonth     int64
PropertyType     object
Size             object
Status           object
url              object
dtype: object


**For reference**<br>
**Filer structure requirements in #task-2-data-preprocessing:** <br>
The sample from files (screenshots) provided by @Ekoue LOGOSU-TEKO
<img src="CSV_sample-1.png" alt="Alternative text" />
<img src="CSV_sample-2.png" alt="Alternative text" />


## Construct a cleaned version of the raw CSV


In [11]:
# Creat a new DataFrame for storing the cleaned data
df_bdstall_new = pd.DataFrame()

In [12]:
df_bdstall["Size"].unique()

array(['1350 Sqft', '1250 Sqft', '1252 Sqft', '1200 Sqft', '1500 Sqft',
       '1300 Sqft ', '1050 Sqft', '1400 Sqft', '1750 Sqft', '1300 Sqft',
       '920 Sqft', '1110 Sqft', '2500 Sqft', '1800 Sqft', '960 Sqft',
       '1150 Sqft', '1365 Sqft', '1200  Sqft', '1420 sft', '1050 Sqft ',
       '1100 Sqf', '850 Sqft', '1134 Sqft', '1100 sqft', '1651 Sqft',
       '1575 Square Feet', '2400  Sqft', '950 Sqft', '1450 Sqft',
       '1400 Sqft Apartment &100 Sqft Shop', '2150  Sqft', '1430 Sqft',
       '1650 Sqft'], dtype=object)

In [13]:
df_bdstall_new["area"] = pd.DataFrame(df_bdstall["Size"])

# df_bdstall_new["area"] = pd.DataFrame(df_bdstall["Size"].str.split(" ", expand=True))


# Loop through each element in the 'Size' column and extract the numeric value
for i in range(len(df_bdstall_new['area'])):
    df_bdstall_new['area'][i] = df_bdstall_new['area'][i].split(" ")[0]

# Change data type to float
df_bdstall_new["area"] = df_bdstall_new["area"].astype(float)

In [14]:
# Sanity checks
print(df_bdstall_new["area"].unique())
print()
print(df_bdstall_new.dtypes)


[1350. 1250. 1252. 1200. 1500. 1300. 1050. 1400. 1750.  920. 1110. 2500.
 1800.  960. 1150. 1365. 1420. 1100.  850. 1134. 1651. 1575. 2400.  950.
 1450. 2150. 1430. 1650.]

area    float64
dtype: object


In [15]:
# Check the original values
df_bdstall['PropertyType'].unique()

array(['Apartment'], dtype=object)

In [16]:
df_bdstall_new["building_type"] = df_bdstall['PropertyType']

In [17]:
# Sanity checks
print(df_bdstall_new["building_type"].unique())
print()
print(df_bdstall_new["building_type"].value_counts())

['Apartment']

Apartment    58
Name: building_type, dtype: int64


In [18]:
# The info about 'Residential' vs. 'Commercial' is not available anywhere in the raw data
# Although from some of the URLs it seems that many/most of them are 'Residential', it won't
# be right to just assign 'Residential' to "building_nature" column
df_bdstall_new["building_nature"] = 'NA'


# The following strategy for extracting info from 'Description' doesn't work well
# 
# # Create sets of matching words
# resid_str = {'residential', 'Residential'}
# commer_str = {'commercial', 'Commercial'}

# df_bdstall_new["building_nature"] = np.where(df_bdstall["Description"].apply(lambda x: any(word in x for word in resid_str)), "Residential", 
#                                      np.where(df_bdstall["Description"].apply(lambda x: any(word in x for word in commer_str)), 'Commercial', "NA"))

In [19]:
print(df_bdstall_new["building_nature"].value_counts())

NA    58
Name: building_nature, dtype: int64


In [20]:
# df_bdstall_new[df_bdstall_new["building_nature"] == 'Commercial'].T

In [21]:
print(df_bdstall['Bathroom'].unique())
print()
print(df_bdstall['Bed'].unique())

['3 Bathroom' '2 Bathroom' '4 Bathroom' '5 Bathroom' nan]

['3 Bed' nan '4 Bed' '2 Bed' '5 Bed']


In [22]:
df_bdstall['Bathroom'].dtype
# df_bdstall_new.dtypes

dtype('O')

In [23]:
df_bdstall_new['num_bath_rooms'] = df_bdstall['Bathroom']
df_bdstall_new['num_bed_rooms'] = df_bdstall['Bed']

# Replace the NaNs in the number of bathrooms and bedrooms with a zero (*string*)
# (since NaNs are treated as float and then the string function split() gives error)
df_bdstall_new["num_bath_rooms"].fillna('0', inplace=True)
df_bdstall_new["num_bed_rooms"].fillna('0', inplace=True)

# Loop through each element in columns and modify the string
for i in range(len(df_bdstall_new['num_bath_rooms'])):
    df_bdstall_new['num_bath_rooms'][i] = df_bdstall_new['num_bath_rooms'][i].split(" ")[0]
    df_bdstall_new['num_bed_rooms'][i] = df_bdstall_new['num_bed_rooms'][i].split(" ")[0]

# Change data type to float
df_bdstall_new['num_bath_rooms'] = df_bdstall_new['num_bath_rooms'].astype(float)
df_bdstall_new['num_bed_rooms'] = df_bdstall_new['num_bed_rooms'].astype(float)

df_bdstall_new.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bdstall_new['num_bath_rooms'][i] = df_bdstall_new['num_bath_rooms'][i].split(" ")[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bdstall_new['num_bed_rooms'][i] = df_bdstall_new['num_bed_rooms'][i].split(" ")[0]


area               float64
building_type       object
building_nature     object
num_bath_rooms     float64
num_bed_rooms      float64
dtype: object

In [24]:
# Sanity checks
print(df_bdstall_new['num_bath_rooms'].unique())
print()
print(df_bdstall_new['num_bed_rooms'].unique())

print()

print(df_bdstall_new['num_bath_rooms'].value_counts())
print()
print(df_bdstall_new['num_bed_rooms'].value_counts())

[3. 2. 4. 5. 0.]

[3. 0. 4. 2. 5.]

3.0    34
2.0    10
0.0     7
4.0     5
5.0     2
Name: num_bath_rooms, dtype: int64

3.0    40
0.0     6
4.0     6
2.0     5
5.0     1
Name: num_bed_rooms, dtype: int64


In [25]:
df_bdstall['PricePerMonth'].describe()

count    5.800000e+01
mean     5.292155e+06
std      3.746155e+06
min      7.450000e+05
25%      2.350000e+06
50%      5.200000e+06
75%      6.675000e+06
max      2.000000e+07
Name: PricePerMonth, dtype: float64

In [26]:
# Add the price column as it is
# NOTE: although the original column name is 'PricePerMonth', 
# after visiting some links the prices seems to be for 'Sale' 
df_bdstall_new["price"] = df_bdstall['PricePerMonth']

In [27]:
# Add the property_description column as it is
df_bdstall_new["property_description"] = df_bdstall['Description']


# https://www.bdstall.com/details/near-hatirjheel-banasree-1350-sqft-apartment-land-share-83817/

In [28]:
# There doesn't seem to be any extra overview info available other than the description included above
df_bdstall_new["property_overview"] = "NA"

In [29]:
# Add the URL column as it is
df_bdstall_new["property_url"] = df_bdstall["url"]

In [30]:
# Although the original 'price' column had name 'PricePerMonth', 
# after visiting some links and based on the details in the description 
# the prices seems to be for 'Sale'

# Create sets of matching words
sale_str = {'sale', 'Sale', 'sold', 'Sold'}
rent_str = {'rent', 'Rent'}

df_bdstall_new["purpose"] = np.where(df_bdstall["Description"].apply(lambda x: any(word in x for word in sale_str)), "Sale", 
                                     np.where(df_bdstall["Description"].apply(lambda x: any(word in x for word in rent_str)), 'Rent', "NA"))

In [29]:
# Sanity checks
print(df_bdstall_new["purpose"].unique())
print()
print(df_bdstall_new["purpose"].value_counts())

print()
print(df_bdstall_new[df_bdstall_new["purpose"] == 'NA'])

['Sale' 'NA']

Sale    56
NA       2
Name: purpose, dtype: int64

      area building_type building_nature  num_bath_rooms  num_bed_rooms  \
34  1420.0     Apartment              NA             0.0            0.0   
45  1575.0     Apartment              NA             0.0            0.0   

       price                               property_description  \
34   9800000  Sukrabad, Dhanmondi Dhaka is an 8-story luxury...   
45  11500000  This apartment will be a single unit and will ...   

   property_overview                                       property_url  \
34                NA  https://www.bdstall.com/details/1420-sqft-read...   
45                NA  https://www.bdstall.com/details/bashundhara-so...   

   purpose  
34      NA  
45      NA  


In [44]:
# Extract some address info from the 'Description' column in the raw data

# NOTE: The 'Location' column doesn't have a city name, so we need to extract it from 'Description' column instead
# of using @Shariar Hossain Omee's get_detailed_address() function. But we can use '@Shariar Hossain Omee' for getting
# the 'locality' and 'address' info from the 'Location' column


# Define a function to check for the presence of words in a string
def find_word_in_string(string, word_list):
    for i, word in enumerate(word_list):
        if word in string:
            return word
    return 'NA'

# Create a list of cities in Bangladesh
cities = ['Bandar', 'Barishal', 'Bhairab', 'Bogura', 'Brahmanbaria', 'Chandpur', 'Chattogram', 'Chittagong', 
          'Chowmuhani', 'Chuadanga', "Cox's Bazar", 'Cumilla', 'Cumilla Sadar Dakshin', 'Dhaka', 
          'Dinajpur', 'Faridpur', 'Feni', 'Gazipur', 'Jamalpur', 'Jashore', 'Jhenaidah', 'Kaliakair', 
          'Khulna', 'Kishoreganj', 'Kushtia', 'Maijdee', 'Mymensingh', 'Naogaon', 'Narayanganj', 
          'Narsingdi', 'Nawabganj', 'Pabna', 'Rajshahi', 'Rangpur', 'Saidpur', 'Satkhira', 'Savar', 
          'Siddhirganj', 'Sirajganj', 'Sreepur', 'Sylhet', 'Tangail', 'Tarabo', 'Tongi']
# NOTE:
# 'Chattogram' is district and also a city
# 'Chattogram' is also called 'Chittagong'


df_bdstall_new['city'] = df_bdstall['Description'].apply(find_word_in_string, args=(cities,))


In [45]:
print(df_bdstall_new['city'].value_counts())
# print()
# print(df_bdstall_new['district'].value_counts())
# print()
# print(df_bdstall_new['municipality'].value_counts())

NA             38
Dhaka          11
Chittagong      7
Narayanganj     2
Name: city, dtype: int64


In [46]:
# 
# @Ekoue LOGOSU-TEKO:
# 
# Hello @channel.
# @Shariar Hossain Omee
#  has created a function to split location into relevant parts. 
# It can be found here: 
# https://github.com/OmdenaAI/dhaka-bangladesh-real-estate-recommendation/blob/main/src/tasks/task-2-data-preprocessing/functions/address_extractor.py
# 
# The code return a dictionary having the following keys: City, Area, Address. They are to become the following columns in the cleaned dataset:
# City -> city
# Area -> locality
# Address -> address

# NOTE: The location column doesn't have a city name, so we need to extract it from 'Description' column instead
# of using @Shariar Hossain Omee's get_detailed_address() function. But we can use '@Shariar Hossain Omee' for getting
# the 'locality' info from 'Location' column


# use get_detailed_address() getting the locality and address info
# city = []
locality = []
address = []

# iterate through the 'name' column
for index, desc in df_bdstall.iterrows():
    detailed_address = get_detailed_address(desc['Location'])
#     city.append(detailed_address['city'])
    locality.append(detailed_address['area'])
    address.append(detailed_address['address'])


In [47]:
# Add the locality and address info
df_bdstall_new['locality'] = locality
df_bdstall_new['address'] = address

In [48]:
df_bdstall_new.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
area,1350.0,1250.0,1252.0,1200.0,1500.0,1300.0,1350.0,1200.0,1200.0,1050.0,...,1200.0,950.0,1450.0,1300.0,1400.0,2150.0,1430.0,1350.0,1800.0,1650.0
building_type,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,...,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment
building_nature,,,,,,,,,,,...,,,,,,,,,,
num_bath_rooms,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,...,3.0,2.0,2.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0
num_bed_rooms,3.0,3.0,3.0,3.0,3.0,0.0,0.0,3.0,3.0,3.0,...,3.0,2.0,3.0,3.0,3.0,4.0,0.0,3.0,4.0,3.0
price,1500000,2500000,5200000,5200000,6500000,3500000,8500000,3500000,5200000,1700000,...,6000000,5000000,6500000,6500000,1800000,16500000,7000000,1550000,10000000,10200000
property_description,Shares of 1350 Sqft land share in Banasree M B...,Shares of 1250 Sqft land will be sold on the o...,A 1252 Sqft flat will be sold in Arshi Nagar a...,Shares of 1200 sqft apartment in South Keran...,1500 Sqft flat land share will sale in importa...,Shares of 1300 Sqft land share in Aftabnagar ...,1350 sqft apartments will be sold next to Ramp...,Shares of 1200 sqft land share in Rampura Bana...,1200 Sqft apartment will be sold in a noise fr...,Shares of 1050 Sqft land share in Trimohoni M...,...,Almost Ready flat of 1200 Sqft will be sold on...,Near Faraji Hospital in Banasree G block a 950...,One 1450 Sqft flat will be sold on the 4th flo...,Ready flats of 1300 Sqft will be sold in the C...,Shares of 1400 sqft flats and 100 sqft shops w...,Ready flats 2150 Sqft will be sold in the F-b...,A ready flat of 1430 Sqft will be sold in Chit...,Shares of 1350 sqft of land will be sold at Gr...,"Near Faraji Hospital in Banasree G Block, an 1...",A ready flat on the 7th floor of 1650 Sqft wit...
property_overview,,,,,,,,,,,...,,,,,,,,,,
property_url,https://www.bdstall.com/details/near-hatirjhee...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/near-mohammadp...,https://www.bdstall.com/details/south-keraniga...,https://www.bdstall.com/details/paltan-vijay-n...,https://www.bdstall.com/details/aftabnagar-m-b...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/banasree-f-blo...,https://www.bdstall.com/details/south-banasree...,https://www.bdstall.com/details/trimohoni-mast...,...,https://www.bdstall.com/details/west-rampura-u...,https://www.bdstall.com/details/banasree-g-blo...,https://www.bdstall.com/details/south-banasree...,https://www.bdstall.com/details/chittagong-cha...,https://www.bdstall.com/details/banasree-comme...,https://www.bdstall.com/details/bashundhara-f-...,https://www.bdstall.com/details/chittagong-cha...,https://www.bdstall.com/details/near-motijheel...,https://www.bdstall.com/details/banasree-near-...,https://www.bdstall.com/details/chittagong-ask...
purpose,Sale,Sale,Sale,Sale,Sale,Sale,Sale,Sale,Sale,Sale,...,Sale,Sale,Sale,Sale,Sale,Sale,Sale,Sale,Sale,Sale


In [49]:
# Save cleaned dataset to csv
df_bdstall_new.to_csv(f"{cleaned_data_folder}/Farjana_bdstall-cleaned-Umesh.csv", index=False)

In [50]:
# Open the saved CSVs and check the data
df_test = pd.read_csv(f"{cleaned_data_folder}/Farjana_bdstall-cleaned-Umesh.csv")
df_test.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area,58.0,,,,1343.396552,320.39126,850.0,1200.0,1300.0,1400.0,2500.0
building_type,58.0,1.0,Apartment,58.0,,,,,,,
building_nature,0.0,,,,,,,,,,
num_bath_rooms,58.0,,,,2.62069,1.167207,0.0,2.0,3.0,3.0,5.0
num_bed_rooms,58.0,,,,2.741379,1.068863,0.0,3.0,3.0,3.0,5.0
price,58.0,,,,5292155.172414,3746154.649732,745000.0,2350000.0,5200000.0,6675000.0,20000000.0
property_description,58.0,58.0,Shares of 1350 Sqft land share in Banasree M B...,1.0,,,,,,,
property_overview,0.0,,,,,,,,,,
property_url,58.0,58.0,https://www.bdstall.com/details/near-hatirjhee...,1.0,,,,,,,
purpose,56.0,1.0,Sale,56.0,,,,,,,


In [42]:
# # Open the saved CSVs and check the data
# df_test = pd.read_csv(f"{cleaned_data_folder}/pbazaar-cleaned_WithSomeExtraInfo-Umesh.csv")
# df_test.describe(include='all').T

<br>

## TEMP!!



In [43]:
# df_bdstall_new.describe(include='all').T

In [44]:
# df_bdstall['Amenities'].head()