In [513]:
# Uncomment to change the width of the page

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [1]:
# Imports
import pandas as pd
import numpy as np
import re
from googletrans import Translator


In [493]:
# =============================================================================================================
# A Python function for translating text from a Series to English (pandas.core.series.Series)
# =============================================================================================================


# NOTE: 
# -- This function may not always work because Google translate limits the number of request and 
#    throws exceptions after N no. of requests where N could be anywhere between 80 - 200.
#    Exception => "ReadTimeout: The read operation timed out"
# -- Google translate also puts a limit on the amount of text it translates at a time (15k)

def translate_Series(Series):
#     
# Specifying multiple translation URLs makes it randomly chooses a domain which probably also avoids the timeout problem. 
# Here there are only nine domains, but in theory one could give a list of all to have better chances for the request 
# to not go to the same domain.
# 
    translator = Translator(service_urls=['translate.google.com', 'translate.google.co.kr', 'translate.google.co.in', 
                                          'translate.google.de', 'translate.google.co.uk', 'translate.google.fr',
                                          'translate.google.it', 'translate.google.es', 'translate.google.nl'])
    Series_trans = []

#     A counter for re-creating the 'translator' object
    j = 0

#     loop through the Series and translate each element
    for i in range(0, len(Series)):
#         print()
        print(f"i = {i}")
#         print()
        
        string = str(Series[i])
        translated_text = translator.translate(string).text
        
#         for an empty string google translate strangely returns 'into' as the translation
#         change that to an empty string
        if(translated_text == 'into'):
            translated_text = ''

        Series_trans.append(translated_text)
        j += 1
        
#         If j > 50 re-create the 'translator' object
        if j > 50:
            del translator
            translator = Translator(service_urls=['translate.google.com', 'translate.google.co.kr', 'translate.google.co.in', 
                                                  'translate.google.de', 'translate.google.co.uk', 'translate.google.fr',
                                                  'translate.google.it', 'translate.google.es', 'translate.google.nl'])
            j = 0
#             print()
#             print("****** j > 50 *****")
#             print()
    return (Series_trans)


In [647]:
"""
 The functions below are developed by @Shariar Hossain Omee

 get_detailed_address() :
    - Take a full comma separated address as input
    - Split the address into City, Area, Address
    - Return a dictionary containing City, Area (a.k.a. "Locality"), Address as keys

This function splits input address according to the commas, then it checks each separated string with the values in
the arrays which we pre-defined in the function. It will return one area name under the 'Area' key and one city name
under the 'City' key, if it could match them with the pre-defined areas and cities in the function , the rest of the
string or address will be under the 'Address' key.

For example,
    Input --> "Block M, South Banasree Project, Banasree, Dhaka"
    Output --> {"city": "Dhaka", "area": "Banasree", "address": "Block M, South Banasree Project"}

Note: The input has to be comma separated in order to get meaningful output like the example above. Otherwise, it
won't recognize the address properly.

please, see the commented notebook of bproperty (https://github.com/OmdenaAI/dhaka-bangladesh-real-estate-
recommendation/blob/main/src/tasks/task-2-data-preprocessing/bproperty%20--%20cleaning/bproperty%20--%20cleaning.ipynb)
for use.

Note: This function doesn't have all the areas and cities of Bangladesh yet. I am adding them periodically according
to the upload cleaned dataset.
"""


def get_detailed_address(address):
    try:
        # converting the initial letter of each word to a capital letter of input.
        address = address.title()

        # defining output dictionary
        address_dict = {"city": "", "area": "", "address": ""}

        # splitting the input according to commas
        splitted_address = address.split(',')

        # getting each splitted and checking them with pre-defined address and area names.
        for i in reversed(splitted_address):

            # calling get_city_name() and passing name from splitted address
            if get_city_name(i.strip().replace('.', '')):
                # assigning matched city name under the "city" key.
                address_dict["city"] = i.strip().replace('.', '')
                # removing the matched name from the splitted address list.
                splitted_address.remove(i)

            # calling get_area_name() and passing name from splitted address
            elif get_area_name(i.strip().replace('.', '')):
                # assigning matched area name under the "area" key.
                address_dict["area"] = i.strip().replace('.', '')
                # removing the matched name from the splitted address list.
                splitted_address.remove(i)

        # joining the rest of the input and assigning it under the "address" key.
        address_dict["address"] = ','.join(splitted_address)

        # returning the output dictionary
        return address_dict

    except:

        # if any exception occurs, it assigns the whole input under the "address" key and return the dictionary
        return {"city": "", "area": "", "address": address}


def get_city_name(name):

    # a list containing different cities of Bangladesh
    cities = ['Dhaka', 'Chattogram', 'Narayanganj City', 'Gazipur', 'Sylhet', 'Barishal', 'Bhairab', 'Bogura',
              'Brahmanbaria', 'Chandpur', 'Chittagong', 'Chowmuhani', 'Chuadanga', 'Coxs Bazar',
              'Cumilla', 'Cumilla Sadar Dakshin', 'Dinajpur', 'Faridpur', 'Feni', 'Gazipur', 'Jamalpur',
              'Jashore', 'Jhenaidah', 'Khulna', 'Kishoreganj', 'Kushtia', 'Maijdee', 'Mymensingh',
              'Naogaon', 'Narayanganj', 'Narsingdi', 'Nawabganj', 'Pabna', 'Rajshahi', 'Rangpur', 'Saidpur', 'Satkhira',
              'Savar', 'Siddhirganj', 'Sirajganj', 'Sreepur', 'Tangail', 'Tarabo', 'Tongi']

    try:

        # if it finds match with the input, it returns true.
        cities.index(name)
        return True

    except:

        # if it doesn't find any match with the input, it returns false.
        return False


def get_area_name(name):

    # a list containing different areas of Bangladesh
    areas = ['10 No. North Kattali Ward', '11 No. South Kattali Ward', '15 No. Bagmoniram Ward',
             '16 No. Chawk Bazaar Ward', '22 No. Enayet Bazaar Ward', '29 No. West Madarbari Ward',
             '30 No. East Madarbari Ward', '31 No. Alkoron Ward', '32 No. Andarkilla Ward',
             '33 No. Firingee Bazaar Ward', '36 Goshail Danga Ward', '4 No Chandgaon Ward',
             '7 No. West Sholoshohor Ward', '9 No. North Pahartali Ward', 'Adabor', 'Aftab Nagar', 'Aftabnagar',
             'Agargaon', 'Airport', 'Akkelpur', 'Ambarkhana', 'Araihazar', 'Badda', 'Bagerhat Sadar', 'Bagha',
             'Bakalia', 'Banani', 'Banani Dohs', 'Banashree', 'Banasree', 'Bandar', 'Bandarban Sadar',
             'Banglamotor', 'Banglamotors', 'Bangshal', 'Banshkhali', 'Barguna Sadar', 'Baridhara',
             'Baridhara Dohs', 'Barishal City', 'Basabo', 'Bashabo', 'Bashundhara', 'Bashundhara R-A',
             'Bashundhara R/A', 'Bashundhara RA', 'Bashundhara Riverview', 'Bayazid', 'Belabo', 'Bhairab',
             'Bhaluka', 'Bhandaria', 'Bhashantek ', 'Bhola Sadar', 'Birampur', 'Boalkhali', 'Boalmari',
             'Bogura Sadar', 'Bosila', 'Botiaghata', 'Brahmanbaria Sadar', 'Cantonment', 'Chack Bazar',
             'Chandanpur', 'Chandpur Sadar', 'Chandra', 'Chapainawabganj Sadar', 'Chattogram City',
             'Chauddagram', 'Chawkbazar', 'Chhagalnaiya', 'Chunarughat', "Cox's Bazar Sadar", 'Cumilla City',
             'DOHS Banani', 'DOHS Baridhara', 'DOHS Mirpur', 'DOHS Mohakhali', 'Dakshin Khan', 'Dakshinsurma',
             'Daskhinkhan', 'Debidwar', 'Demra', 'Dhamrai', 'Dhanmondi', 'Digholia', 'Dinajpur Sadar', 'Dohar ',
             'Double Mooring', 'Dumni', 'East Nasirabad', 'East Rampura', 'Eskaton', 'Fakirhat',
             'Faridpur Sadar', 'Farmgate', 'Fatulla', 'Fenchuganj', 'Feni Sadar', 'Firojshah Colony',
             'Fulbaria', 'Gaibandha Sadar', 'Gajaria', 'Gandaria ', 'Gazipur Sadar', 'Gazipur Sadar Upazila',
             'Ghatail', 'Gopalganj Sadar', 'Gulistan', 'Gulshan', 'Gulshan 1', 'Gulshan 2', 'Habiganj Sadar',
             'Halishahar', 'Hathazari', 'Hatirpool', 'Hazaribag', 'Hazaribag ', 'Ibrahimpur', 'Jaintiapur',
             'Jalalabad Housing Society', 'Jamal Khan', 'Jamalpur Sadar', 'Jashore Sadar', 'Jatra Bari',
             'Jatrabari', 'Jhalakathi Sadar', 'Jhenaidah Sadar', 'Joar Sahara', 'Joypurhat Sadar', 'Kachukhet',
             'Kadamtali', 'Kafrul', 'Kakrail', 'Kalabagan', 'Kalachandpur', 'Kalapara', 'Kaliakair', 'Kaliganj',
             'Kalkini', 'Kallaynpur', 'Kamarkhand', 'Kamrangir Char', 'Kamrangirchar', 'Karnafuli',
             'Karwan Bazar', 'Kathalbagan', 'Kazir Dewri', 'Keraniganj', 'Khilgaon', 'Khilkhet', 'Khulna City',
             'Khulshi', 'Kotwali', 'Kuril', 'Kushtia Sadar', 'Lakshmipur Sadar', 'Lal Khan Bazaar', 'Lalbag',
             'Lalbagh', 'Lalmatia', 'Lalpur', 'Madaripur Sadar', 'Maghbazar', 'Magura Sadar', 'Malibagh',
             'Manikganj Sadar', 'Maniknagar', 'Mirpur', 'Mirsharai', 'Modhubag', 'Moghbazar', 'Mohakhali',
             'Mohakhali Dohs', 'Mohammadpur', 'Mohammadpur ', 'Mongla', 'Motijheel', 'Moulvibazar Sadar',
             'Mugda', 'Mugda Para', 'Mugdapara', 'Muktagacha', 'Munshiganj Sadar', 'Muradpur',
             'Mymensingh City', 'Nadda', 'Nandipara', 'Nangalkot', 'Naogaon Sadar', 'Narayanganj',
             'Narsingdi Sadar', 'Natore Sadar', 'Netrokona Sadar', 'New Market', 'Niketan', 'Nikunja',
             'Nilphamari Sadar', 'Noakhali Sadar', 'North  Nandipara', 'North Shahjahanpur', 'Pabna Sadar',
             'Pakundia', 'Pallabi ', 'Paltan', 'Panchagarh Sadar', 'Panchlaish', 'Paribagh', 'Patenga',
             'Patuakhali Sadar', 'Purbachal', 'Puthia', 'Railway Colony', 'Rajasthali', 'Rajbari Sadar',
             'Rajoir', 'Rajshahi City', 'Ramna', 'Rampura', 'Rangpur City', 'Ranisankail', 'Riaj Uddin Bazar',
             'Rupganj', 'Rupnagar', 'Rupsha', 'Sabujbag', 'Sagorika Bscic Industrial Area', 'Sakhipur',
             'Sarishabari', 'Satkhira Sadar', 'Savar', 'Senpara Porbota', 'Shah Ali', 'Shahbag ', 'Shahbagh',
             'Shahjahanpur', 'Shajahanpur', 'Shantinagar', 'Shariatpur Sadar', 'Shegunbagicha',
             'Sher E Bangla Nagar ', 'Sherpur Sadar', 'Shibpur', 'Shiddheswari', 'Shiddhirganj', 'Sholokbahar',
             'Shyamoli', 'Shyampur', 'Shyampur ', 'Siddeshwari', 'Singiar', 'Sirajganj Sadar', 'Sitakunda',
             'Sonargaon', 'South Banasree', 'Sreemangal', 'Sreepur', 'Sunamganj Sadar', 'Sutrapur',
             'Sylhet City', 'Taltali', 'Taltola', 'Tangail Sadar', 'Tarakanda', 'Tejgaon', 'Tejgaon I/A',
             'Tetulia', 'Thakurgaon Sadar', 'Tongi', 'Turag', 'Ullapara', 'Uttar Khan', 'Uttar Lalkhan',
             'Uttara', 'Uttara East', 'Uttara West', 'Uttarkhan', 'Vatara ', 'Wari', 'West Khulshi',
             'West Rampura', 'Zafrabad', 'Zindabazar', 'Akhaura', 'Akkelpur', 'Alamdanga', 'Badarganj', 'Bagerhat',
             'Bagha', 'Bajitpur', 'Bandarban', 'Banshkhali', 'Baraigram', 'Barguna', 'Barlekha', 'Barura', 'Basurhat',
             'Beani Bazar', 'Belkuchi', 'Benapole', 'Bera', 'Bhairab', 'Bhaluka', 'Bhanga', 'Bhangura', 'Bheramara',
             'Bhola', 'Bhuapur', 'Birampur', 'Birganj', 'Boalmari', 'Chakaria', 'Chandanaish', 'Chandina', 'Chandpur',
             'Char Fasson', 'Charghat', 'Chatkhil', 'Chauddagram', 'Chaugachha', 'Chaumohoni', 'Chhagalnaiya',
             'Chhatak', 'Chhengarchar', 'Chuadanga', 'Chunarughat', 'Coxs Bazar', 'Daganbhuiyan', 'Darshana',
             'Daudkandi', 'Debidwar', 'Derai', 'Dewanganj', 'Dhamrai', 'Dhanbari', 'Dohar', 'Dupchanchia', 'Durgapur',
             'Durgapur', 'Faridganj', 'Faridpur', 'Feni', 'Fulbaria', 'Gabtali', 'Gaffargaon', 'Gaibandha', 'Galachipa',
             'Gangni', 'Gaurnadi', 'Ghatail', 'Ghoraghat', 'Ghorashal', 'Goalunda Ghat', 'Gobindaganj', 'Godagari',
             'Golapganj', 'Gopalganj', 'Gopalpur', 'Gopalpur', 'Gouripur', 'Gurudaspur', 'Habiganj', 'Hajiganj',
             'Hakimpur', 'Haragacha', 'Harinakundu', 'Hatiya', 'Homna', 'Hossainpur', 'Ishwardi', 'Ishwarganj',
             'Islampur', 'Jagannathpur', 'Jaipurhat', 'Jajira', 'Jaldhaka', 'Jhalakati', 'Jhenaidah', 'Jhikargacha',
             'Jibannagar', 'Kachua', 'Kalaroa', 'Kalia', 'Kaliakair', 'Kaliganj', 'Kaliganj', 'Kalihati', 'Kalkini',
             'Kanaighat', 'Kanchan', 'Karimganj', 'Kasba', 'Katakhali', 'Katiadi', 'Kendua', 'Keshabpur', 'Kesharhat',
             'Khagrachhari', 'Kishoreganj', 'Kotchandpur', 'Kulaura', 'Kuliarchar', 'Kumarkhali', 'Kurigram', 'Kushtia',
             'Laksham', 'Lakshmipur', 'Lalmohan', 'Lalmonirhat', 'Lama', 'Lohagara', 'Madarganj', 'Madaripur',
             'Madhabdi', 'Madhabpur', 'Madhupur', 'Magura', 'Maheshkhali', 'Maheshpur', 'Manikganj', 'Manirampur',
             'Matiranga', 'Matlab', 'Maulvi Bazar', 'Mehendiganj', 'Meherpur', 'Melandaha', 'Mirkadim', 'Mirpur',
             'Mirzapur', 'Mohanganj', 'Mongla', 'Morrelganj', 'Muksudpur', 'Muktagachha', 'Muladi', 'Mundumala',
             'Munshiganj', 'Nabiganj', 'Nabinagar', 'Nageshwari', 'Nakla', 'Nalchiti', 'Nalitabari', 'Nandail',
             'Nangalkot', 'Naohata', 'Narail', 'Naria', 'Narsingdi', 'Natore', 'Nazipur', 'Netrakona', 'Nilphamari',
             'Noakhali', 'Noapara', 'Pakundia', 'Panchagarh', 'Panchbibi', 'Pangsha', 'Parbatipur', 'Parshuram',
             'Patgram', 'Patiya', 'Patuakhali', 'Phulbari', 'Phulpur', 'Pirganj', 'Pirojpur', 'Puthia', 'Rahanpur',
             'Raipur', 'Raipura', 'Rajbari', 'Ramganj', 'Ramgarh', 'Ramgati', 'Rangamati', 'Rangunia', 'Raozan',
             'Saidpur', 'Sakhipur', 'Sandwip', 'Santahar', 'Santhia', 'Sarishabari', 'Satkania', 'Satkhira', 'Savar',
             'Senbagh', 'Setabganj', 'Shahjadpur', 'Shahrasti', 'Shailkupa', 'Shaistaganj', 'Shariatpur', 'Sherpur',
             'Sherpur', 'Shibchar', 'Shibganj', 'Shibganj', 'Shibpur', 'Singair', 'Singra', 'Sirajganj', 'Sitakunda',
             'Sonagazi', 'Sonaimuri', 'Sonargaon', 'Sonatala', 'Sreebardi', 'Sreemangal', 'Sreepur', 'Sujanagar',
             'Sunamganj', 'Swarupkati', 'Tanore', 'Tarabo', 'Teknaf', 'Thakurgaon', 'Trishal', 'Ulipur', 'Ullahpara']

    try:

        # if it finds match with the input, it returns true.
        areas.index(name)
        return True

    except:

        # if it doesn't find any match with the input, it returns false.
        return False

In [2]:
# Define folder locations
raw_data_folder="."
cleaned_data_folder="."

In [417]:
# **********  NOTE **********
# Uncomment the following line when running the first time
# It's commented out to avoid unintentional 'run' of the cell 
# (NOTE: the google translations later are time consuming and modify this DF; 
# we don't want to read the data into it again and overwrite the translations)
# 
# df_toleter = pd.read_csv(f"{raw_data_folder}/Farjana_toleter.csv")


# Keep the original data from comparison since the above will get modified (translated etc.)
# for various data extraction purposes
df_toleter_orig = pd.read_csv(f"{raw_data_folder}/Farjana_toleter.csv")

In [418]:
# A quick look at the raw data
df_toleter_orig.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,256,257
Amenities,"kitchens: 1,Common Bathroom","kitchens: 1,Common Bathroom,Dining Room","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Dining Room,Sitt...","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Dining Room,Sitt...","kitchens: 1,Attached Bathroom,Dining Room,Sitt...","kitchens: 1,Common Bathroom",...,"kitchens: 1,Attached Bathroom",kitchens: 1,kitchens: 1,kitchens: 1,"kitchens: 1,Attached Bathroom,Common Bathroom,...",kitchens: 1,"kitchens: 3,Attached Bathroom,Common Bathroom,...","kitchens: 3,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Dining Room",
Bathroom,Bathrooms: 1,Bathrooms: 1,Bathrooms: 1,Bathrooms: 2,Bathrooms: 1,Bathrooms: 1,Bathrooms: 2,Bathrooms: 2,Bathrooms: 1,Bathrooms: 1,...,Bathrooms: 1,Bathrooms: 2,Bathrooms: 1,Bathrooms: 2,Bathrooms: 2,Bathrooms: 1,Bathrooms: 3,Bathrooms: 3,Bathrooms: 3,
Bedroom,Bedrooms: 1,Bedrooms: 1,Bedrooms: 1,Bedrooms: 2,Bedrooms: 1,Bedrooms: 1,Bedrooms: 2,Bedrooms: 2,Bedrooms: 1,Bedrooms: 1,...,Bedrooms: 1,Bedrooms: 2,Bedrooms: 3,Bedrooms: 2,Bedrooms: 3,Bedrooms: 2,Bedrooms: 5,Bedrooms: 5,Bedrooms: 3,Bedrooms: 2
Description,"Short-Term Rentals are available with Kitchen,...","Short-Term Rentals with Kitchen, TV, Refrigera...",Two Room Furnished Serviced Apartment RENT in ...,রাজশাহী-ঢাকা হাইওয়ে সংলগ্ন দেওয়ানপাড়া মোড় ...,Two Room Furnished Serviced Apartment RENT in ...,Two Room Furnished Serviced Apartment RENT in ...,Fully Furnished Two Bedroom Serviced Apartment...,Fully Furnished Two Bedroom Serviced Apartment...,Two Room Furnished Serviced Apartment RENT in ...,1 Bedroom Furnished Serviced Apartments for Re...,...,TO-LET 🏛চার তলা ভবনের নিচতলায় ফ্ল্যাট ভাড়া হ...,এই মাস থেকে /এপ্রিল মাসের ১ তারিখ থেকে উঠা যাব...,◆বাসা ভাড়া দেওয়া হবে ( ৫তলা বিল্ডিং এর ২তলায়) ...,বয়রা আজিজার মোর হতে সামান্য দুরে নতুন বিল্ডিং...,Very good ☺️ condition flat. Apartment brand new,মনোরম পরিবেশ দ্বিতীয় তলায় ভাড়া হবে। Dinning sp...,Well-to-do living and easy movement! If you ar...,Well-to-do living and easy movement! If you ar...,"House#03, Rod#4/1, Ward no#37, Anandha Nagar, ...",এপ্রিল মাস ২০২২ ইং থেকে নতুন বিল্ডিং এ বাসা ভা...
Location,"JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...","JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","Dewanpara More,","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...",...,"আদর্শ পাড়া ঈদগা মাঠসংলগ্ন,",বালিয়াপুকুর বড়বটতালা শাহিদ টাওয়ারের পিছে (উপরভ...,"ইউসেফ স্কুলের গলি, হাফিজনগর,","Boyra,","Road 25 house no39 d block mirpur pollobi 12,","Tootpara Farid Molla Mor,","ধানমণ্ডি, রাস্তা- ৩২ (পুরাতন), বাড়ি- ০৩, ফ্ল্য...","ধানমণ্ডি, রাস্তা- ৩২ (পুরাতন), বাড়ি- ০৩, ফ্ল্য...","House#03, Road#4/1, Ward no#37, Anandha Nagar,...",","
MainFeatures,"Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Floors: 3,Parking Spaces,Beautiful View,Balcon...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 800 Sq. Meter,Floors...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Parking Spaces,Electricit...",...,"Parking Spaces,CCTV Security",,,Parking Spaces,"Built in Year : 2022,Size 10500 Sq. Meter,Floo...",Beautiful View,"Built in Year : 2015,Size 1500 Sq. Meter,Floor...","Built in Year : 2015,Size 1500 Sq. Meter,Floor...","Parking Spaces,Balcony,Elevator,Others Main Fe...",
NearByLocation,"Nearby Schools,Nearby Hospitals,Shopping Malls...","Nearby Schools,Nearby Hospitals,Shopping Malls...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Nearby Schools,Public Transport","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Nearby Schools,Nearby Hospitals,Shopping Malls...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Nearby Schools,Nearby Hospitals,Shopping Malls...",...,,,,Nearby Schools,"Airport ( 2 km ),Nearby Schools,Nearby Hospita...",,"Airport ( 5 km ),Nearby Schools,Nearby Hospita...","Airport ( 5 km ),Nearby Schools,Nearby Hospita...",,
OtherFacilities,"Maintenance Staff,Security Staff","Maintenance Staff,Security Staff","Maintenance Staff,Security Staff,Facilities fo...",,"Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff","Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff",Maintenance Staff,...,,,,,Security Staff,,"Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff,Facilities fo...",,
PricePerMonth,31000 Tk,36000 Tk,36000 Tk,9000 Tk,36000 Tk,36000 Tk,48000 Tk,48000 Tk,36000 Tk,31000 Tk,...,3800 Tk,7000 Tk,12000 Tk,8000 Tk,19000 Tk,7500 Tk,5000 Tk,5000 Tk,16000 Tk,9500 Tk
PropertyType,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,...,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment


In [427]:
# 
# **********  NOTE **********
# 
# Uncomment the following line when running the first time
# It's commented out to avoid unintentional 'run' of the cell 
# NOTE: the google translations later are time consuming and unreliable (see function 'translate_Series' above)
# 

# df_toleter['Description'] = translate_Series(df_toleter['Description'])

In [426]:
# 
# **********  NOTE **********
# 
# Uncomment the following line when running the first time
# It's commented out to avoid unintentional 'run' of the cell 
# NOTE: the google translations later are time consuming and unreliable (see function 'translate_Series' above)
# 

# df_toleter['Location'] = translate_Series(df_toleter['Location'])

In [424]:
# A quick check of the translations
df_toleter.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,256,257
Amenities,"kitchens: 1,Common Bathroom","kitchens: 1,Common Bathroom,Dining Room","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Dining Room,Sitt...","kitchens: 1,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Dining Room,Sitt...","kitchens: 1,Attached Bathroom,Dining Room,Sitt...","kitchens: 1,Common Bathroom",...,"kitchens: 1,Attached Bathroom",kitchens: 1,kitchens: 1,kitchens: 1,"kitchens: 1,Attached Bathroom,Common Bathroom,...",kitchens: 1,"kitchens: 3,Attached Bathroom,Common Bathroom,...","kitchens: 3,Attached Bathroom,Common Bathroom,...","kitchens: 1,Attached Bathroom,Dining Room",
Bathroom,Bathrooms: 1,Bathrooms: 1,Bathrooms: 1,Bathrooms: 2,Bathrooms: 1,Bathrooms: 1,Bathrooms: 2,Bathrooms: 2,Bathrooms: 1,Bathrooms: 1,...,Bathrooms: 1,Bathrooms: 2,Bathrooms: 1,Bathrooms: 2,Bathrooms: 2,Bathrooms: 1,Bathrooms: 3,Bathrooms: 3,Bathrooms: 3,
Bedroom,Bedrooms: 1,Bedrooms: 1,Bedrooms: 1,Bedrooms: 2,Bedrooms: 1,Bedrooms: 1,Bedrooms: 2,Bedrooms: 2,Bedrooms: 1,Bedrooms: 1,...,Bedrooms: 1,Bedrooms: 2,Bedrooms: 3,Bedrooms: 2,Bedrooms: 3,Bedrooms: 2,Bedrooms: 5,Bedrooms: 5,Bedrooms: 3,Bedrooms: 2
Description,"Short-Term Rentals are available with Kitchen,...","Short-Term Rentals with Kitchen, TV, Refrigera...",Two Room Furnished Serviced Apartment RENT in ...,A whole new building in the Dewanpara intersec...,Two Room Furnished Serviced Apartment RENT in ...,Two Room Furnished Serviced Apartment RENT in ...,Fully Furnished Two Bedroom Serviced Apartment...,Fully Furnished Two Bedroom Serviced Apartment...,Two Room Furnished Serviced Apartment RENT in ...,1 Bedroom Furnished Serviced Apartments for Re...,...,Flat will be rented on the bottom floor of the...,"From this month /April, the third lock of the ...",The house will be rented (on the 2nd floor of ...,Boyra Aziza More is a little distance from the...,Very good ☺️ condition flat. Apartment brand new,The pleasant environment will be rented on the...,Well-to-do living and easy movement! If you ar...,Well-to-do living and easy movement! If you ar...,"House # 03, Rod # 4/1, Ward No # 37, Negar, Se...",The new building will be rented from April 202...
Location,"JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...","JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","Dewanpara More,","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5 Japan Stre...","JCX TOWER, 1136/A, Block-I, Level 5, Japan Str...",...,"The ideal ladder is the Eidga field adjoining,",Baliapuku's big Batala Nahid Tower behind (Bha...,"Eusef School alley, Hafiznagar,","Boyra,","Road 25 house no 39 d block mirpur pallabi 12,","Tootpara Farid Molla Mor,","Dhanmondi, Road- 12 (Old), Home- 1, Flat-B4,","Dhanmondi, Road- 12 (Old), Home- 1, Flat-B4,","House # 03, Road # 4/1, Ward No # 37, NAKEAR, ...",","
MainFeatures,"Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Floors: 3,Parking Spaces,Beautiful View,Balcon...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 800 Sq. Meter,Floors...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Parking Spaces,Electricit...",...,"Parking Spaces,CCTV Security",,,Parking Spaces,"Built in Year : 2022,Size 10500 Sq. Meter,Floo...",Beautiful View,"Built in Year : 2015,Size 1500 Sq. Meter,Floor...","Built in Year : 2015,Size 1500 Sq. Meter,Floor...","Parking Spaces,Balcony,Elevator,Others Main Fe...",
NearByLocation,"Nearby Schools,Nearby Hospitals,Shopping Malls...","Nearby Schools,Nearby Hospitals,Shopping Malls...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Nearby Schools,Public Transport","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Nearby Schools,Nearby Hospitals,Shopping Malls...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Airport ( 20 km ),Nearby Schools,Nearby Hospit...","Nearby Schools,Nearby Hospitals,Shopping Malls...",...,,,,Nearby Schools,"Airport ( 2 km ),Nearby Schools,Nearby Hospita...",,"Airport ( 5 km ),Nearby Schools,Nearby Hospita...","Airport ( 5 km ),Nearby Schools,Nearby Hospita...",,
OtherFacilities,"Maintenance Staff,Security Staff","Maintenance Staff,Security Staff","Maintenance Staff,Security Staff,Facilities fo...",,"Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff","Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff",Maintenance Staff,...,,,,,Security Staff,,"Maintenance Staff,Security Staff,Facilities fo...","Maintenance Staff,Security Staff,Facilities fo...",,
PricePerMonth,31000 Tk,36000 Tk,36000 Tk,9000 Tk,36000 Tk,36000 Tk,48000 Tk,48000 Tk,36000 Tk,31000 Tk,...,3800 Tk,7000 Tk,12000 Tk,8000 Tk,19000 Tk,7500 Tk,5000 Tk,5000 Tk,16000 Tk,9500 Tk
PropertyType,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,...,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment,Flat / Apartment


In [67]:
df_toleter.shape

(258, 12)

## Construct a cleaned version of the raw CSV


In [685]:
# Creat a new DataFrame for storing the cleaned data
df_toleter_new = pd.DataFrame()


### Area:

In [686]:
# There is no column that captures the area (size) of the property, but the information
# is included in the column along with other text which needs to be extracted using 
# a regular expression

# Search for a pattern: "Size XX Sq. Meter"
pattern = r'Size\s+(\d+)\s+Sq\.\s+Meter'
# pattern = r'(\d+)\s+sqm'

# Apply the pattern to the 'MainFeatures' column using str.extract()
df_toleter_new['area'] = df_toleter['MainFeatures'].str.extract(pattern)

# Convert Sq. Meter to Sq. feet
df_toleter_new['area'] = df_toleter_new['area'].astype(float) * 10.76

In [687]:
# Sanity checks
print(df_toleter_new['area'].dtype)
print()
print(df_toleter_new['area'].notna().value_counts())

float64

True     169
False     89
Name: area, dtype: int64


### Building type & nature:

In [688]:
df_toleter['PropertyType'].unique()

array(['Flat / Apartment'], dtype=object)

In [689]:
# Extract the 'building_type' info from the "PropertyType" column
df_toleter_new['building_type'] = np.where(df_toleter["PropertyType"].str.contains("Apartment"), "Apartment", "NA")

In [690]:
# Sanity checks
print(df_toleter_new['building_type'].dtype)
print()
print(df_toleter_new['building_type'].value_counts())

object

Apartment    258
Name: building_type, dtype: int64


In [691]:
# The info about 'Residential' vs. 'Commercial' is not available anywhere in the raw data
# Although from some of the URLs it seems that many/most of them are 'Residential', it won't
# be right to just assign 'Residential' to "building_nature" column
df_toleter_new["building_nature"] = 'NA'

In [692]:
# Sanity checks
print(df_toleter_new['building_nature'].dtype)
print()
print(df_toleter_new['building_nature'].value_counts())

object

NA    258
Name: building_nature, dtype: int64


### Bathrooms & bedrooms:

In [693]:
print(df_toleter['Bathroom'].unique())
print()
print(df_toleter['Bedroom'].unique())

['Bathrooms: 1' 'Bathrooms: 2' 'Bathrooms: 3' nan 'Bathrooms: 4']

['Bedrooms: 1' 'Bedrooms: 2' 'Bedrooms: 3' nan 'Bedrooms: 4' 'Bedrooms: 5']


In [694]:
df_toleter_new['num_bath_rooms'] = df_toleter['Bathroom']
df_toleter_new['num_bed_rooms'] = df_toleter['Bedroom']

# Replace the NaNs in the number of bathrooms and bedrooms with a zero (*string*)
# (since NaNs are treated as float and then the string function split() gives error)
df_toleter_new["num_bath_rooms"].fillna('Bathrooms: 0', inplace=True)
df_toleter_new["num_bed_rooms"].fillna('Bedrooms: 0', inplace=True)

# Loop through each element in columns and modify the string
for i in range(len(df_toleter_new['num_bath_rooms'])):
    df_toleter_new['num_bath_rooms'][i] = df_toleter_new['num_bath_rooms'][i].split(" ")[1]
    df_toleter_new['num_bed_rooms'][i] = df_toleter_new['num_bed_rooms'][i].split(" ")[1]

# Change data type to float
df_toleter_new['num_bath_rooms'] = df_toleter_new['num_bath_rooms'].astype(float)
df_toleter_new['num_bed_rooms'] = df_toleter_new['num_bed_rooms'].astype(float)

df_toleter_new.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_toleter_new['num_bath_rooms'][i] = df_toleter_new['num_bath_rooms'][i].split(" ")[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_toleter_new['num_bed_rooms'][i] = df_toleter_new['num_bed_rooms'][i].split(" ")[1]


area               float64
building_type       object
building_nature     object
num_bath_rooms     float64
num_bed_rooms      float64
dtype: object

In [695]:
# Sanity checks
print(df_toleter_new['num_bath_rooms'].unique())
print()
print(df_toleter_new['num_bed_rooms'].unique())

print()

print(df_toleter_new['num_bath_rooms'].value_counts())
print()
print(df_toleter_new['num_bed_rooms'].value_counts())

[1. 2. 3. 0. 4.]

[1. 2. 3. 0. 4. 5.]

2.0    141
1.0     95
3.0     11
0.0      9
4.0      2
Name: num_bath_rooms, dtype: int64

2.0    146
1.0     71
3.0     30
0.0      8
5.0      2
4.0      1
Name: num_bed_rooms, dtype: int64


### Price:

In [696]:
df_toleter['PricePerMonth'].describe()

count          258
unique          44
top       48000 Tk
freq            53
Name: PricePerMonth, dtype: object

In [697]:
# Are there NAs
df_toleter['PricePerMonth'].notna().sum() == len(df_toleter)

True

In [698]:
# Add the price column as it is
# NOTE: although the original column name is 'PricePerMonth', 
# after visiting some links the prices seems to be for 'Sale' 
df_toleter_new["price"] = df_toleter['PricePerMonth'].str.replace(' Tk', '')

# df['text'] = df['text'].str.replace(substring_to_exclude, '')

In [699]:
# Replace empty strings with '0' string
df_toleter_new["price"] = df_toleter_new["price"].apply(lambda x: x if x != '' else '0')

# df['values'] = df['values'].apply(lambda x: x if x != '' else '0')

In [700]:
# Convert to float
df_toleter_new["price"] = df_toleter_new["price"].astype(float)

In [701]:
# Sanity checks
print(df_toleter_new["price"].describe())
print()
print(df_toleter_new["price"].value_counts())

# df_toleter_new["price"].value_counts()

count       258.000000
mean      25994.810078
std       24095.550806
min           0.000000
25%        5000.000000
50%       30000.000000
75%       48000.000000
max      170000.000000
Name: price, dtype: float64

48000.0     53
36000.0     38
1.0         25
55000.0     18
2.0         18
35000.0     12
5000.0       9
5500.0       9
6500.0       7
30000.0      6
6000.0       6
0.0          5
7500.0       5
8000.0       4
12000.0      3
7000.0       3
4000.0       3
16000.0      3
9500.0       2
31000.0      2
19000.0      2
170000.0     2
9000.0       2
15000.0      2
22000.0      1
2000.0       1
13500.0      1
4700.0       1
4500.0       1
23000.0      1
8500.0       1
90000.0      1
2500.0       1
11000.0      1
17000.0      1
14000.0      1
4600.0       1
18000.0      1
10000.0      1
13000.0      1
45000.0      1
28000.0      1
3800.0       1
Name: price, dtype: int64


### Property description, overview, URL and purpose:

In [702]:
# Add 'Description' as the 'property_description' column
df_toleter_new["property_description"] = df_toleter['Description']

# Add 'MainFeatures' as the 'property_description' column
df_toleter_new["property_overview"] = df_toleter['MainFeatures']

# Add the URL column as it is
df_toleter_new["property_url"] = df_toleter["url"]

# Extract the 'purpose' info from the 'Status' column
df_toleter_new["purpose"] = np.where(df_toleter["Status"].str.contains("Rent"), "Rent", 
                                     np.where(df_toleter["Status"].str.contains("Sale"), "Sale", "NA"))

In [703]:
# Sanity checks
print(df_toleter_new[["property_description", "property_overview", 'property_url', "purpose"]])
print()
print(df_toleter_new["purpose"].value_counts())


                                  property_description  \
0    Short-Term Rentals are available with Kitchen,...   
1    Short-Term Rentals with Kitchen, TV, Refrigera...   
2    Two Room Furnished Serviced Apartment RENT in ...   
3    A whole new building in the Dewanpara intersec...   
4    Two Room Furnished Serviced Apartment RENT in ...   
..                                                 ...   
253  The pleasant environment will be rented on the...   
254  Well-to-do living and easy movement! If you ar...   
255  Well-to-do living and easy movement! If you ar...   
256  House # 03, Rod # 4/1, Ward No # 37, Negar, Se...   
257  The new building will be rented from April 202...   

                                     property_overview  \
0    Built in Year : 2020,Parking Spaces,Electricit...   
1    Built in Year : 2020,Parking Spaces,Electricit...   
2    Built in Year : 2020,Size 375 Sq. Meter,Floors...   
3    Floors: 3,Parking Spaces,Beautiful View,Balcon...   
4    Built in

### Address (city and address columns):

In [704]:
# 
# @Ekoue LOGOSU-TEKO:
# 
# Hello @channel.
# @Shariar Hossain Omee
#  has created a function to split location into relevant parts. 
# It can be found here: 
# https://github.com/OmdenaAI/dhaka-bangladesh-real-estate-recommendation/blob/main/src/tasks/task-2-data-preprocessing/functions/address_extractor.py
# 
# The code return a dictionary having the following keys: City, Area, Address. They are to become the following columns in the cleaned dataset:
# City -> city
# Area -> locality
# Address -> address

# NOTE: The location column doesn't have a city name, so we need to extract it from 'Description' column instead
# of using @Shariar Hossain Omee's get_detailed_address() function. But we can use '@Shariar Hossain Omee' for getting
# the 'locality' info from 'Location' column


# Extract the 'city' info from the 'Description' column in the raw data

# NOTES:
# 'Chattogram' is district and also a city
# 'Chattogram' is also called 'Chittagong'

# Define a function to check for the presence of words in a string
def find_word_in_string(string, word_list):
    for i, word in enumerate(word_list):
        if word in string:
            return word
    return 'NA'

# Create a list of cities in Bangladesh
cities = ['Bandar', 'Barishal', 'Bhairab', 'Bogura', 'Brahmanbaria', 'Chandpur', 'Chattogram', 'Chittagong', 
          'Chowmuhani', 'Chuadanga', "Cox's Bazar", 'Cumilla', 'Cumilla Sadar Dakshin', 'Dhaka', 
          'Dinajpur', 'Faridpur', 'Feni', 'Gazipur', 'Jamalpur', 'Jashore', 'Jhenaidah', 'Kaliakair', 
          'Khulna', 'Kishoreganj', 'Kushtia', 'Maijdee', 'Mymensingh', 'Naogaon', 'Narayanganj', 
          'Narsingdi', 'Nawabganj', 'Pabna', 'Rajshahi', 'Rangpur', 'Saidpur', 'Satkhira', 'Savar', 
          'Siddhirganj', 'Sirajganj', 'Sreepur', 'Sylhet', 'Tangail', 'Tarabo', 'Tongi']


df_toleter_new['city'] = df_toleter['Description'].apply(find_word_in_string, args=(cities,))


In [705]:
print(df_toleter_new['city'].value_counts())


NA           218
Dhaka         16
Khulna        13
Rajshahi       5
Rangpur        3
Jhenaidah      2
Pabna          1
Name: city, dtype: int64


In [706]:
# use get_detailed_address() getting the locality and address info
locality = []
address = []

# iterate through the 'name' column
for index, desc in df_toleter.iterrows():
    detailed_address = get_detailed_address(desc['Location'])
    locality.append(detailed_address['area'])
    address.append(detailed_address['address'])


In [707]:
# Add the locality and address info
df_toleter_new['locality'] = locality
df_toleter_new['address'] = address

In [708]:
# Sanity checks
print("-- Dtypes --")

print(df_toleter_new.dtypes)

print()

print(df_toleter_new.describe(include='all').T)



-- Dtypes --
area                    float64
building_type            object
building_nature          object
num_bath_rooms          float64
num_bed_rooms           float64
price                   float64
property_description     object
property_overview        object
property_url             object
purpose                  object
city                     object
locality                 object
address                  object
dtype: object

                      count unique  \
area                  169.0    NaN   
building_type           258      1   
building_nature         258      1   
num_bath_rooms        258.0    NaN   
num_bed_rooms         258.0    NaN   
price                 258.0    NaN   
property_description    258    163   
property_overview       224    122   
property_url            258    258   
purpose                 258      2   
city                    258      7   
locality                258      5   
address                 258    100   

                       

In [709]:
df_toleter_new.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,256,257
area,,,4035.0,,4035.0,4035.0,,8608.0,4035.0,,...,,,,,112980.0,,16140.0,16140.0,,
building_type,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,...,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment,Apartment
building_nature,,,,,,,,,,,...,,,,,,,,,,
num_bath_rooms,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,...,1.0,2.0,1.0,2.0,2.0,1.0,3.0,3.0,3.0,0.0
num_bed_rooms,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,...,1.0,2.0,3.0,2.0,3.0,2.0,5.0,5.0,3.0,2.0
price,31000.0,36000.0,36000.0,9000.0,36000.0,36000.0,48000.0,48000.0,36000.0,31000.0,...,3800.0,7000.0,12000.0,8000.0,19000.0,7500.0,5000.0,5000.0,16000.0,9500.0
property_description,"Short-Term Rentals are available with Kitchen,...","Short-Term Rentals with Kitchen, TV, Refrigera...",Two Room Furnished Serviced Apartment RENT in ...,A whole new building in the Dewanpara intersec...,Two Room Furnished Serviced Apartment RENT in ...,Two Room Furnished Serviced Apartment RENT in ...,Fully Furnished Two Bedroom Serviced Apartment...,Fully Furnished Two Bedroom Serviced Apartment...,Two Room Furnished Serviced Apartment RENT in ...,1 Bedroom Furnished Serviced Apartments for Re...,...,Flat will be rented on the bottom floor of the...,"From this month /April, the third lock of the ...",The house will be rented (on the 2nd floor of ...,Boyra Aziza More is a little distance from the...,Very good ☺️ condition flat. Apartment brand new,The pleasant environment will be rented on the...,Well-to-do living and easy movement! If you ar...,Well-to-do living and easy movement! If you ar...,"House # 03, Rod # 4/1, Ward No # 37, Negar, Se...",The new building will be rented from April 202...
property_overview,"Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Floors: 3,Parking Spaces,Beautiful View,Balcon...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Parking Spaces,Electricit...","Built in Year : 2020,Size 800 Sq. Meter,Floors...","Built in Year : 2020,Size 375 Sq. Meter,Floors...","Built in Year : 2020,Parking Spaces,Electricit...",...,"Parking Spaces,CCTV Security",,,Parking Spaces,"Built in Year : 2022,Size 10500 Sq. Meter,Floo...",Beautiful View,"Built in Year : 2015,Size 1500 Sq. Meter,Floor...","Built in Year : 2015,Size 1500 Sq. Meter,Floor...","Parking Spaces,Balcony,Elevator,Others Main Fe...",
property_url,https://www.toleter.com/property/bd1683/,https://www.toleter.com/property/bd1635/,https://www.toleter.com/property/bd1650/,https://www.toleter.com/property/bd1639/,https://www.toleter.com/property/bd1640/,https://www.toleter.com/property/bd1637/,https://www.toleter.com/property/bd1645/,https://www.toleter.com/property/bd1652/,https://www.toleter.com/property/bd1656/,https://www.toleter.com/property/bd1672/,...,https://www.toleter.com/property/bd293/,https://www.toleter.com/property/bd291/,https://www.toleter.com/property/bd297/,https://www.toleter.com/property/bd256/,https://www.toleter.com/property/bd299/,https://www.toleter.com/property/bd222/,https://www.toleter.com/property/bd101/,https://www.toleter.com/property/bd102/,https://www.toleter.com/property/bd244/,https://www.toleter.com/property/bd254/
purpose,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Rent,...,Rent,Rent,Rent,Rent,Rent,Rent,Rent,Sale,Rent,Rent


### Create '-amenity' columns using 'OtherFacilities' column:

In [710]:
df_toleter['OtherFacilities'].value_counts()

Maintenance Staff,Security Staff                            119
Maintenance Staff,Security Staff,Facilities for Disabled     44
Maintenance Staff                                             6
Security Staff                                                5
Maintenance Staff,Facilities for Disabled                     1
Name: OtherFacilities, dtype: int64

#### The only possible 'amenities' on _toleter_ seem to be: 
- maintenance-staff
- security-staff
- facilities-for-disabled

In [711]:
# Create the three amenities columns

df_toleter_new['maintenance-staff-amenity'] = np.where(df_toleter["OtherFacilities"].str.contains("Maintenance Staff") & df_toleter["OtherFacilities"].notna(), "yes", "NA")

df_toleter_new['security-staff-amenity'] = np.where(df_toleter["OtherFacilities"].str.contains("Security Staff") & df_toleter["OtherFacilities"].notna(), "yes", "NA")

df_toleter_new['facilities-for-disabled-amenity'] = np.where(df_toleter["OtherFacilities"].str.contains("Facilities for Disabled") & df_toleter["OtherFacilities"].notna(), "yes", "NA")

In [712]:
# Sanity checks
print(df_toleter_new['maintenance-staff-amenity'].value_counts())
print()

print(df_toleter_new['security-staff-amenity'].value_counts())
print()

print(df_toleter_new['facilities-for-disabled-amenity'].value_counts())

yes    170
NA      88
Name: maintenance-staff-amenity, dtype: int64

yes    168
NA      90
Name: security-staff-amenity, dtype: int64

NA     213
yes     45
Name: facilities-for-disabled-amenity, dtype: int64


## Final checks and save the CSV:

In [713]:
df_toleter_new.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area,169.0,,,,11232.293964,30493.174477,1076.0,4035.0,8608.0,9415.0,387360.0
building_type,258.0,1.0,Apartment,258.0,,,,,,,
building_nature,258.0,1.0,,258.0,,,,,,,
num_bath_rooms,258.0,,,,1.620155,0.66243,0.0,1.0,2.0,2.0,4.0
num_bed_rooms,258.0,,,,1.810078,0.752929,0.0,1.0,2.0,2.0,5.0
price,258.0,,,,25994.810078,24095.550806,0.0,5000.0,30000.0,48000.0,170000.0
property_description,258.0,163.0,Two Room Furnished Serviced Apartment RENT in ...,14.0,,,,,,,
property_overview,224.0,122.0,"Built in Year : 2022,Size 800 Sq. Meter,Floors...",11.0,,,,,,,
property_url,258.0,258.0,https://www.toleter.com/property/bd1683/,1.0,,,,,,,
purpose,258.0,2.0,Rent,254.0,,,,,,,


In [714]:
df_toleter_new.dtypes.T

area                               float64
building_type                       object
building_nature                     object
num_bath_rooms                     float64
num_bed_rooms                      float64
price                              float64
property_description                object
property_overview                   object
property_url                        object
purpose                             object
city                                object
locality                            object
address                             object
maintenance-staff-amenity           object
security-staff-amenity              object
facilities-for-disabled-amenity     object
dtype: object

In [715]:
df_toleter_new.to_csv(f"{cleaned_data_folder}/Farjana_toleter-cleaned-Umesh.csv", index=False)

<!-- <br>

# <font color='red'>** NEXT =========================></font>

<br> -->