# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20, 10)

# Loading the datasets

In [2]:
# Load the listings CSV (path is relative to the current working directory)
# and preview the first rows.
df = pd.read_csv("99aana.csv")
df.head()

Unnamed: 0,title,price,area,location,district,floor,room,bedroom,bathroom,livingroom,kitchen,parking,link
0,"House for Sale at Sitapaila, Kathmandu",25000000,sq m0-5-0-0,"Sitapaila, Kathmandu",Kathmandu,3.0,12.0,,3.0,,,1.0,https://99aana.com/property/house-for-sale-at-...
1,"House for sale at Lubhu, Lalitpur",14500000,sq m0-3-0-0,"Lubhu, Lalitpur",Lalitpur,2.5,7.0,4.0,3.0,,2.0,1.0,https://99aana.com/property/house-for-sale-at-...
2,"House for sale at Gothatar, Kathmandu",24400000,sq m0-4-0-0,"Gothatar, Kathmandu",Kathmandu,2.5,,,,,,1.0,https://99aana.com/property/house-for-sale-at-...
3,"House for sale at TiKathali, Lalitpur",12500000,sq m0-3-0-0,"TiKathali, Lalitpur",Lalitpur,1.5,,,,,,1.0,https://99aana.com/property/house-for-sale-at-...
4,"House for Sale at Kadaghari, Kathmandu",13100000,sq m0-3-0-0,"Kadaghari, Kathmandu",Kathmandu,2.5,,,,,,1.0,https://99aana.com/property/house-for-sale-at-...


In [3]:
# (rows, columns) of the freshly loaded frame
df.shape

(2063, 13)

# Data cleaning

In [4]:
# Remove the "sq m" marker: keep the text right after its first occurrence
# (up to the next marker, if there is one).
# NOTE(review): a value without "sq m" raises IndexError here, and a missing
# (NaN) value raises AttributeError because floats have no .split().
df['area'] = df['area'].apply(lambda x: x.split('sq m')[1])
# NOTE(review): this is not the last expression of the cell, so the preview is not displayed.
df.head()

# Controls the max number of rows displayed when printing a DataFrame (None = no limit)
pd.set_option('display.max_rows', None)

In [5]:
# Land-area unit names: dhur (also spelled dhir), paisa, kattha, haat, aana, daam.
# This function handles dashed values, e.g. 1-2-3-4, which are read as ropani-aana-paisa-dam.
def clean_dashedArea(dashedArea):
    """Convert a dashed ``ropani-aana-paisa-dam`` area string to square feet.

    Parameters
    ----------
    dashedArea : str
        Raw area text, e.g. ``"0-5-0-0"``.

    Returns
    -------
    str or None
        * exactly three dashes: the total area in sq. ft., rounded to two
          decimals and rendered as a string;
        * exactly two dashes: ``None`` (the value is discarded);
        * anything else: the input, unchanged.

    Raises
    ------
    ValueError
        If a three-dash value has a part that is not a number.
    """
    dash_total = dashedArea.count('-')
    if dash_total == 2:
        return None
    if dash_total != 3:
        return dashedArea

    # sq. ft. per unit, in ropani, aana, paisa, dam order
    sqft_per_unit = (5476, 342.25, 85.56, 21.39)
    quantities = [float(piece) for piece in dashedArea.split('-')]

    # Accumulate left to right so the floating-point result is reproducible.
    total_sqft = 0
    for factor, quantity in zip(sqft_per_unit, quantities):
        total_sqft = total_sqft + factor * quantity
    return str(round(total_sqft, 2))
# Convert the dashed areas, then drop the rows whose area came back as None
# (clean_dashedArea returns None for two-dash values). Mutates df in place.
df['area'] = df['area'].apply(clean_dashedArea)
df.dropna(subset=['area'], inplace = True)

In [6]:
# clean katha and dhur. This deals with worded values
def clean_katha_and_dhur(values):
    """Convert a worded katha/dhur area to square feet.

    Parameters
    ----------
    values : str or missing value
        Area text such as ``"2 katha 5 dhur"``, ``"1 katha"`` or ``"10 dhur"``.
        Unit names are matched case-insensitively.

    Returns
    -------
    str, or the input unchanged
        When a katha and/or dhur unit is present: the area in sq. ft., rounded
        to two decimals and rendered as a string. Text without either unit is
        returned lower-cased. Non-string input (``None``, ``NaN``) is returned
        untouched instead of failing on ``.lower()``.

    Raises
    ------
    ValueError
        If the text in front of a unit name is not a number.
    """
    sqft_per_katha = 3645    # 1 katha = 3645 sq. ft.
    sqft_per_dhur = 182.25   # 1 dhur = 182.25 sq. ft.

    # Missing values reach pandas as None *or* NaN; pass both through.
    if not isinstance(values, str):
        return values

    # Lower-case so the unit names can be found with a plain substring test.
    values = values.lower()

    if 'katha' in values:
        pieces = values.split('katha')
        area = float(pieces[0]) * sqft_per_katha
        if 'dhur' in values:
            # The dhur count sits between "katha" and "dhur".
            area = area + float(pieces[1].split('dhur')[0]) * sqft_per_dhur
        return str(round(area, 2))

    if 'dhur' in values:
        area = float(values.split('dhur')[0]) * sqft_per_dhur
        return str(round(area, 2))

    return values
    
# Replace worded katha/dhur areas with their sq. ft. value (still stored as strings).
df['area'] = df['area'].apply(clean_katha_and_dhur)

In [7]:
# clean ropani aana paisa dam
# 1 Ropani = 5476 sq. ft.
# 1 Aana = 342.25 sq. ft.
# 1 Paisa = 85.56 sq. ft.
# 1 Dam = 21.39 sq. ft.
# All rows containing dam were removed to simplify data cleaning, so only aana and paisa are handled here.
def clean_RAPD(values):
    """Convert a worded aana/paisa area to square feet.

    Parameters
    ----------
    values : str or missing value
        Area text such as ``"4 aana 2 paisa"`` or ``"5 aana"``. Unit names are
        matched case-insensitively.

    Returns
    -------
    str, or the input unchanged
        When "aana" is present: the area in sq. ft., rounded to two decimals
        and rendered as a string. Text without "aana" is returned lower-cased.
        Non-string input (``None``, ``NaN``) is returned untouched.

    Raises
    ------
    ValueError
        If the text in front of a unit name is not a number.

    Notes
    -----
    A value given in paisa only is not converted; paisa is only handled when
    it follows an aana count.
    """
    sqft_per_aana = 342.25   # 1 aana = 342.25 sq. ft.
    sqft_per_paisa = 85.56   # 1 paisa = 85.56 sq. ft.

    # Missing values have nothing to convert; pass them through.
    if not isinstance(values, str):
        return values

    values = values.lower()
    if 'aana' not in values:
        return values

    pieces = values.split('aana')
    # Multiply the aana count by its conversion factor. The previous code
    # added the two numbers instead, so "5 aana" came out as 347.25 sq. ft.
    area = float(pieces[0]) * sqft_per_aana
    if 'paisa' in values:
        # The paisa count sits between "aana" and "paisa".
        area = area + float(pieces[1].split('paisa')[0]) * sqft_per_paisa
    return str(round(area, 2))

# Replace worded aana/paisa areas with their sq. ft. value (still stored as strings).
df['area'] = df['area'].apply(clean_RAPD)

In [8]:
def clean_price(price):
    """Normalise a listing price to an integer number of rupees.

    Accepted forms (case-insensitive; commas, an "rs" / "rs." prefix and the
    word "negotiable" are ignored):

    * plain digits, e.g. ``"2,50,00,000"``
    * crore and/or lakh wording, e.g. ``"2 crore 45 lakh"``, ``"50 lakh"``,
      ``"2.5 crore"`` (1 crore = 10,000,000 and 1 lakh = 100,000)

    Parameters
    ----------
    price : str, int, float or missing value
        Raw price as read from the data.

    Returns
    -------
    int or None
        The price in rupees, or ``None`` when the value is missing or cannot
        be parsed.
    """
    # Missing values: None, or NaN (the only value that is not equal to itself).
    if price is None or price != price:
        return None
    # Numeric input needs no text clean-up.
    if isinstance(price, (int, float)):
        return int(price)

    text = str(price).lower().replace(',', '')
    # Strip "rs." before "rs" so the dot does not survive and break the parse.
    for noise in ('rs.', 'rs', 'negotiable'):
        text = text.replace(noise, '')
    text = text.strip()

    try:
        if 'crore' not in text and 'lakh' not in text:
            return int(text)

        crore = 0.0
        lakh = 0.0
        rest = text
        if 'crore' in text:                      # like "5 crore" / "2 crore 45 lakh"
            crore_text, rest = text.split('crore', 1)
            crore = float(crore_text)
        if 'lakh' in rest:                       # like "50 lakh"
            lakh = float(rest.split('lakh', 1)[0])
        # float() accepts fractional amounts ("2.5 crore"); round back to whole rupees.
        return round(crore * 10000000 + lakh * 100000)
    except ValueError:
        # Unparseable text is treated as a missing price instead of aborting the run.
        return None

# Normalise every price; values clean_price cannot parse come back as None.
df['price'] = df['price'].apply(clean_price)

In [9]:
# Preview the frame after the area and price cleaning steps.
df.head()

Unnamed: 0,title,price,area,location,district,floor,room,bedroom,bathroom,livingroom,kitchen,parking,link
0,"House for Sale at Sitapaila, Kathmandu",25000000.0,1711.25,"Sitapaila, Kathmandu",Kathmandu,3.0,12.0,,3.0,,,1.0,https://99aana.com/property/house-for-sale-at-...
1,"House for sale at Lubhu, Lalitpur",14500000.0,1026.75,"Lubhu, Lalitpur",Lalitpur,2.5,7.0,4.0,3.0,,2.0,1.0,https://99aana.com/property/house-for-sale-at-...
2,"House for sale at Gothatar, Kathmandu",24400000.0,1369.0,"Gothatar, Kathmandu",Kathmandu,2.5,,,,,,1.0,https://99aana.com/property/house-for-sale-at-...
3,"House for sale at TiKathali, Lalitpur",12500000.0,1026.75,"TiKathali, Lalitpur",Lalitpur,1.5,,,,,,1.0,https://99aana.com/property/house-for-sale-at-...
4,"House for Sale at Kadaghari, Kathmandu",13100000.0,1026.75,"Kadaghari, Kathmandu",Kathmandu,2.5,,,,,,1.0,https://99aana.com/property/house-for-sale-at-...


# Train test split

# Training and predicting

# Evaluation