In [302]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")

import pandas as pd
import json
import numpy as np
import ast
from utils.dummies import dummify_columns

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [303]:
# Read the project configuration. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform's default locale encoding.
with open("../config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# The file is only needed for parsing; everything below works on the parsed dict.

# Give every per-source building-type mapping a None -> "Other" entry.
building_type_mapping = config["building_type_mapping"]
for source_mapping in building_type_mapping.values():
    source_mapping[None] = "Other"

# NOTE: the name `commong_features` (sic) is kept because later cells reference it.
commong_features = set(config["COMMON_FEATURES"])
feature_mapping = config["feature_mapping"]["bnakaran"]
condition_mapping = config["condition_type_mapping"]["bnakaran"]

### Basic Cleaning

In [304]:
# Load the raw scrape and discard columns that contain no values at all.
bnakaran = pd.read_csv("../scraping_results/data/bnakaran_apartments.csv")
bnakaran = bnakaran.dropna(axis=1, how="all")

# These two columns are stored as text in the CSV; parse them back into Python objects.
for literal_column in ("room_details", "additional_features"):
    bnakaran[literal_column] = bnakaran[literal_column].apply(ast.literal_eval)

# Flag listings whose URL contains "new-building".
# na=False makes a missing URL count as "not a new building"; without it
# str.contains yields NaN, and NaN is truthy, so the row would be flagged as 1.
bnakaran["new_building"] = (
    bnakaran["webpage"]
    .str.contains("new-building", regex=False, na=False)
    .astype(int)
)

# Translate source-specific labels to the shared vocabulary; anything the
# mapping does not cover (including missing values) becomes "Other".
bnakaran["building_type"] = (
    bnakaran["construction_type"]
    .map(building_type_mapping["bnakaran"])
    .fillna("Other")
)

bnakaran["condition"] = (
    bnakaran["renovation"]
    .map(condition_mapping)
    .fillna("Other")
)

# The raw columns have been replaced by their normalised counterparts above;
# "utilities" is dropped without being used in this cell.
bnakaran = bnakaran.drop(columns=["webpage", "renovation", "construction_type", "utilities"])
bnakaran.head(2)

Unnamed: 0,source,id,price,area,rooms,floor,storeys,building_type,added_in_date,additional_features,...,room_details,longitude,flooring,entrance_door,windows,heating,parking,cooling,new_building,condition
0,bnakaran,d108949,117000.0,84,3,4,5,Stone,30.11.2023,"[new wiring, persistent water, new water tubes...",...,"{'rooms': '3', 'bedrooms': '2', 'bathrooms': '...",40.208844,laminate,metal,plastic,gas boiler,,,0,Good Condition
1,bnakaran,d117911,75000.0,67,2,10,10,Other,01.12.2023,"[electricity, metal door, built-in wardrobes, ...",...,"{'rooms': '2', 'bedrooms': '1', 'kitchens': '1...",40.17787,,metal,plastic,gas boiler,,,0,Good Condition


In [305]:
# Pull the number of bathrooms out of each listing's room_details dict
# (0 when the key is absent), then replace the dict column with the count.
bathroom_counts = [
    int(room_info.get("bathrooms", 0))
    for room_info in bnakaran["room_details"]
]

bnakaran = bnakaran.drop(columns=["room_details"])
bnakaran["bathroom_count"] = bathroom_counts

In [306]:
# For every listing: translate the raw feature names through feature_mapping
# (names without a mapping are skipped) and keep only translated names that
# belong to the common feature set.
f_feature_list = [
    [
        feature_mapping[raw_feature]
        for raw_feature in raw_features
        if raw_feature in feature_mapping
        and feature_mapping[raw_feature] in commong_features
    ]
    for raw_features in bnakaran["additional_features"].to_list()
]
bnakaran["additional_features"] = f_feature_list

# dummify_columns comes from utils.dummies; presumably it expands the list column
# into one indicator column per common feature (the F_* columns) — confirm there.
bnakaran = dummify_columns(bnakaran, ["additional_features"], commong_features)

In [307]:
# Merge the structured "cooling"/"heating" columns into the feature flags:
# a flag is 1 when either the structured column has a value or the flag
# derived from additional_features is already 1.

# Air Conditioner
# notna(), not isna(): a filled-in "cooling" value is what signals presence.
# The previous isna() check set the flag for exactly the listings with no cooling info.
ac_present = bnakaran["cooling"].notna().astype(int)
bnakaran["F_Air Conditioner"] = (ac_present + bnakaran["F_Air Conditioner"]).clip(upper=1)

# Heating
# Same rule: e.g. a "gas boiler" entry must yield 1, an empty cell must not.
heating_present = bnakaran["heating"].notna().astype(int)
bnakaran["F_Heating System"] = (heating_present + bnakaran["F_Heating System"]).clip(upper=1)

# The raw columns are fully represented by the flags now.
bnakaran = bnakaran.drop(columns=["cooling", "heating"])

### Drop unused columns and save

In [308]:
# Columns that are not carried over into the processed dataset.
unused_columns = [
    "windows",
    "entrance_door",
    "parking",
    "flooring",
    "added_in_date",
    "visit_count",
]
bnakaran = bnakaran.drop(columns=unused_columns)

In [314]:
# Persist the cleaned listings.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# confirm whether downstream readers expect it before switching to index=False.
processed_path = "../processed_data/bnakaran.csv"
bnakaran.to_csv(processed_path)

In [313]:
# Print the final frame's columns, non-null counts and dtypes as a sanity check.
bnakaran.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247 entries, 0 to 1246
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   source             1247 non-null   object 
 1   id                 1247 non-null   object 
 2   price              1247 non-null   float64
 3   area               1247 non-null   int64  
 4   rooms              1247 non-null   int64  
 5   floor              1247 non-null   int64  
 6   storeys            1247 non-null   int64  
 7   building_type      1247 non-null   object 
 8   latitude           1247 non-null   float64
 9   longitude          1247 non-null   float64
 10  new_building       1247 non-null   int64  
 11  condition          1247 non-null   object 
 12  bathroom_count     1247 non-null   int64  
 13  F_Security         1247 non-null   int64  
 14  F_Elevator         1247 non-null   int64  
 15  F_Heating System   1247 non-null   int64  
 16  F_Internet         1247 