In [157]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")

import pandas as pd
import json
import numpy as np
import re
import ast
from utils.dummies import dummify_columns

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [158]:
with open("../config.json", 'r') as file:
    config = json.load(file)
    building_type_mapping = config["building_type_mapping"]
    for key in building_type_mapping:
        building_type_mapping[key][None] = "Other"
        
    commong_features = set(config["COMMON_FEATURES"])
    feature_mapping = config["feature_mapping"]["bars"]
    condition_mapping = config["condition_type_mapping"]["bars"]

### Basic Cleaning

In [159]:
bars = pd.read_csv("../scraping_results/data/bars_apartments.csv")
bars = bars.dropna(axis = 1, how="all") # remove all NaN columns 
bars['facilities'] = bars['facilities'].apply(ast.literal_eval)

new_buildings = (bars["building_type"] == "New building") & (bars["condition"] == "Without renovation")
new_buildings = new_buildings.apply(lambda x: 1 if x else 0)
bars["new_building"] = new_buildings

bars["building_type"] = bars["building_type"]\
    .map(building_type_mapping["bars"])\
    .fillna("Other")

bars["condition"] = bars["condition"]\
    .map(condition_mapping)\
    .fillna("Other")

bars = bars.drop(columns = ["webpage", "ceiling_height"])
bars.head(3)

Unnamed: 0,source,id,price,facilities,location,area,rooms,floor,storeys,building_type,condition,bathroom_count,bedrooms,new_building
0,bars,4-3-1430,130000,[],Yerevan/Arabkir/Hambardzumyan Street,60.5,2.0,3.0,13.0,Other,Needs Renovation/Repair,1.0,1.0,1
1,bars,4-3-1430.1,215000,[],Yerevan/Arabkir/Hambardzumyan Street,96.1,3.0,4.0,13.0,Other,Needs Renovation/Repair,2.0,2.0,1
2,bars,1-N-432,560000,"[First line, Building service, Building securi...",Yerevan/Small Center/Sayat-Nova Avenue,156.4,4.0,9.0,17.0,Other,Good Condition,2.0,3.0,0


### Dummifying Facilities

In [160]:
f_feature_list = []
for feature_list in bars["facilities"].to_list():
    feature_list = [feature_mapping[feature] for feature in feature_list if feature in feature_mapping]
    feature_list = [feature for feature in feature_list if feature in commong_features]
    f_feature_list.append(feature_list)

bars["facilities"] = f_feature_list
bars = dummify_columns(bars, ["facilities"], commong_features)

In [161]:
bars = bars.drop(columns = ["bedrooms"])
bars = bars.dropna()

In [162]:
bars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 538 entries, 0 to 552
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   source             538 non-null    object 
 1   id                 538 non-null    object 
 2   price              538 non-null    int64  
 3   location           538 non-null    object 
 4   area               538 non-null    float64
 5   rooms              538 non-null    float64
 6   floor              538 non-null    float64
 7   storeys            538 non-null    float64
 8   building_type      538 non-null    object 
 9   condition          538 non-null    object 
 10  bathroom_count     538 non-null    float64
 11  new_building       538 non-null    int64  
 12  F_Furniture        538 non-null    int64  
 13  F_Security         538 non-null    int64  
 14  F_Air Conditioner  538 non-null    int64  
 15  F_Balcony          538 non-null    int64  
 16  F_Heating System   538 non

In [170]:
bars["location"] = bars["location"]\
    .apply(lambda text: re.sub(r"\(.*?\)", "", text).strip())\
    .apply(lambda x: x.split("/")[-1])

In [172]:
from Services import GeoService, MapFeatureAggregator, AddressToCoordinateConverter

In [173]:
bars['location'] = bars['location'].apply(myrealty_format_address)
converter = AddressToCoordinateConverter("../Data Manipulation | Notebooks/streets.csv")
bars["coordinates"] = bars["location"].apply(converter.convert)

In [175]:
bars["latitude"] = bars["coordinates"].apply(lambda x: x[1])
bars["longitude"] = bars["coordinates"].apply(lambda x: x[0])
bars = bars.drop(columns = ["coordinates", "location"])

KeyError: 'coordinates'

In [178]:
bars.to_csv("../processed_data/bars.csv")

In [177]:
bars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 538 entries, 0 to 552
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   source             538 non-null    object 
 1   id                 538 non-null    object 
 2   price              538 non-null    int64  
 3   area               538 non-null    float64
 4   rooms              538 non-null    float64
 5   floor              538 non-null    float64
 6   storeys            538 non-null    float64
 7   building_type      538 non-null    object 
 8   condition          538 non-null    object 
 9   bathroom_count     538 non-null    float64
 10  new_building       538 non-null    int64  
 11  F_Furniture        538 non-null    int64  
 12  F_Security         538 non-null    int64  
 13  F_Air Conditioner  538 non-null    int64  
 14  F_Balcony          538 non-null    int64  
 15  F_Heating System   538 non-null    int64  
 16  F_Internet         538 non