In [125]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
import pandas as pd
import json
import numpy as np
import ast
from utils.dummies import dummify_columns

In [127]:
# Parse the project configuration; the file handle is only needed for the JSON decode.
with open("../config.json", 'r') as config_file:
    config = json.load(config_file)

# COMMON_FEATURES is held as a set; the other two entries are the myrealty-specific
# sub-sections of their respective mapping tables.
commong_features = set(config["COMMON_FEATURES"])
feature_mapping = config["feature_mapping"]["myrealty"]
condition_mapping = config["condition_type_mapping"]["myrealty"]

In [128]:
# Load the scraped myrealty listings and discard columns that contain no data at all.
myrealty = pd.read_csv("../scraping_results/data/myrealty_apartments.csv")
myrealty = myrealty.dropna(axis=1, how="all")

# new_building: 1 when the listing URL contains "new-construction", else 0.
# na=False makes a missing URL count as 0; without it str.contains yields NaN, and a
# truthiness check on NaN evaluates to True, which would flag the listing as new.
# regex=False because the marker is a literal substring, not a pattern.
myrealty["new_building"] = (
    myrealty["webpage"]
    .str.contains("new-construction", na=False, regex=False)
    .astype("int64")
)

# bathroom_count: strip every non-digit character from string values, then cast to int.
myrealty["bathroom_count"] = (
    myrealty["bathroom_count"]
    .replace(to_replace=r'\D', value='', regex=True)
    .astype(int)
)

# Condition: translate raw labels through condition_mapping
# (Series.map leaves NaN for any label missing from the mapping).
myrealty["condition"] = myrealty["condition"].map(condition_mapping)

# facilities is read from the CSV as text; literal_eval parses each cell into a Python
# literal without executing arbitrary code.
myrealty['facilities'] = myrealty['facilities'].apply(ast.literal_eval)

# Drop columns that are not used past this point in the notebook.
myrealty = myrealty.drop(columns=[
    "webpage",
    "view_count",
    "added_in_date",
    "ceiling_height",
])

myrealty.head(2)

Unnamed: 0,source,id,price,facilities,location,area,rooms,floor,storeys,building_type,condition,bathroom_count,new_building
0,myrealty,142711,87000,"[Heating, Internet, Hot water, Gas, Water, wat...","Yerevan, Achapnyak, Mazmanyan St",60,2,1,5,Stone,Good Condition,1,0
1,myrealty,142709,76500,"[Heating, Hot water, Electricity, Gas, Water, ...","Yerevan, Achapnyak, Bashinjaxyan St",48,2,4,5,Stone,Good Condition,1,0


In [129]:
# For every listing: translate each raw facility label through feature_mapping
# (labels without a mapping are skipped), then keep only the translated names
# that belong to commong_features.
f_feature_list = [
    [
        mapped
        for mapped in (feature_mapping[raw] for raw in raw_features if raw in feature_mapping)
        if mapped in commong_features
    ]
    for raw_features in myrealty["facilities"].to_list()
]

myrealty["facilities"] = f_feature_list

# NOTE(review): judging by the info() output further down, dummify_columns replaces
# "facilities" with F_<feature> indicator columns — confirm in utils.dummies.
myrealty = dummify_columns(myrealty, ["facilities"], commong_features)

In [130]:
# Print column dtypes and non-null counts to check the frame after the facilities step.
myrealty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017 entries, 0 to 1016
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   source             1017 non-null   object
 1   id                 1017 non-null   int64 
 2   price              1017 non-null   int64 
 3   location           1017 non-null   object
 4   area               1017 non-null   int64 
 5   rooms              1017 non-null   int64 
 6   floor              1017 non-null   int64 
 7   storeys            1017 non-null   int64 
 8   building_type      1017 non-null   object
 9   condition          1017 non-null   object
 10  bathroom_count     1017 non-null   int64 
 11  new_building       1017 non-null   int64 
 12  F_Elevator         1017 non-null   int64 
 13  F_Furniture        1017 non-null   int64 
 14  F_Heating System   1017 non-null   int64 
 15  F_Security         1017 non-null   int64 
 16  F_Internet         1017 non-null   int64 


# Converting the location into latitude & longitude

In [135]:
from Services import GeoService, MapFeatureAggregator, AddressToCoordinateConverter
from utils.formatting import myrealty_format_address

In [136]:
# Normalise the address strings first, then look each one up with the converter
# built from the streets file.
# NOTE(review): re-running this cell formats already-formatted addresses a second
# time — confirm myrealty_format_address tolerates that.
formatted_locations = myrealty["location"].apply(myrealty_format_address)
myrealty["location"] = formatted_locations

converter = AddressToCoordinateConverter("../Data Manipulation | Notebooks/streets.csv")
myrealty["coordinates"] = formatted_locations.apply(converter.convert)

In [147]:
# Split each coordinates entry into two scalar columns, then drop the columns they
# were derived from. Element 1 becomes latitude and element 0 becomes longitude.
# NOTE(review): presumably the converter returns (longitude, latitude) — confirm.
myrealty = (
    myrealty
    .assign(
        latitude=lambda frame: frame["coordinates"].apply(lambda point: point[1]),
        longitude=lambda frame: frame["coordinates"].apply(lambda point: point[0]),
    )
    .drop(columns=["coordinates", "location"])
)

In [152]:
# Persist the processed listings. With pandas' default index=True the row index is
# written as the first, unnamed CSV column.
myrealty.to_csv(path_or_buf="../processed_data/myrealty.csv")

In [151]:
# Print column dtypes and non-null counts of the processed frame.
# NOTE(review): this cell's execution count (151) is lower than the save cell's (152)
# even though it sits below it — re-run the notebook top to bottom to confirm the order.
myrealty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017 entries, 0 to 1016
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   source             1017 non-null   object 
 1   id                 1017 non-null   int64  
 2   price              1017 non-null   int64  
 3   area               1017 non-null   int64  
 4   rooms              1017 non-null   int64  
 5   floor              1017 non-null   int64  
 6   storeys            1017 non-null   int64  
 7   building_type      1017 non-null   object 
 8   condition          1017 non-null   object 
 9   bathroom_count     1017 non-null   int64  
 10  new_building       1017 non-null   int64  
 11  F_Elevator         1017 non-null   int64  
 12  F_Furniture        1017 non-null   int64  
 13  F_Heating System   1017 non-null   int64  
 14  F_Security         1017 non-null   int64  
 15  F_Internet         1017 non-null   int64  
 16  F_Balcony          1017 