In [8]:
import ast
import pandas as pd
import numpy as np
# Notebook-wide display settings: never truncate the column list of wide
# frames, and render floats in fixed-point notation with 8 decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.8f}'.format

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [10]:
def create_dummies(row, unique_features):
    """Build a 0/1 membership flag for every entry of ``unique_features``.

    Args:
        row: Container tested with ``in`` (a list of labels in this notebook).
        unique_features: Iterable of labels; each one becomes a key of the
            result, in iteration order.

    Returns:
        dict mapping each label to 1 when it is found in ``row``, else 0.
    """
    flags = {}
    for feature in unique_features:
        flags[feature] = int(feature in row)
    return flags

def dummify_column(df, col_name, unique_features):
    """Replace ``col_name`` with indicator columns whose names start with ``"F_"``.

    Two encodings are used, chosen from the column's content:

    * every cell is a ``list``: one 0/1 column per entry of
      ``unique_features``, set to 1 when that entry is in the cell's list;
    * anything else: ``pd.get_dummies`` on the column (``unique_features``
      is ignored in this case).

    The caller's frame is not modified.

    Args:
        df: Source DataFrame.
        col_name: Name of the column to encode.
        unique_features: Labels to create columns for when the column holds
            lists.

    Returns:
        A new DataFrame: ``df`` without ``col_name``, followed by the
        indicator columns.

    Note:
        Every encoded column shares the ``"F_"`` prefix, so encoding two
        columns that contain the same label yields duplicate column names.
    """
    column = df[col_name]

    if column.apply(lambda x: isinstance(x, list)).all():
        # Build the indicator frame in a single constructor call instead of
        # expanding one pd.Series per row, which is very slow on large frames.
        # dict.fromkeys keeps iteration order and removes duplicate labels,
        # matching the keys produced by create_dummies.
        feature_order = list(dict.fromkeys(unique_features))
        dummies = pd.DataFrame(
            [create_dummies(cell, unique_features) for cell in column],
            index=df.index,
            columns=feature_order,
        )
    else:
        # Plain categorical column: standard one-hot encoding.
        dummies = pd.get_dummies(column)

    # str() keeps this working for non-string category labels (e.g. a numeric
    # bathroom count), where '"F_" + label' would raise a TypeError.
    dummies.columns = ["F_" + str(col) for col in dummies.columns]

    # Drop the source column first, then append the indicators on the right.
    return pd.concat([df.drop(columns=[col_name]), dummies], axis=1)

def dummify_columns(df, cols, unique_features):
    """Apply ``dummify_column`` to each name in ``cols``, one after another.

    Args:
        df: Source DataFrame.
        cols: Column names to encode, processed in the given order.
        unique_features: Passed unchanged to every ``dummify_column`` call.

    Returns:
        The frame produced by the last encoding step. When ``cols`` is empty
        this is ``df`` itself, not a copy.
    """
    encoded = df
    for name in cols:
        # Each step consumes the previous step's output.
        encoded = dummify_column(encoded, name, unique_features)
    return encoded

In [11]:
def feature_importance(data, y_col, n_estimators=100, random_state=42):
    """Rank the columns of ``data`` by random-forest feature importance.

    A ``RandomForestRegressor`` is fitted on every column except ``y_col``
    to predict ``y_col``, and its ``feature_importances_`` are reported.

    Args:
        data: DataFrame holding the target and all feature columns.
        y_col: Name of the target column.
        n_estimators: Number of trees in the forest.
        random_state: Seed passed to the forest so repeated runs give the same
            ranking. Pass ``None`` for an unseeded forest.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        ``importance`` in descending order.
    """
    X = data.drop(columns=y_col)
    y = data[y_col]

    # Seeding the forest keeps the ranking stable across notebook re-runs.
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)

    ranking = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_,
    })
    return ranking.sort_values(by='importance', ascending=False)

def plot_feature_importance(feature_importances):
    """Draw a horizontal bar chart of feature importances and show it.

    Args:
        feature_importances: DataFrame with a ``feature`` column (bar labels)
            and an ``importance`` column (bar lengths).
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.barh(feature_importances['feature'], feature_importances['importance'])
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importance')
    plt.show()

In [12]:
# CSV file of scraped apartment listings for each source, keyed by source name.
file_paths = {
    source: f"../scraping_results/data/{source}_apartments.csv"
    for source in ("bars", "myrealty", "bnakaran")
}

## Bars

In [30]:
# Load the Bars listings, discard columns that contain no data at all, and
# parse the stringified facility lists into real Python lists.
bars = pd.read_csv(file_paths["bars"]).dropna(axis=1, how='all')
bars['facilities'] = bars['facilities'].map(ast.literal_eval)

# Every distinct facility label that occurs in any listing.
bars_unique_features = {item for sublist in bars['facilities'] for item in sublist}

# Encode the list/categorical columns, then remove the identifier-like columns.
bars_dummy = dummify_columns(
    bars,
    ["facilities", "building_type", "condition"],
    bars_unique_features,
).drop(columns=["source", "id", "location"])

# Missing ceiling heights get the median over all rows; this is computed
# before any rows are removed below.
bars_dummy["ceiling_height"] = bars_dummy["ceiling_height"].fillna(
    bars_dummy["ceiling_height"].median()
)

# dropna() without a subset removes rows missing *any* value, which already
# includes rows missing 'area' or 'price'.
bars_dummy = bars_dummy.dropna()
#feature_importance(bars_dummy, "price")

In [42]:
# Display the distinct facility labels collected from the Bars listings.
bars_unique_features

{'Air conditioning',
 'Building security',
 'Building service',
 'Built-in furniture',
 'Closed balcony',
 'Concierge',
 'Cooling system',
 'Dishwasher',
 'Electric stove',
 'Elevator',
 'Exclusive design',
 'Fireplace',
 'First line',
 'French balcony',
 'Fridge',
 'Front balcony',
 'Furniture',
 'Garage',
 'Heating system',
 'Intercom',
 'Internet',
 'Jacuzzi',
 'Kitchen furniture',
 'Kitchen stove',
 'Mansard',
 'Mortgage opportunity',
 'Not lived in',
 'Open balcony',
 'Panorama',
 'Parking',
 'Pool',
 'Superstructure',
 'TV',
 'Terrace',
 'Transport availability',
 'Wardrobe',
 'Washing machine',
 'Wi-Fi'}

# Bars Common Feature Mapping

In [43]:
# Bars facility label -> feature name shared across the three sources.
# NOTE(review): 'Internet' also appears in bars_unique_features but only
# 'Wi-Fi' is mapped to "Internet" here — confirm whether that is intended.
bars_common_features = {
    "Air conditioning": "Air Conditioner",
    "Building security": "Security",
    "Furniture": "Furniture",
    "Open balcony": "Balcony",
    "Elevator": "Elevator",
    "Heating system": "Heating System",
    "Wi-Fi": "Internet",
}

# MyRealty

In [14]:
# Load the MyRealty listings, discard columns that contain no data at all, and
# parse the stringified facility lists into real Python lists.
myrealty = pd.read_csv(file_paths["myrealty"]).dropna(axis=1, how='all')
myrealty['facilities'] = myrealty['facilities'].apply(ast.literal_eval)

# Keep only digits and dots from the ceiling-height text, then convert to a
# number; anything that still fails to parse becomes NaN.
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
myrealty['ceiling_height'] = pd.to_numeric(
    myrealty['ceiling_height'].str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)

In [38]:
# Every distinct facility label that occurs in any MyRealty listing.
myrealty_unique_features = {
    item
    for sublist in myrealty['facilities']
    for item in sublist
}

In [39]:
# Encode the list/categorical columns, remove the identifier-like columns,
# and keep only rows with no missing values.
myrealty_dummy = (
    dummify_columns(
        myrealty,
        ["facilities", "building_type", "condition", "bathroom_count"],
        myrealty_unique_features,
    )
    .drop(columns=["source", "id", "location", "added_in_date"])
    .dropna()
)
#feature_importance(myrealty_dummy, "price")

In [40]:
# Display the distinct facility labels collected from the MyRealty listings.
myrealty_unique_features

{'Air-conditioner',
 'Attic',
 'Balcony',
 'Basement',
 'Bilateral',
 'Building existence',
 'Central heating',
 'Close to the bus station',
 'Electricity',
 'Elevator',
 'Equipment',
 'Euro windows',
 'Fence',
 'Fireplace',
 'Furniture',
 'Garage',
 'Gas',
 'Gate',
 'Grating ',
 'Gym',
 'Heated floor',
 'Heating',
 'High first floor',
 'Hot water',
 'Internet',
 'Iron door',
 'Irrigation',
 'Laminate flooring',
 'Loggia',
 'Open balcony',
 'Park',
 'Parking',
 'Parquet',
 'Playground',
 'Roadside',
 'Sauna',
 'Security system',
 'Sewerage, Canalization',
 'Storage room',
 'Sunny',
 'Swimming pool',
 'Tile',
 'View',
 'Water',
 'water 24/7'}

# MyRealty Common Feature Mapping

In [41]:
# MyRealty facility label -> feature name shared across the three sources.
myrealty_common_features = {
    "Air-conditioner": "Air Conditioner",
    "Security system": "Security",
    "Furniture": "Furniture",
    "Balcony": "Balcony",
    "Elevator": "Elevator",
    "Heating": "Heating System",
    "Internet": "Internet",
}

# Bnakaran

Note: the number of bathrooms can be extracted from the `room_details` column.

In [17]:
# Load the Bnakaran listings, discard columns that contain no data at all, and
# parse the stringified feature lists into real Python lists.
# NOTE(review): in the sample shown below, latitude is ~44.5 and longitude is
# ~40.2; the two columns may be swapped in the scraped data — confirm.
bnakaran = pd.read_csv(file_paths["bnakaran"]).dropna(axis=1, how='all')
bnakaran['additional_features'] = bnakaran['additional_features'].map(ast.literal_eval)
bnakaran.head(2)

Unnamed: 0,source,id,price,area,rooms,floor,storeys,building_type,added_in_date,additional_features,latitude,visit_count,utilities,room_details,longitude,flooring,entrance_door,construction_type,renovation,windows,heating,parking,cooling
0,bnakaran,d54284,1100.0,46,2,7,11,special,17.11.2023,"[air conditioner, new wiring, new water tubes,...",44.51229858,383,[],"{'rooms': '2', 'bedrooms': '1', 'bathrooms': '...",40.18579865,,metal,wall building,"renovated, uninhabited",plastic,gas boiler,outdoor,air conditioner
1,bnakaran,d111383,155000.0,130,4,4,5,,16.11.2023,"[new wiring, persistent water, hardwood floori...",44.52598631,99,[],"{'rooms': '4', 'bedrooms': '3', 'bathrooms': '...",40.19985702,hardwood,metal,stone,renovated,plastic,gas boiler,outdoor,


In [18]:
# Every distinct feature label that occurs in any Bnakaran listing.
bnakaran_unique_features = {
    item
    for sublist in bnakaran['additional_features']
    for item in sublist
}

In [25]:
# Encode the list/categorical columns, remove identifier-like and unparsed
# columns, and keep only listings that have a price.
# NOTE(review): 'air conditioner' occurs both as an additional feature and as
# a value of the 'cooling' column, so both encodings produce a column named
# "F_air conditioner" — check for duplicate column names downstream.
bnakaran_dummy = (
    dummify_columns(
        bnakaran,
        [
            "additional_features",
            "building_type",
            "flooring",
            "construction_type",
            "renovation",
            "entrance_door",
            "windows",
            "heating",
            "parking",
            "cooling",
        ],
        bnakaran_unique_features,
    )
    .drop(columns=["source", "id", "added_in_date", "room_details", "utilities"])
    .dropna(subset=['price'])
)
bnakaran_unique_features

{'DVD player',
 'LED TV',
 'TV',
 'TV set',
 'Wi-Fi',
 'air conditioner',
 'appliances',
 'bathroom tiled',
 'bathtub',
 'built-in wardrobes',
 'cable TV',
 'coffee maker',
 'dinnerware',
 'dishwasher',
 'electricity',
 'elevator',
 'exclusive design',
 'expandable',
 'furnished',
 'gas-stove',
 'gym',
 'hairdryer',
 'hardwood flooring',
 'heating flooring',
 'home cinema',
 'intercom',
 'internet',
 'iron',
 'jacuzzi',
 'kettle',
 'kitchen furniture',
 'kitchen tiled',
 'laminatе flooring',
 'metal door',
 'microwave',
 'natural gas',
 'natural gas is nearby',
 'new water tubes',
 'new wiring',
 'open balcony',
 'oriel',
 'oven',
 'persistent water',
 'plastic windows',
 'playground',
 'pool',
 'refrigerator',
 'renovated roof',
 'satellite TV',
 'sauna',
 'sec. sys.',
 'sec. worker',
 'shower',
 'sunny',
 'three-phase wiring',
 'toaster',
 'vacuum cleaner',
 'washing machine',
 'water heater',
 'ЖК телевизор',
 'бытовая техника',
 'встроенные шкафы',
 'газ',
 'домофон',
 'душевая каб

# Bnakaran Common Feature Mapping

In [44]:
# Bnakaran feature label -> feature name shared across the three sources.
# Labels appear in English, Russian and Armenian. The English labels in the
# data are lowercase ('air conditioner', 'furnished', 'open balcony',
# 'elevator', 'internet') plus 'Wi-Fi', so those exact spellings are mapped.
# The earlier keys are kept so existing lookups keep working.
bnakaran_common_features = {
    # Air conditioning
    "air conditioner": "Air Conditioner",
    "air-conditioner": "Air Conditioner",
    "кондиционер": "Air Conditioner",
    "օդորակիչ": "Air Conditioner",

    # Security
    "sec. sys.": "Security",
    "անվ. համ.": "Security",

    # Furniture
    "furnished": "Furniture",
    "Furniture": "Furniture",
    "կահավորված": "Furniture",
    "с мебелью": "Furniture",

    # Balcony
    "open balcony": "Balcony",
    "Balcony": "Balcony",
    "բաց պատշգամբ": "Balcony",
    "открытый балкон": "Balcony",

    # Elevator
    "elevator": "Elevator",
    "Elevator": "Elevator",
    "վերելակ": "Elevator",
    "лифт": "Elevator",

    # Heating
    # NOTE(review): no plain 'heating' label is visible among the feature
    # labels (heating has its own column in this dataset) — confirm this key
    # is still needed.
    "Heating": "Heating System",

    # Internet
    "internet": "Internet",
    "Wi-Fi": "Internet",
    "Internet": "Internet",
    "ինտերնետ": "Internet",
    "интернет": "Internet",
}
