In [6]:
import ast
import pandas as pd
import numpy as np
# Notebook-wide display settings: show every column of wide frames and print
# floats with eight fixed decimals. `set_option` accepts several
# (name, value) pairs in a single call.
pd.set_option(
    'display.max_columns', None,
    'display.float_format', '{:.8f}'.format,
)

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [8]:
def create_dummies(row, unique_features):
    """Build a 0/1 membership flag for every label in ``unique_features``.

    Parameters
    ----------
    row : container
        Values present in one record; membership is tested with ``in``.
    unique_features : iterable
        Labels that become the keys of the result, in iteration order.

    Returns
    -------
    dict
        Maps each label to 1 when it occurs in ``row`` and to 0 otherwise.
    """
    flags = {}
    for feature in unique_features:
        flags[feature] = int(feature in row)
    return flags

def dummify_column(df, col_name, unique_features):
    """Replace one column of ``df`` with ``F_``-prefixed indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified; a new frame is returned.
    col_name : str
        Column to encode.
    unique_features : iterable
        Labels to emit when *every* cell of the column is a ``list``
        (forwarded to ``create_dummies``). Ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col_name``, followed by the indicator columns.

    Notes
    -----
    * A column that is not made up exclusively of lists is encoded with
      ``pandas.get_dummies``.
    * Every indicator column gets the same ``F_`` prefix regardless of which
      source column produced it, so equal labels coming from different
      columns end up with equal column names.
    """
    column = df[col_name]

    if column.apply(lambda value: isinstance(value, list)).all():
        # Build the indicator frame in one go from a list of dicts instead of
        # `.apply(pd.Series)`, which constructs a separate Series per row.
        records = [create_dummies(row, unique_features) for row in column]
        dummies = pd.DataFrame(records, index=df.index)
    else:
        # Plain categorical column: one indicator per distinct value.
        dummies = pd.get_dummies(column)

    # Format instead of concatenating: get_dummies keeps the original values as
    # labels, so a numeric or boolean column would make `"F_" + label` raise.
    dummies.columns = [f"F_{label}" for label in dummies.columns]

    # Drop the source column first so that only the original is removed, then
    # append the indicators on the right.
    return pd.concat([df.drop(columns=[col_name]), dummies], axis=1)

def dummify_columns(df, cols, unique_features):
    """Apply ``dummify_column`` to each name in ``cols``, in order.

    The frame returned by one step is the input of the next. With an empty
    ``cols`` the very same ``df`` object is handed back (no copy is made).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.
    cols : iterable of str
        Columns to replace with indicator columns.
    unique_features : iterable
        Passed unchanged to every ``dummify_column`` call.

    Returns
    -------
    pandas.DataFrame
        The frame after all requested columns have been encoded.
    """
    encoded = df
    for column_name in cols:
        encoded = dummify_column(encoded, column_name, unique_features)
    return encoded

In [9]:
def feature_importance(data, y_col, n_estimators=100, random_state=42):
    """Rank the columns of ``data`` by random-forest importance for ``y_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Model matrix including the target column. Every column except
        ``y_col`` is used as a predictor.
    y_col : str
        Name of the target column.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed handed to ``RandomForestRegressor``. A fixed seed makes the
        ranking reproducible between runs; pass ``None`` for an unseeded fit.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by descending
        importance. The index keeps each feature's position in the predictor
        matrix.
    """
    X = data.drop(columns=y_col)
    y = data[y_col]

    # Seeded so that re-running the notebook yields the same importances.
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)

    feature_importances = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    })
    return feature_importances.sort_values(by='importance', ascending=False)

def plot_feature_importance(feature_importances):
    """Draw a horizontal bar chart of feature importances and show it.

    Parameters
    ----------
    feature_importances : pandas.DataFrame
        Frame with a ``feature`` column (bar labels) and an ``importance``
        column (bar lengths), e.g. the result of ``feature_importance``.
    """
    labels = feature_importances['feature']
    scores = feature_importances['importance']

    plt.figure(figsize=(10, 6))
    plt.barh(labels, scores)
    plt.title('Feature Importance')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.show()

In [10]:
# Relative path of the apartments CSV for each listing source, keyed by the
# short source name used throughout the notebook.
file_paths = dict([
    ("bars", "../data/bars_apartments.csv"),
    ("myrealty", "../data/myrealty_apartments.csv"),
    ("bnakaran", "../data/bnakaran_apartments.csv"),
])

# Bars

In [11]:
# Read the Bars listings, discard columns that hold no values at all, and parse
# the stringified `facilities` cells into Python objects.
bars = (
    pd.read_csv(file_paths["bars"])
    .dropna(axis=1, how='all')
    .assign(facilities=lambda frame: frame['facilities'].apply(ast.literal_eval))
)

# Every distinct facility label that appears in any listing.
bars_unique_features = {item for sublist in bars['facilities'] for item in sublist}

# Encode the list/categorical columns and remove `source`, `id` and `location`.
bars_dummy = dummify_columns(bars, ["facilities", "building_type", "condition"], bars_unique_features)
bars_dummy = bars_dummy.drop(columns=["source", "id", "location"])

# Fill missing ceiling heights with the median (taken before any rows are
# removed), then keep only rows that have both an area and a price.
bars_dummy = (
    bars_dummy
    .assign(ceiling_height=bars_dummy["ceiling_height"].fillna(bars_dummy["ceiling_height"].median()))
    .dropna(subset=['area', 'price'])
)
feature_importance(bars_dummy, "price")

Unnamed: 0,feature,importance
0,area,0.56351941
5,bathroom_count,0.20209945
25,F_Front balcony,0.1048807
6,bedrooms,0.02422271
1,rooms,0.01593061
27,F_First line,0.01140359
36,F_Transport availability,0.01093328
3,storeys,0.00867808
2,floor,0.00651796
4,ceiling_height,0.00632066


# MyRealty

In [12]:
# Read the MyRealty listings, discard columns that hold no values at all, parse
# the stringified `facilities` cells, and turn `ceiling_height` into a number.
myrealty = (
    pd.read_csv(file_paths["myrealty"])
    .dropna(axis=1, how='all')
    .assign(
        facilities=lambda frame: frame['facilities'].apply(ast.literal_eval),
        # Strip everything except digits and dots, then convert; anything that
        # still fails to parse becomes NaN (errors='coerce').
        # The pattern is a raw string: in a plain literal "\d" is an invalid
        # escape sequence, which Python warns about and plans to reject.
        ceiling_height=lambda frame: pd.to_numeric(
            frame['ceiling_height'].str.replace(r'[^\d.]', '', regex=True),
            errors='coerce',
        ),
    )
)

In [13]:
# Every distinct facility label that appears in any MyRealty listing.
myrealty_unique_features = {item for sublist in myrealty['facilities'] for item in sublist}

In [14]:
# Encode the list/categorical columns and remove the columns that are not used
# as model inputs.
myrealty_dummy = dummify_columns(
    myrealty,
    ["facilities", "building_type", "condition"],
    myrealty_unique_features
)
myrealty_dummy = myrealty_dummy.drop(columns=["source", "id", "location", "added_in_date"])

# A regressor cannot be fitted on a missing target, so rows without a price are
# removed first, as the Bars and Bnakaran sections do. This is a no-op when
# every listing has a price.
myrealty_dummy = myrealty_dummy.dropna(subset=['price'])
feature_importance(myrealty_dummy, "price")

Unnamed: 0,feature,importance
0,area,0.60557442
6,view_count,0.05836903
3,storeys,0.03390612
31,F_Playground,0.02095647
9,F_Gas,0.02000824
50,F_Newly repaired,0.01884333
4,ceiling_height,0.01842606
42,F_Air-conditioner,0.0162366
2,floor,0.01260161
17,F_Roadside,0.01200755


# Bnakaran

TODO: the bathroom count can be extracted from the `room_details` column.

In [19]:
# Read the Bnakaran listings, discard columns that hold no values at all, and
# parse the stringified `additional_features` cells into Python objects.
bnakaran = (
    pd.read_csv(file_paths["bnakaran"])
    .dropna(axis=1, how='all')
    .assign(additional_features=lambda frame: frame['additional_features'].apply(ast.literal_eval))
)
# NOTE(review): in the preview, `latitude` is ~44.5 and `longitude` ~40.2. If
# these are Armenian listings the two columns look swapped; confirm upstream.
bnakaran.head(2)

Unnamed: 0,source,id,price,area,rooms,floor,storeys,added_in_date,additional_features,latitude,visit_count,utilities,room_details,details,longitude
0,bnakaran,d113832,300000.0,75,3,8,14,14.11.2023,"[air conditioner, heating flooring, new wiring...",44.50805864,24,[],"{'rooms': '3', 'bedrooms': '2', 'bathrooms': '...","{'Construction type': 'in situ concrete', 'new...",40.21100941
1,bnakaran,d113850,430000.0,155,4,5,5,14.11.2023,"[air conditioner, persistent water, hardwood f...",44.50699578,74,[],"{'rooms': '4', 'bedrooms': '3', 'bathrooms': '...","{'Construction type': 'stone', 'Building type'...",40.17742786


In [16]:
# Every distinct label that appears in any listing's `additional_features`.
bnakaran_unique_features = {item for sublist in bnakaran['additional_features'] for item in sublist}

In [17]:
# Encode `additional_features`, remove the columns that are not used as model
# inputs, and keep only the listings that have a price.
bnakaran_dummy = (
    dummify_columns(bnakaran, ["additional_features"], bnakaran_unique_features)
    .drop(columns=["source", "id", "added_in_date", "room_details", "details", "utilities"])
    .dropna(subset=['price'])
)
bnakaran_dummy.head(1)

Unnamed: 0,price,area,rooms,floor,storeys,latitude,visit_count,longitude,F_open balcony,F_hardwood flooring,F_washing machine,F_internet,F_elevator,F_bathroom tiled,F_built-in wardrobes,F_intercom,F_new water tubes,F_electricity,F_refrigerator,F_cable TV,F_jacuzzi,F_kettle,F_sunny,F_sec. worker,F_pool,F_heating flooring,F_persistent water,F_dinnerware,F_expandable,F_LED TV,F_kitchen tiled,F_plastic windows,F_metal door,F_playground,F_appliances,F_Wi-Fi,F_air conditioner,F_sec. sys.,F_exclusive design,F_shower,F_furnished,F_new wiring,F_oriel,F_natural gas,F_bathtub,F_gym,F_kitchen furniture,F_laminatе flooring
0,300000.0,75,3,8,14,44.50805864,24,40.21100941,1,0,1,0,1,1,1,0,1,1,1,0,0,0,1,0,0,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,0,0,1,1


In [18]:
# Rank the Bnakaran predictors by their importance for the listing price.
feature_importance(data=bnakaran_dummy, y_col="price")

Unnamed: 0,feature,importance
0,area,0.43793289
4,latitude,0.18023568
18,F_cable TV,0.16115946
29,F_kitchen tiled,0.05395485
28,F_LED TV,0.02291099
27,F_expandable,0.02057558
6,longitude,0.01882975
22,F_sec. worker,0.01662522
2,floor,0.01065267
10,F_internet,0.01041451
