In [15]:
import ast
import pandas as pd
import numpy as np
# Notebook-wide display settings: never truncate columns, and render floats
# with eight decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.8f}'.format

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [17]:
def create_dummies(row, unique_features):
    """Build the one-hot mapping for a single row.

    Args:
        row: Container of the feature values present in this row; only
            membership (``in``) is evaluated against it.
        unique_features: Iterable of every feature name to encode.

    Returns:
        dict mapping each entry of ``unique_features`` to 1 when it occurs in
        ``row`` and to 0 otherwise, in iteration order of ``unique_features``.
    """
    indicators = {}
    for feature in unique_features:
        indicators[feature] = int(feature in row)
    return indicators

def dummify_column(df, col_name, unique_features):
    """Replace one column of ``df`` with 0/1 indicator columns.

    When every value in ``df[col_name]`` is a list, one indicator column is
    created per entry of ``unique_features`` (1 if the row's list contains
    that entry, else 0). Otherwise the column is one-hot encoded with
    ``pd.get_dummies``, giving one column per distinct value.

    Args:
        df: Input DataFrame. It is not modified.
        col_name: Name of the column to encode.
        unique_features: Iterable of feature names; only used when the column
            holds lists.

    Returns:
        A new DataFrame containing the original columns except ``col_name``,
        followed by the indicator columns. Indicators have an integer dtype
        in both branches.

    Note:
        Indicator columns are not prefixed with ``col_name``, so their names
        can collide with existing columns or with indicators produced from
        another column.
    """
    column = df[col_name]

    if column.apply(lambda value: isinstance(value, list)).all():
        # De-duplicate while keeping order so each feature yields one column.
        features = list(dict.fromkeys(unique_features))
        # Build the indicator table in one shot; expanding each row through
        # ``pd.Series`` creates one Series object per row and is far slower.
        records = [
            {feature: int(feature in values) for feature in features}
            for values in column
        ]
        dummies = pd.DataFrame(records, index=df.index, columns=features)
    else:
        # Request integers explicitly: the default dtype of get_dummies
        # differs between pandas versions (uint8 vs bool).
        dummies = pd.get_dummies(column, dtype=int)

    return pd.concat([df.drop(columns=[col_name]), dummies], axis=1)

def dummify_columns(df, cols, unique_features):
    """Apply ``dummify_column`` to each column in ``cols``, in order.

    Args:
        df: Input DataFrame.
        cols: Iterable of column names to encode, processed sequentially.
        unique_features: Forwarded unchanged to every ``dummify_column`` call.

    Returns:
        The result of the last ``dummify_column`` call, or ``df`` itself when
        ``cols`` is empty.
    """
    encoded = df
    for column_name in cols:
        encoded = dummify_column(encoded, column_name, unique_features)
    return encoded

In [5]:
def feature_importance(data, y_col, n_estimators=100, random_state=42):
    """Rank features by random-forest importance for predicting ``y_col``.

    Fits a ``RandomForestRegressor`` on every column of ``data`` except
    ``y_col`` and reports the fitted model's ``feature_importances_``.

    Args:
        data: DataFrame holding both the predictors and the target column.
        y_col: Name of the target column.
        n_estimators: Number of trees in the forest.
        random_state: Seed forwarded to the forest so repeated runs produce
            the same ranking. Pass ``None`` for an unseeded forest.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        importance in descending order. The index keeps each feature's
        position among the predictor columns.
    """
    X = data.drop(columns=y_col)
    y = data[y_col]

    # A fixed seed keeps the ranking reproducible across kernel restarts.
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)

    ranking = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_,
    })
    return ranking.sort_values(by='importance', ascending=False)

def plot_feature_importance(feature_importances):
    """Draw a horizontal bar chart of feature importances and show it.

    Args:
        feature_importances: Table with a ``feature`` column (bar labels) and
            an ``importance`` column (bar lengths).
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.barh(feature_importances['feature'], feature_importances['importance'])
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importance')
    plt.show()

In [6]:
# Relative location of each source's apartment CSV, keyed by source name.
file_paths = dict(
    bars="../data/bars_apartments.csv",
    myrealty="../data/myrealty_apartments.csv",
    bnakaran="../data/bnakaran_apartments.csv",
)

# Bars

In [7]:
# Load the Bars listings, discard columns that contain no values at all, and
# evaluate each `facilities` cell with ast.literal_eval so it can be iterated.
bars = (
    pd.read_csv(file_paths["bars"])
    .dropna(axis=1, how="all")
    .assign(facilities=lambda frame: frame["facilities"].apply(ast.literal_eval))
)

# Every distinct facility that appears in any listing.
bars_unique_features = {
    facility for facilities in bars["facilities"] for facility in facilities
}

# Encode the categorical columns, then drop the columns not used as predictors.
bars_dummy = dummify_columns(
    bars,
    ["facilities", "building_type", "condition"],
    bars_unique_features,
).drop(columns=["source", "id", "location"])
# Fill missing ceiling heights with the column median.
bars_dummy = bars_dummy.assign(
    ceiling_height=bars_dummy["ceiling_height"].fillna(
        bars_dummy["ceiling_height"].median()
    )
)
# Rows lacking either an area or a price are removed before fitting.
bars_dummy = bars_dummy.dropna(subset=["area", "price"])
feature_importance(bars_dummy, "price")

Unnamed: 0,feature,importance
0,area,0.58753013
24,Front balcony,0.13399124
5,bathroom_count,0.0776106
18,Parking,0.02396535
17,First line,0.02163848
6,bedrooms,0.01580645
4,ceiling_height,0.01532623
3,storeys,0.01449241
25,Transport availability,0.01421177
2,floor,0.01142483


# MyRealty

In [8]:
# Load the MyRealty listings, discard columns that contain no values at all,
# evaluate each `facilities` cell with ast.literal_eval, and convert
# `ceiling_height` to a number.
myrealty = (
    pd.read_csv(file_paths["myrealty"])
    .dropna(axis=1, how="all")
    .assign(
        facilities=lambda frame: frame["facilities"].apply(ast.literal_eval),
        # Strip everything except digits and dots, then parse; anything that
        # still is not a valid number becomes NaN. The pattern is a raw
        # string so `\d` reaches the regex engine without relying on Python's
        # deprecated handling of unknown escape sequences.
        ceiling_height=lambda frame: pd.to_numeric(
            frame["ceiling_height"].str.replace(r'[^\d.]', '', regex=True),
            errors="coerce",
        ),
    )
)

In [9]:
# Every distinct facility that appears in any MyRealty listing.
myrealty_unique_features = {
    facility for facilities in myrealty['facilities'] for facility in facilities
}

In [10]:
# Encode the categorical columns, drop the columns not used as predictors,
# and remove rows without a price (the target) before fitting, as the other
# sources' cells do.
myrealty_dummy = (
    dummify_columns(
        myrealty,
        ["facilities", "building_type", "condition"],
        myrealty_unique_features,
    )
    .drop(columns=["source", "id", "location", "added_in_date"])
    .dropna(subset=["price"])
)
feature_importance(myrealty_dummy, "price")

Unnamed: 0,feature,importance
0,area,0.60107105
6,view_count,0.05625174
3,storeys,0.036681
50,Newly repaired,0.01945327
8,Playground,0.01767538
4,ceiling_height,0.01648985
39,Irrigation,0.0152481
18,Air-conditioner,0.014616
5,bathroom_count,0.01445822
23,Roadside,0.01400673


# Bnakaran

Note: bathroom information can be derived from the `room_details` column.

In [11]:
# Load the Bnakaran listings, discard columns that contain no values at all,
# and evaluate each `additional_features` cell with ast.literal_eval.
bnakaran = (
    pd.read_csv(file_paths["bnakaran"])
    .dropna(axis=1, how="all")
    .assign(
        additional_features=lambda frame: frame["additional_features"].apply(
            ast.literal_eval
        )
    )
)

In [12]:
# Every distinct additional feature that appears in any Bnakaran listing.
bnakaran_unique_features = {
    feature for features in bnakaran['additional_features'] for feature in features
}

In [13]:
# Encode `additional_features`, drop the columns not used as predictors, and
# remove rows without a price.
bnakaran_dummy = (
    dummify_columns(bnakaran, ["additional_features"], bnakaran_unique_features)
    .drop(
        columns=["source", "id", "added_in_date", "room_details", "details", "utilities"]
    )
    .dropna(subset=["price"])
)
bnakaran_dummy.head(1)

Unnamed: 0,price,area,rooms,floor,storeys,latitude,visit_count,longitude,heating flooring,oriel,plastic windows,sec. worker,sec. sys.,bathtub,new water tubes,LED TV,pool,internet,jacuzzi,kettle,sunny,open balcony,metal door,new wiring,laminatе flooring,gym,cable TV,hardwood flooring,appliances,shower,washing machine,elevator,kitchen tiled,playground,bathroom tiled,dinnerware,refrigerator,kitchen furniture,natural gas,intercom,air conditioner,Wi-Fi,persistent water,built-in wardrobes,exclusive design,electricity,expandable,furnished
0,300000.0,75,3,8,14,44.50805864,24,40.21100941,1,0,1,0,0,0,1,0,0,0,0,0,1,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0,1,1,1,0,1,0,1,1,0,1,0,1


In [14]:
# Rank the Bnakaran predictors by their importance for the price.
feature_importance(data=bnakaran_dummy, y_col="price")

Unnamed: 0,feature,importance
0,area,0.47649584
4,latitude,0.20712687
25,cable TV,0.12647117
31,kitchen tiled,0.02509292
14,LED TV,0.02483568
45,expandable,0.02429529
6,longitude,0.02131188
18,kettle,0.00974749
5,visit_count,0.00898662
16,internet,0.00889878
