In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the listings file; each top-level JSON key becomes a column.
data = pd.read_json('imoveis.json', orient='columns')

In [None]:
# Display the frame loaded from imoveis.json as the cell output.
data

In [None]:
# Inspect the entry labelled 0 of the `ident` column to see its nested structure.
data.ident[0]

In [None]:
# Inspect the entry labelled 0 of the `listing` column to see its nested structure.
data.listing[0]

In [None]:
# Flatten the nested `ident` records into one column per leaf key.
data_list1 = pd.json_normalize(data['ident'])

In [None]:
# Preview the first rows of the flattened `ident` frame.
data_list1.head()

In [None]:
# Flatten the nested `listing` records into one column per leaf key.
data_list2 = pd.json_normalize(data['listing'])

In [None]:
# Preview the first rows of the flattened `listing` frame.
data_list2.head()

In [None]:
# Place the two flattened frames side by side (rows are aligned on the index).
flattened_parts = [data_list1, data_list2]
property_data = pd.concat(flattened_parts, axis='columns')

In [None]:
# Preview the combined ident + listing frame.
property_data.head()

In [None]:
# Show the (rows, columns) shape of the combined frame.
property_data.shape 

In [None]:
# Print the value distribution of every column, each preceded by a dashed rule.
separator = '----' * 10
for column_name in property_data.columns:
    print(separator)
    print(property_data[column_name].value_counts())

In [None]:
# Keep only residential listings located in Rio de Janeiro.
# `.copy()` makes the result an independent frame: later cells assign into
# its columns, which on a filtered slice triggers SettingWithCopyWarning.
is_residential = property_data['types.usage'] == 'Residencial'
is_in_rio = property_data['address.city'] == 'Rio de Janeiro'
property_data = property_data[is_residential & is_in_rio].copy()

In [None]:
# Preview the frame after the usage/city filter.
property_data.head()

In [None]:
# Summarise columns, non-null counts and dtypes of the filtered frame.
property_data.info() 

In [None]:
# Renumber the rows 0..n-1, discarding the old index labels.
property_data = property_data.reset_index(drop=True)

In [None]:
# Short summary (no per-column listing) to confirm the new index range.
property_data.info(verbose= False)

In [None]:
# Cast price/tax columns to float and area columns to integer.
float_columns = ['prices.price', 'prices.tax.iptu', 'prices.tax.condo']
integer_columns = ['features.usableAreas', 'features.totalAreas']

target_dtypes = {name: 'float64' for name in float_columns}
target_dtypes.update({name: 'int64' for name in integer_columns})

property_data = property_data.astype(target_dtypes)

In [None]:
# Re-check dtypes after the numeric casts.
property_data.info()

In [None]:
# Distribution of zone labels before cleaning.
property_data['address.zone'].value_counts()

In [None]:
# Treat empty-string zones as missing values.
zone_column = property_data['address.zone']
property_data['address.zone'] = zone_column.replace({'': np.nan})

In [None]:
# Distribution of zone labels after blanks were turned into NaN
# (value_counts omits NaN by default).
property_data['address.zone'].value_counts()

In [None]:
# Count how many rows have a missing (True) vs. present (False) zone.
property_data['address.zone'].isnull().value_counts()

In [None]:
# Column summary after the zone clean-up.
property_data.info()

In [None]:
# Preview the frame after the zone clean-up.
property_data.head()

In [None]:
dict = property_data[~property_data['address.zone'].isna()].drop_duplicates(subset=['address.neighborhood']).to_dict('records')
print(dict)

In [None]:
property_data['address.zone'].isnull().sum()

In [None]:
zones_dict = {dic['address.neighborhood']:dic['address.zone']for dic in dict}

In [None]:
# Show the neighbourhood -> zone lookup table.
print(zones_dict)

In [None]:
# Assign each row the zone recorded for its neighbourhood in `zones_dict`.
# A single vectorised lookup replaces one full-column scan per neighbourhood.
# `na_action='ignore'` keeps missing neighbourhoods out of the lookup, and rows
# whose neighbourhood is not in the table keep their current zone.
mapped_zone = property_data['address.neighborhood'].map(zones_dict, na_action='ignore')
property_data['address.zone'] = mapped_zone.fillna(property_data['address.zone'])

In [None]:
# Preview the frame after zones were filled from the neighbourhood lookup.
property_data.head()

In [None]:
# Number of rows still missing a zone after the fill.
property_data['address.zone'].isnull().sum()

In [None]:
# Number of rows with a missing condo fee.
property_data['prices.tax.condo'].isnull().sum()

In [None]:
# Number of rows with a missing IPTU tax value.
property_data['prices.tax.iptu'].isnull().sum()

In [None]:
# Replace missing condo/IPTU values with 0.
# Assigning the filled columns back is required: `df[col].fillna(..., inplace=True)`
# mutates a temporary selection, which emits a FutureWarning in pandas 2.x and
# leaves the frame unchanged under Copy-on-Write.
tax_columns = ['prices.tax.condo', 'prices.tax.iptu']
property_data[tax_columns] = property_data[tax_columns].fillna(0)

In [None]:
# Report the remaining missing values in both tax columns.
for tax_label, tax_column in (('condo', 'prices.tax.condo'), ('iptu', 'prices.tax.iptu')):
    missing_count = property_data[tax_column].isnull().sum()
    print(f"Total of null values in tax {tax_label}: {missing_count}")

In [None]:
# Remove identifier, constant and location columns that are not used downstream.
unused_columns = [
    'customerID',
    'source',
    'types.usage',
    'address.city',
    'address.location.lon',
    'address.location.lat',
    'address.neighborhood',
]
property_data = property_data.drop(columns=unused_columns)

In [None]:
# Mapping from the flattened JSON column names to short names.
dict_columns = {
    'types.unit': 'unit',
    'address.zone': 'zone',
    'prices.price': 'price',
    'prices.tax.condo': 'tax.condo',
    'prices.tax.iptu': 'tax.iptu',
    'features.bedrooms': 'bedrooms',
    'features.bathrooms': 'bathrooms',
    'features.suites': 'suites',
    'features.parkingSpaces': 'parkingSpaces',
    'features.usableAreas': 'usableAreas',
    'features.totalAreas': 'totalAreas',
    'features.floors': 'floors',
    'features.unitsOnTheFloor': 'unitsOnTheFloor',
    'features.unitFloor': 'unitFloor',
}

In [None]:
# Apply the short column names and preview the result.
property_data = property_data.rename(columns=dict_columns)
property_data.head()

In [None]:
# Pairwise correlation between the numeric columns only.
column_n = property_data.select_dtypes(include='number')
correlation = column_n.corr()
correlation

In [None]:
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Lower-triangle correlation heatmap: the mask hides the diagonal and upper half.
colors = sns.color_palette('light:salmon', as_cmap=True)
mask = np.triu(np.ones_like(correlation, dtype=float))

with sns.axes_style('white'):
    f, ax = plt.subplots(figsize=(13, 8))
    ax = sns.heatmap(
        correlation,
        mask=mask,
        cmap=colors,
        annot=True,
        fmt='.2f',
        square=True,
    )

In [None]:
# Full (unmasked, unannotated) correlation heatmap using the 'crest' colormap.
sns.heatmap(correlation, cmap='crest')

In [None]:
# Annotated lower-triangle correlation heatmap using the 'crest' colormap.
plt.figure(figsize=(13, 8))
mask = np.triu(np.ones_like(correlation, dtype=float))
sns.heatmap(correlation, mask=mask, cmap='crest', annot=True, fmt='.2f', square=True)

In [None]:
# Histogram of `price` with a KDE overlay, drawn from `property_data`.
ax = sns.histplot(data=property_data, x='price', kde=True)
# The figure is resized after plotting because histplot created it implicitly.
ax.figure.set_size_inches(20,10)
ax.set_title('Price Histogram')
ax.set_xlabel('Price')

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Apply log1p to every non-object column and reattach the object columns.
numeric_part = property_data.select_dtypes(exclude=['object'])
categorical_part = property_data.select_dtypes(include=['object'])

transformer = FunctionTransformer(np.log1p, validate=True)
# fit_transform (rather than transform on an unfitted transformer) lets the
# input validation record the feature names it later checks against.
transformed_data = transformer.fit_transform(numeric_part)

columns_transformed_data = numeric_part.columns

# validate=True yields a plain array, so the original index is passed back
# explicitly; otherwise concat would align on a fresh 0..n-1 index and
# misalign rows whenever property_data carries any other index.
transformed_numeric = pd.DataFrame(
    transformed_data,
    columns=columns_transformed_data,
    index=property_data.index,
)

transformed_df = pd.concat([categorical_part, transformed_numeric], axis=1)
transformed_df.head()

In [None]:
# Correlation heatmap of the log1p-transformed variables.
# The numeric columns are taken from `transformed_df`; reading them from
# `property_data` would just reproduce the untransformed correlation matrix.
column_n = transformed_df.select_dtypes(include=['number'])
transformed_correlation = column_n.corr()

# Hide the diagonal and the redundant upper triangle.
mask = np.triu(np.ones_like(transformed_correlation, dtype=bool))

with sns.axes_style('white'):
    f, ax = plt.subplots(figsize=(13,8))
    ax = sns.heatmap(transformed_correlation, cmap=colors, mask=mask, square=True, fmt='.2f', annot=True)
    ax.set_title('Correlation between variables', fontsize=15)

In [None]:
# Histogram of `price` with a KDE overlay, drawn from `transformed_df`
# (the frame whose non-object columns went through np.log1p).
ax = sns.histplot(data=transformed_df, x='price', kde=True)
# The figure is resized after plotting because histplot created it implicitly.
ax.figure.set_size_inches(20, 10)
ax.set_title('Price Histogram')
ax.set_xlabel('Price')

In [None]:
# Names of the object-typed (categorical) columns.
categoric_variables = transformed_df.select_dtypes(include='object').columns

In [None]:
# One-hot encode the categorical columns.
categorical_frame = transformed_df[categoric_variables]
df_dummies = pd.get_dummies(categorical_frame)
df_dummies.head()

In [None]:
# Model-ready frame: transformed numeric columns plus the one-hot columns.
# Note that this rebinds `property_data` to the encoded frame.
numeric_only = transformed_df.drop(columns=categoric_variables)
property_data = pd.concat([numeric_only, df_dummies], axis=1)
property_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the `price` column; every other column is a feature.
y = property_data['price']
X = property_data.drop(columns='price')

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear regression on the training split; `fit` returns the estimator,
# so the trailing expression still displays it as the cell output.
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# Predict on the held-out features, then display those features as the cell output.
prevision = lr.predict(X_test)
X_test

In [None]:
# Invert log1p for a hard-coded value (np.expm1 undoes the np.log1p transform).
# NOTE(review): 7.49 is presumably read off a displayed output — confirm and
# reference the source value instead of a literal.
np.expm1(7.49)

In [None]:
# Display the linear-regression predictions for the test split.
prevision

In [None]:
# Invert log1p for a hard-coded value (np.expm1 undoes the np.log1p transform).
# NOTE(review): 12.45 is presumably read off a displayed output — confirm.
np.expm1(12.45)

In [None]:
# Invert log1p for a hard-coded value (np.expm1 undoes the np.log1p transform).
# NOTE(review): the literal looks like a copied prediction — confirm and index
# into the prediction array instead.
np.expm1(13.25768024)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the linear model on the test split.
r2 = r2_score(y_true=y_test, y_pred=prevision)
r2

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree with a fixed seed.
dtr = DecisionTreeRegressor(max_depth=10, random_state=42)

In [None]:
# Fit the decision tree on the training split.
dtr.fit(X_train, y_train)

In [None]:
# Decision-tree predictions for the test split, displayed as the cell output.
dtr_prevision = dtr.predict(X_test)
dtr_prevision

In [None]:
# Invert log1p for a hard-coded value (np.expm1 undoes the np.log1p transform).
# NOTE(review): the literal looks like a copied prediction — confirm and index
# into the prediction array instead.
np.expm1(13.55136531)

In [None]:
!pip install yellowbrick

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error plot for the decision tree.
fig, ax = plt.subplots(figsize=(10, 10))
# Pass the axes explicitly so the visualizer draws on the figure created here
# rather than on whatever the current pyplot axes happen to be.
pev = PredictionError(dtr, ax=ax)
pev.fit(X_train, y_train)
pev.score(X_test, y_test)
# `show()` is the supported name; `poof()` has been deprecated since Yellowbrick 1.0.
pev.show()

In [None]:
# Coefficient of determination of the decision tree on the test split.
r2_dtr = r2_score(y_true=y_test, y_pred=dtr_prevision)
r2_dtr

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 20-tree, depth-limited random forest with a fixed seed, fitted on the training split.
rf = RandomForestRegressor(n_estimators=20, max_depth=10, random_state=42).fit(X_train, y_train)

# Predictions for the test split, displayed as the cell output.
rf_prevision = rf.predict(X_test)
rf_prevision

In [None]:
# Invert log1p for a hard-coded value (np.expm1 undoes the np.log1p transform).
# NOTE(review): the literal looks like a copied prediction — confirm and index
# into the prediction array instead.
np.expm1(13.53707348)

In [None]:
# Prediction-error plot for the random forest.
fig, ax = plt.subplots(figsize=(10,10))
# Pass the axes explicitly so the visualizer draws on the figure created here
# rather than on whatever the current pyplot axes happen to be.
pev = PredictionError(rf, ax=ax)
pev.fit(X_train, y_train)
pev.score(X_test, y_test)
# `show()` is the supported name; `poof()` has been deprecated since Yellowbrick 1.0.
pev.show()

In [83]:
# Coefficient of determination of the random forest on the test split.
r2_rf = r2_score(y_true=y_test, y_pred=rf_prevision)
r2_rf

0.8760096985885704

In [85]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [88]:
def get_metric(model, X_test, y_test, name):
    """Score one fitted model and return its metrics as a one-row DataFrame.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed to ``model.predict``.
    y_test : true target values compared against the predictions.
    name : label used as the index of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``name`` with columns ``R2``, ``MSE`` and ``MAE``.
    """
    y_pred = model.predict(X_test)
    scorers = {
        'R2': r2_score,
        'MSE': mean_squared_error,
        'MAE': mean_absolute_error,
    }
    # Each value is wrapped in a list so the frame has exactly one row.
    row = {label: [scorer(y_test, y_pred)] for label, scorer in scorers.items()}
    return pd.DataFrame(row, index=[name])

def table_metrics(model_rl, model_dt, model_rf, X_test, y_test):
    """Build a comparison table of metrics for the three fitted models.

    Each model is scored with ``get_metric`` on the same test data; the rows
    are stacked in the order Linear Regression, Decision Tree, Random Forest.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the model's display name.
    """
    labelled_models = (
        ('Linear Regression', model_rl),
        ('Decision Tree', model_dt),
        ('Random Forest', model_rf),
    )
    metric_rows = [
        get_metric(model, X_test, y_test, label)
        for label, model in labelled_models
    ]
    return pd.concat(metric_rows)


In [89]:
# Compare the three fitted models on the same test split.
table_metrics(lr, dtr, rf, X_test, y_test)

Unnamed: 0,R2,MSE,MAE
Linear Regression,0.788068,0.173512,0.312049
Decision Tree,0.854874,0.118817,0.248978
Random Forest,0.87601,0.101513,0.23203
