<font size="+3"><strong>Predicting Price with Everything</strong></font>

In [None]:
!pip install category_encoders

In [None]:
import warnings
from glob import glob

import pandas as pd
import seaborn as sns
from category_encoders import OneHotEncoder
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Globally ignore FutureWarning so such warnings are not displayed by later cells.
warnings.simplefilter(action="ignore", category=FutureWarning)

# **Prepare Data**

## **Import**

In [None]:
def wrangle(filepath):
    """Load one real-estate CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Keep only apartments whose "place_with_parent_names" contains
         "Capital Federal" and whose "price_aprox_usd" is below 400,000.
      2. Split the "lat-lon" text column into float "lat" and "lon" columns.
      3. Drop sparse, low/high-cardinality, leaky and collinear columns.
      4. Keep rows whose "surface_covered_in_m2" lies between the 10th and
         90th percentile of the remaining rows (bounds included).
      5. Extract "neighborhood" as the 4th "|"-separated field of
         "place_with_parent_names", then drop that column.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows; the original row labels of the file are preserved.
    """
    raw = pd.read_csv(filepath)

    # Subset data: apartments in "Capital Federal", less than 400,000.
    # na=False makes rows with a missing place name count as non-matching.
    mask_ba = raw["place_with_parent_names"].str.contains("Capital Federal", na=False)
    mask_apt = raw["property_type"] == "apartment"
    mask_price = raw["price_aprox_usd"] < 400_000
    # Take an explicit copy: the frame is modified below, and writing into a
    # filtered slice of `raw` is what triggers pandas' SettingWithCopyWarning.
    df = raw[mask_ba & mask_apt & mask_price].copy()

    # Split "lat-lon" column into numeric latitude / longitude
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)

    df = df.drop(
        columns=[
            # Already split into "lat" and "lon"
            "lat-lon",
            # Features with high null counts
            "floor",
            "expenses",
            # Low and high cardinality categorical variables
            "operation",
            "property_type",
            "currency",
            "properati_url",
            # Leaky columns (they are derived from the price being predicted)
            "price",
            "price_aprox_local_currency",
            "price_per_m2",
            "price_usd_per_m2",
            # Columns with multicollinearity
            "surface_total_in_m2",
            "rooms",
        ]
    )

    # Subset data: remove outliers for "surface_covered_in_m2"
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    mask_area = df["surface_covered_in_m2"].between(low, high)
    df = df[mask_area].copy()

    # Get place name: 4th field of the "|"-separated place string
    df["neighborhood"] = df["place_with_parent_names"].str.split("|", expand=True)[3]
    df = df.drop(columns="place_with_parent_names")

    return df

In [None]:
# Collect the filenames of all real estate CSV files.
# glob() returns matches in arbitrary, filesystem-dependent order, so sort
# them: this keeps the row order of `df` -- and therefore the seeded
# train/test split made later -- identical from one machine to the next.
files = sorted(glob("buenos-aires-real-estate-*.csv"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(
        "No files matching 'buenos-aires-real-estate-*.csv' found in the working directory."
    )

# Clean every file with `wrangle`, producing one DataFrame per file
frames = [wrangle(file) for file in files]

# Concatenate the items in `frames` into a single DataFrame `df`
df = pd.concat(frames, ignore_index=True)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line to the output).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6582 entries, 0 to 6581
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   price_aprox_usd        6582 non-null   float64
 1   surface_covered_in_m2  6582 non-null   float64
 2   lat                    6316 non-null   float64
 3   lon                    6316 non-null   float64
 4   neighborhood           6582 non-null   object 
dtypes: float64(4), object(1)
memory usage: 257.2+ KB
None


Unnamed: 0,price_aprox_usd,surface_covered_in_m2,lat,lon,neighborhood
0,112000.0,60.0,-34.564676,-58.45572,Belgrano
1,76500.0,36.0,-34.61883,-58.437779,Caballito
2,90000.0,100.0,-34.577168,-58.538654,Belgrano
3,159900.0,77.0,-34.571526,-58.455637,Colegiales
4,123875.0,50.0,-34.578688,-58.457358,Colegiales


## **Split**

In [None]:
# Separate the predictors (X) from the value to be predicted (y)
target = "price_aprox_usd"
features = ["surface_covered_in_m2", "lat", "lon", "neighborhood"]
X, y = df[features], df[target]
print(X.shape, y.shape, sep="\n")

(6582, 4)
(6582,)


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (5265, 4)
y_train shape: (5265,)
X_test shape: (1317, 4)
y_test shape: (1317,)


# **Build Model**

## **Baseline**

In [None]:
# Naive baseline: predict the mean training price for every training row
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]
print("Mean apartment price:", round(y_mean, 2))
print(
    "Baseline MAE:",
    mean_absolute_error(y_train, y_pred_baseline),
)

Mean apartment price: 132015.15
Baseline MAE: 44393.95213998732


## **Iterate**

In [None]:
# The pipeline chains three steps, applied in this order:
# a `OneHotEncoder`, a `SimpleImputer`, and a `Ridge` predictor
pipeline_steps = (
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge(),
)
model = make_pipeline(*pipeline_steps)
model.fit(X_train, y_train)

## **Evaluate**

In [None]:
# Training error: predict on the same rows the model was fitted on
y_pred_training = model.predict(X_train)
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", mae_training)

Training MAE: 24176.597492147277


In [None]:
# R^2 score of the training predictions
r2_square_train = metrics.r2_score(y_true=y_train, y_pred=y_pred_training)
r2_square_train

0.667871010981721

In [None]:
# Calculate the testing mean absolute error.
# The predictions are given the same index as X_test so that they stay
# label-aligned with y_test; with a default 0..n-1 index, any label-aligned
# pandas operation (e.g. `y_test - y_pred_test`) would pair up the wrong rows.
y_pred_test = pd.Series(model.predict(X_test), index=X_test.index)
print(y_pred_test.head())
mae_testing = mean_absolute_error(y_test, y_pred_test)
print("Testing MAE:", round(mae_testing, 2))

0    204540.359154
1     81165.282275
2    130971.705940
3     95786.838217
4    176212.668301
dtype: float64
Testing MAE: 24150.49


In [None]:
# R^2 score of the held-out test predictions
r2_square_test = metrics.r2_score(y_true=y_test, y_pred=y_pred_test)
r2_square_test

0.6851501234151371

# **Results**

In [None]:
# Create a function that returns our model's prediction for an apartment price
def make_prediction(area, lat, lon, neighborhood):
    """Predict the price of a single apartment with the notebook-level ``model``.

    Parameters
    ----------
    area : number
        Value used for the "surface_covered_in_m2" feature.
    lat : float
        Value used for the "lat" feature.
    lon : float
        Value used for the "lon" feature.
    neighborhood : str
        Value used for the "neighborhood" feature.

    Returns
    -------
    str
        ``"Predicted apartment price: $<amount>"`` where the amount always
        shows exactly two decimal places.
    """
    # One-row frame with the same column names the model was fitted on.
    # Named `observation` so it does not shadow the notebook-level `df`.
    observation = pd.DataFrame(
        {
            "surface_covered_in_m2": area,
            "lat": lat,
            "lon": lon,
            "neighborhood": neighborhood,
        },
        index=[0],
    )
    prediction = float(model.predict(observation)[0])
    # Fixed two-decimal formatting: interpolating the rounded float directly
    # printed amounts such as "$100000.1" with an inconsistent number of decimals.
    return f"Predicted apartment price: ${prediction:.2f}"

In [None]:
# NOTE(review): the sample rows printed earlier have lon near -58.4 and
# capitalized neighborhood names (e.g. "Belgrano"), and the data was capped
# at 400,000 USD. lon=-54.095 and "boca" therefore look like out-of-range /
# unseen inputs -- confirm these demo values are intended.
make_prediction(78, -34.715, -54.095, "boca")

'Predicted apartment price: $485513.35'

In [None]:
# Interactive dashboard: the user picks values with widgets and receives a prediction
def _training_range(column):
    """Return min / max / mean of a training column as slider keyword arguments."""
    values = X_train[column]
    return {"min": values.min(), "max": values.max(), "value": values.mean()}


interact(
    make_prediction,
    area=IntSlider(**_training_range("surface_covered_in_m2")),
    lat=FloatSlider(step=0.01, **_training_range("lat")),
    lon=FloatSlider(step=0.01, **_training_range("lon")),
    neighborhood=Dropdown(options=sorted(X_train["neighborhood"].unique())),
);

interactive(children=(IntSlider(value=53, description='area', max=101, min=30), FloatSlider(value=-34.59914860…