# Rent price in Barcelona 2014 - 2022

In [2]:
# Importing the relevant Libraries 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
# Importing the data from Kaggle 
df = pd.read_csv("Barcelona_rent_price.csv")
df.head(5)

Unnamed: 0,Year,Trimester,District,Neighbourhood,Average _rent,Price
0,2014,1,Ciutat Vella,el Raval,average rent (euro/month),589.55
1,2014,1,Ciutat Vella,Gothic Quarter,average rent (euro/month),712.79
2,2014,1,Ciutat Vella,la Barceloneta,average rent (euro/month),540.71
3,2014,1,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",average rent (euro/month),673.44
4,2014,1,Eixample,Fort Pienc,average rent (euro/month),736.09


In [4]:
## Name of all columns
df.columns

Index(['Year', 'Trimester', 'District', 'Neighbourhood', 'Average _rent',
       'Price'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           4622 non-null   int64  
 1   Trimester      4622 non-null   int64  
 2   District       4622 non-null   object 
 3   Neighbourhood  4622 non-null   object 
 4   Average _rent  4622 non-null   object 
 5   Price          4622 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 216.8+ KB


In [6]:
# Checking the categorical unique values in "Year"
df["Year"].unique()

array([2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022], dtype=int64)

In [7]:
## Checking the numbers each year appeeared
df['Year'].value_counts()

2015    552
2016    552
2014    546
2017    546
2018    540
2021    540
2019    538
2020    538
2022    270
Name: Year, dtype: int64

In [8]:
# Checking the categorical unique values in "Trimester"
df["Trimester"].unique()

array([1, 2, 3, 4], dtype=int64)

In [9]:
# Checking the categorical unique values in "District"
df["District"].unique()

array(['Ciutat Vella', 'Eixample', 'Sants-Montjuic', 'Les Corts',
       'Sarria-Sant Gervasi', 'Gracia', 'Horta-Guinardo', 'Nou Barris',
       'Sant Andreu', 'Sant Marti'], dtype=object)

In [10]:
# Checking the categorical unique values in "Average _rent"
df["Average _rent"].unique()

array(['average rent (euro/month)', 'average rent per surface (euro/m2)'],
      dtype=object)

In [11]:
# Checking the categorical unique values in "Neighbourhood"
df["Neighbourhood"].unique()

array(['el Raval', 'Gothic Quarter', 'la Barceloneta',
       'Sant Pere, Santa Caterina i la Ribera', 'Fort Pienc',
       'Sagrada Familia', "la Dreta de l'Eixample",
       "l'Antiga Esquerra de l'Eixample",
       "la Nova Esquerra de l'Eixample", 'Sant Antoni', 'el Poble Sec',
       'la Marina de Port', 'la Font de la Guatlla', 'Hostafrancs',
       'la Bordeta', 'Sants - Badal', 'Sants', 'les Corts',
       'la Maternitat i Sant Ramon', 'Pedralbes',
       'Vallvidrera, el Tibidabo i les Planes', 'Sarria',
       'les Tres Torres', 'Sant Gervasi - la Bonanova',
       'Sant Gervasi - Galvany', 'el Putxet i el Farro',
       'Vallcarca i els Penitents', 'el Coll', 'la Salut',
       'la Vila de Gracia', "el Camp d'en Grassot i Gracia Nova",
       'el Baix Guinardo', 'Can Baro', 'el Guinardo',
       "la Font d'en Fargues", 'el Carmel', 'la Teixonera',
       'Sant Genis dels Agudells', 'Montbau', "la Vall d'Hebron", 'Horta',
       'Vilapicina i la Torre Llobeta', 'Porta', 'el T

In [12]:
# Checking if there are any null values 
df.isnull().sum()

Year             0
Trimester        0
District         0
Neighbourhood    0
Average _rent    0
Price            0
dtype: int64

In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,4622.0,2017.745565,2.462828,2014.0,2016.0,2018.0,2020.0,2022.0
Trimester,4622.0,2.440502,1.116377,1.0,1.0,2.0,3.0,4.0
Price,4622.0,416.457594,442.872305,3.18,12.1,81.82,777.2075,2034.0


In [14]:
# This code is a concise way to encode categorical data like district names into numerical indices.

## Categorical values must be encoded to some constant numerical value 
## Simple technique is to use the map function 

## list(df["District"].unique()) -> gives the list of unique values 
## A list has index and element when enumerated. 
## The index itself is used as the numerical encoding. 

## Sorting helps it easier to find the code that corresponds to an element. 
## Consider using that for custom inputs 

# Encoding District 
df["District"] = df["District"].map({ele: index for index, ele in enumerate(list(df["District"].unique()))})

# Encoding Neighbourhood
df["Neighbourhood"] = df["Neighbourhood"].map({ele: index for index, ele in enumerate(list(df["Neighbourhood"].unique()))})

# Average _rent
df["Average _rent"] = df["Average _rent"].map({ele: index for index, ele in enumerate(list(df["Average _rent"].unique()))})

In [15]:
# Checking the dataframe after encoding 
df.head(30)

Unnamed: 0,Year,Trimester,District,Neighbourhood,Average _rent,Price
0,2014,1,0,0,0,589.55
1,2014,1,0,1,0,712.79
2,2014,1,0,2,0,540.71
3,2014,1,0,3,0,673.44
4,2014,1,1,4,0,736.09
5,2014,1,1,5,0,673.37
6,2014,1,1,6,0,921.4
7,2014,1,1,7,0,827.87
8,2014,1,1,8,0,716.13
9,2014,1,1,9,0,693.43


In [16]:
# Cross - verifying 
df.isnull().sum()

Year             0
Trimester        0
District         0
Neighbourhood    0
Average _rent    0
Price            0
dtype: int64

In [17]:
# Checking if the encoded values are numerical 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Year           4622 non-null   int64  
 1   Trimester      4622 non-null   int64  
 2   District       4622 non-null   int64  
 3   Neighbourhood  4622 non-null   int64  
 4   Average _rent  4622 non-null   int64  
 5   Price          4622 non-null   float64
dtypes: float64(1), int64(5)
memory usage: 216.8 KB


In [18]:
# Creating the train test split 
# Format: 80-20

# Importing the relevant library 
from sklearn.model_selection import train_test_split

# Features (x) and Label (y)
x = df.drop(columns = "Price", axis = 1)
y = df["Price"]

# The split (train - test )
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Evaluation Metrics - Preprepared

In [19]:
# Create evaluation function (the competition uses Root Mean Square Log Error)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Create function to evaluate our model
def show_scores(model):
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Valid MAE": mean_absolute_error(y_test, test_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Valid RMSLE": rmsle(y_test, test_preds),
              "Training R^2": model.score(X_train, y_train),
              "Valid R^2": model.score(X_test, y_test)}
    return scores

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
model = RandomForestRegressor(
                    n_jobs=-1,
                    n_estimators = 500, 
                    criterion = "absolute_error", 
                    max_samples = 3000
                    )
model.fit(X_train, y_train)

In [22]:
show_scores(model)

{'Training MAE': 9.780971279415793,
 'Valid MAE': 20.071147664864863,
 'Training RMSLE': 0.03798421719207469,
 'Valid RMSLE': 0.07396951848160971,
 'Training R^2': 0.9974658056064752,
 'Valid R^2': 0.9881097321935851}

In [23]:
model = RandomForestRegressor(
                    n_jobs=-1,
                    n_estimators = 500, 
                    criterion = "squared_error", # Is the best for the current scenario
                    max_samples = 3000
                    )
model.fit(X_train, y_train)

In [24]:
show_scores(model)

{'Training MAE': 8.868649277792853,
 'Valid MAE': 19.22173089729729,
 'Training RMSLE': 0.03388623572746188,
 'Valid RMSLE': 0.07082079851521625,
 'Training R^2': 0.9978442330325749,
 'Valid R^2': 0.988565002582616}


# ML Model - Randomized Search CV

In [25]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = {"max_samples": [3000]}

rs_model = RandomizedSearchCV(RandomForestRegressor(),
                              param_distributions=rf_grid,
                              n_iter=1000)

rs_model.fit(X_train, y_train)



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 403, in fit
    n_samples_bootstrap = _get_n_samples_bootstrap(
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 116, in _get_n_samples_bootstrap
    raise ValueError(msg.format(n_samples, max_samples))
ValueError: `max_samples` must be <= n_samples=2957 but got value 3000

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 403, in fit
    n_samples_bootstrap = _get_n_samples_bootstrap(
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 116, in _get_n_samples_bootstrap
    raise ValueError(msg.format(n_samples, max_samples))
ValueError: `max_samples` must be <= n_samples=2958 but got value 3000


In [26]:
show_scores(rs_model)

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.