# MODULES AND CONSTANTS 

### MODULES, CONSTANT AND IMPORTS

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### CONSTANTS

In [4]:
INPUT_TRAINING_DATA = './NYC_Airbnb/Data/development.csv'
INPUT_TESTING_DATA = './NYC_Airbnb/Data/evaluation.csv'

# READ INPUT DATA

Reads the input data, which is organized as follows.
Each file has an initial header line containing the names of the attributes at your disposal:
- <b>id</b>: a unique identifier of the listing
- <b>name</b>
- <b>host_id</b>: a unique identifier of the host
- <b>host_name</b> 
- <b>neighbourhood_group</b>: neighborhood location in the city
- <b>neighbourhood</b>: name of the neighborhood
- <b>latitude</b>: coordinate expressed as floating point number
- <b>longitude</b>: coordinate expressed as floating point number
- <b>room_type</b>
- <b>price</b>: price per night expressed in dollars
- <b>minimum_nights</b>: minimum nights requested by the host
- <b>number_of_reviews</b>
- <b>last_review</b>: date of the last review expressed as YYYY-MM-DD
- <b>reviews_per_month</b>: average number of reviews per month
- <b>calculated_host_listings_count</b>: number of listings of the host
- <b>availability_365</b>: number of days when the listing is available for booking

In [5]:
def readData(path:str)->pd.DataFrame:
    """Load a CSV file into a DataFrame.

    The first line of the file is used as the header and the first column
    becomes the index of the returned frame.
    """
    frame = pd.read_csv(path, index_col=0, header=0)
    return frame

# FILLS NA

Fills the missing (`np.nan`) values found in the data (counts per column below); `last_review` is dropped rather than filled:
<br>name                   13
<br>host_name              19
<br>last_review          8041
<br>reviews_per_month    8041

In [6]:
def replaceNa(df:pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``df`` with missing values handled.

    - ``name`` and ``host_name``: missing values become ``'Anonimum'``.
    - ``reviews_per_month``: missing values become ``0``.
    - ``last_review``: the column is dropped.

    The input frame is left untouched; all changes are made on the returned
    frame. A ``KeyError`` is raised if any of the four columns is absent.
    """
    fillers = {'name': 'Anonimum', 'host_name': 'Anonimum', 'reviews_per_month': 0}
    # ``drop`` hands back a new frame, so the assignments below never reach
    # the caller's DataFrame.
    cleaned = df.drop(columns=['last_review'])
    for column, filler in fillers.items():
        cleaned[column] = cleaned[column].fillna(filler)
    return cleaned

# APPLY ONE HOT ENCODING

In [7]:
def applyOneHot(df:pd.DataFrame, encoded:list[str]=None, dropNeigh: bool=True)->pd.DataFrame:
    """Return a copy of ``df`` with the given columns one-hot encoded.

    For every column in ``encoded`` one 0/1 indicator column is appended per
    distinct value (named after that value), then the source column is
    removed. Indicator columns are created in order of first appearance of
    each value, so the resulting column order is the same on every run.

    Parameters:
        df: input frame; it is not modified.
        encoded: names of the columns to encode. ``None`` (or an empty list)
            selects ``['room_type', 'neighbourhood_group']``.
        dropNeigh: when true, the ``neighbourhood`` column is dropped from
            the result.

    Raises:
        KeyError: if a column to encode (or ``neighbourhood`` when
            ``dropNeigh`` is true) is missing.
    """
    # Work on a copy so the caller's frame never gains indicator columns.
    result = df.copy()
    for column in encoded or ['room_type', 'neighbourhood_group']:
        values = result[column]
        # ``unique`` keeps first-appearance order, unlike ``set`` whose
        # iteration order for strings changes between interpreter runs.
        for label in values.unique():
            result[label] = (values == label).astype('int64')
        result = result.drop(columns=[column])

    return result.drop(columns=['neighbourhood']) if dropNeigh else result

# FILTER DATA

Filters the data by removing the outliers:
- price higher than $\mu + 1.8 \cdot \sigma$
- price lower than 20 $

In [8]:
def filterData(df:pd.DataFrame, minPrice:float=20, nSigma:float=1.8) -> pd.DataFrame:
    """Keep only the rows whose ``price`` lies inside the accepted range.

    A row is kept when ``minPrice <= price <= mean + nSigma * std``, where
    the mean and the (sample) standard deviation are computed over the
    ``price`` column of ``df`` itself. Both bounds are inclusive.

    Parameters:
        df: frame containing a ``price`` column.
        minPrice: lowest accepted price (default 20).
        nSigma: number of standard deviations above the mean that is still
            accepted (default 1.8).

    Returns:
        The filtered rows of ``df``. Note that with a single row the
        standard deviation is NaN, so no row passes the upper bound.
    """
    price = df['price']
    upperBound = price.mean() + nSigma * price.std()
    return df[(price >= minPrice) & (price <= upperBound)]

# MAIN FUNCTION

In [11]:
def main()->pd.DataFrame:
    """Build the training frame: read, fill NAs, one-hot encode, filter.

    The ``neighbourhood`` column is kept (``dropNeigh=False``).
    """
    raw = readData(INPUT_TRAINING_DATA)
    filled = replaceNa(raw)
    oneHot = applyOneHot(filled, dropNeigh=False)
    return filterData(oneHot)
    
# Run the pipeline and discard the two free-text columns.
df = main().drop(columns=['name', 'host_name'])


# import seaborn as sns
# sns.pairplot(df)  # Pairwise scatter plots for all features

In [None]:
# Bare expression: shows the prepared frame as the notebook cell output.
df
# Feature matrix: every column except the target ``price``.
dfNoPrice = df.drop(labels=['price'], axis=1)


array([False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [17]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Hold out 20% of the rows for evaluation.
xtrain, xtest, ytrain, ytest = train_test_split(
    dfNoPrice,
    df['price'],
    test_size=0.20,
)

# Boolean mask flagging the object-dtype columns as categorical features.
isCategorical = (dfNoPrice.dtypes == 'object').values
regressor = HistGradientBoostingRegressor(categorical_features=isCategorical)
regressor = regressor.fit(xtrain, ytrain)
predictions = regressor.predict(xtest)

# Bare expression: the MSE on the held-out rows is the cell output.
mean_squared_error(ytest, predictions)



np.float64(3938.141571737563)