# Data Preparation and Cleansing
In this section we build a function that prepares and cleans the dataset so that it can be used to train models that predict the price of bookings in Seattle.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
############## Model1 ##############

def clean_data(df_1):
    '''
    Build the feature matrix and the price response from a raw listings dataframe.

    INPUT
    df_1 - pandas dataframe with a 'price' column that holds either numbers or
           currency strings such as '$1,000.00'. The dataframe is not modified.

    OUTPUT
    X - dataframe of predictors: every remaining column except 'price', with
        missing numeric values replaced by the column mean and every
        object-dtype column replaced by dummy columns
    y - the price column as floats, row-aligned with X

    This function cleans a copy of df_1 using the following steps:
    1. Convert the price column to float (strip '$' and ',' from strings)
    2. Drop all the rows with no price
    3. Create y as the price column
    4. Drop the empty / identifier columns that are present
    5. For each numeric column, fill missing values with the mean of the column
    6. Create dummy columns for all the object-dtype columns, drop the originals
    7. Create X as all the columns that are not the price column
    '''
    # Work on a copy so the caller's dataframe keeps its raw values.
    df = df_1.copy()

    # Convert price into numerical values. regex=True is passed explicitly:
    # pandas 2.0 made literal matching the default for Series.str.replace,
    # which would leave the '$' and ',' characters in place.
    price = df["price"]
    if not pd.api.types.is_numeric_dtype(price):
        price = price.str.replace('[$,,,]', "", regex=True)
    df["price"] = price.astype(float)

    # A row without a price has no response value to learn from.
    df = df.dropna(subset=["price"])

    # Assign the price column to y
    y = df["price"]

    # Drop the 100% null and non relevant columns (ignore any that are absent)
    unused_columns = ['monthly_price', 'security_deposit', 'license', 'square_feet',
                      'reviews_per_month', 'host_id', 'id', 'scrape_id']
    df = df.drop(columns=unused_columns, errors='ignore')

    # Fill numeric columns with the mean; assign back instead of using
    # inplace=True on a column selection, which is chained assignment.
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        df[col] = df[col].fillna(df[col].mean())

    # Dummy the categorical variables in one pass; the encoded columns are
    # appended after the untouched ones, in the order of cat_vars.
    cat_vars = list(df.select_dtypes(include=['object']).columns)
    if cat_vars:
        df = pd.get_dummies(df, columns=cat_vars, prefix_sep='_', drop_first=True)

    # Built after the loop so X exists even when there is nothing to encode.
    X = df.drop(columns='price')

    return X, y
    
# Load the raw listings file, then derive the feature matrix X and the
# price vector y with clean_data.
df_1 = pd.read_csv('listings.csv', thousands=',')
X, y = clean_data(df_1)

  df_1["price"] = df_1["price"].str.replace('[$,,,]',"").astype(float)


# Predictive Models
In this section we build four models to predict the price of bookings in Seattle, using two main techniques:
1. Model 1-3: Linear Regression with different preparation & cleansing techniques
2. Model 4: Random Forest Regressor 

In [8]:
# Model 1: split the dataset into train and test sets and fit a linear model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# Instantiate. LinearRegression(normalize=True) was removed in scikit-learn 1.2;
# scaling inside a pipeline is the documented replacement and also works on
# older releases.
lm_model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

# Fit on the training split only
lm_model.fit(X_train, y_train)

# Predict on the held-out split
y_test_preds = lm_model.predict(X_test)

# R^2 on the test split
score = r2_score(y_test, y_test_preds)

print(score)

0.329931939780963


In [10]:
############## Model2 ##############

# Reload the raw listings so this cell does not depend on earlier cleaning
df_1 = pd.read_csv('listings.csv', thousands=',')

# Strip '$' and ',' from the price strings. regex=True is explicit because
# pandas 2.0 made literal matching the default for Series.str.replace.
df_1["price"] = df_1["price"].str.replace('[$,,,]', "", regex=True).astype(float)

# Fill missing numeric values with the column mean; assign back rather than
# calling fillna(inplace=True) on a column selection (chained assignment).
num_vars = df_1.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df_1[col] = df_1[col].fillna(df_1[col].mean())

# Model 2 uses only these five numeric columns as predictors
X = df_1[['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included']]
y = df_1['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# Instantiate. LinearRegression(normalize=True) was removed in scikit-learn 1.2;
# scaling inside a pipeline is the documented replacement.
lm_model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

# Fit on the training split only
lm_model.fit(X_train, y_train)

# Predict and score on the held-out split
y_test_preds = lm_model.predict(X_test)

score = r2_score(y_test, y_test_preds)
print(score)

0.49992556417644574


  df_1["price"] = df_1["price"].str.replace('[$,,,]',"").astype(float)


In [25]:
############## Model3 ##############

# Reload the raw listings so this cell does not depend on earlier cleaning
df_1 = pd.read_csv('listings.csv', thousands=',')

# Strip '$' and ',' from the price strings. regex=True is explicit because
# pandas 2.0 made literal matching the default for Series.str.replace.
df_1["price"] = df_1["price"].str.replace('[$,,,]', "", regex=True).astype(float)

# Fill missing numeric values with the column mean; assign back rather than
# calling fillna(inplace=True) on a column selection (chained assignment).
num_vars = df_1.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df_1[col] = df_1[col].fillna(df_1[col].mean())

df_num = df_1[num_vars]

# Choose certain categorical columns relevant to the analysis, keeping them in
# the order in which they appear in the source file
chosen_cat_vars = ['neighbourhood_group_cleansed', 'cancellation_policy', 'instant_bookable', 'host_is_superhost']
df_cat = df_1[[col for col in df_1.columns if col in chosen_cat_vars]]

# Create a new dataset with the numerical columns and the chosen categorical
# columns, then drop some other columns not relevant to the model
df_pre = pd.concat([df_num, df_cat], axis=1)
df_pre = df_pre.drop(columns=['license', 'longitude', 'latitude'])

# Replace each remaining object column with dummy columns in one pass; the
# encoded columns are appended after the untouched ones.
cat_vars = list(df_pre.select_dtypes(include=['object']).columns)
if cat_vars:
    df_pre = pd.get_dummies(df_pre, columns=cat_vars, prefix_sep='_', drop_first=True)

y = df_pre['price']
X = df_pre.drop(columns='price')



  df_1["price"] = df_1["price"].str.replace('[$,,,]',"").astype(float)


In [21]:
# Linear regression on the Model 3 features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# Instantiate. LinearRegression(normalize=True) was removed in scikit-learn 1.2;
# scaling inside a pipeline is the documented replacement.
lm_model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

# Fit on the training split only
lm_model.fit(X_train, y_train)

# Predict and score on the held-out split
y_test_preds = lm_model.predict(X_test)

score = r2_score(y_test, y_test_preds)
print(score)

0.5293230147773866


In [26]:
############## Model4 ##############

# RandomForestRegressor and GridSearchCV come from the import cell at the top
# of the notebook.

# NOTE(review): this reload is not used below; X and y are whatever the
# previous preparation cell left in the kernel - confirm that is intended.
df_1 = pd.read_csv('listings.csv', thousands=',')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

# We will use grid search and random forest to get the hyper-parameters that
# give the best cross-validated score.
# max_features=1.0 (use every feature) is what 'auto' meant for a regressor;
# the 'auto' spelling was removed in scikit-learn 1.3.
tuned_parameters = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, None],
    'max_features': [1.0, 'log2'],
    'n_estimators': [10, 50, 100, 150, 200],
}

rfr = RandomForestRegressor(random_state=1)

g_search = GridSearchCV(estimator=rfr, param_grid=tuned_parameters, cv=3, n_jobs=1, verbose=0, return_train_score=True)

# Fit every parameter combination on the training split only
g_search.fit(X_train, y_train);
print(g_search.best_params_)


{'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 150}


In [27]:
# We will now use the best generated params. They are read from the grid
# search instead of being copied by hand, and random_state matches the one the
# search used so the scores below are reproducible.
regressor = RandomForestRegressor(random_state=1, **g_search.best_params_)
regressor.fit(X_train, y_train)

result_train = regressor.predict(X_train)
result_test = regressor.predict(X_test)

# A large gap between train and test R^2 indicates over-fitting
test_score = r2_score(y_test, result_test)
train_score = r2_score(y_train, result_train)

print(f'R2 test score: {test_score}')
print(f'R2 train score: {train_score}')

R2 test score: 0.5713209120246411
R2 train score: 0.7565394939188758


# Conclusion
The Random Forest Regressor achieves the highest test R² of the four models (about 0.57, compared with about 0.33–0.53 for the linear regressions), so it is the best of the models tried here for predicting the price of bookings.