 ## **Effects of Policy on the Housing Market** 

In [1]:
# !python3 --version
# !pip3 install category-encoders

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("effects-of-policy-on-the-housing-market/train.csv")
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,monthly_revenue
0,0,879,19792418,Home in Vancouver · ★4.75 · 1 bedroom · 1 bed ...,Everything you need is nearby. <br /><br />Hig...,57488206,Jessi,,,,...,4.8,4.82,4.9,4.87,4.69,4.81,f,3,0.77,2108
1,1,6416,1015650685503221866,Guest suite in Vancouver · ★New · 2 bedrooms ·...,,139792573,Daniel,within a few hours,100%,100%,...,,,,,,,f,1,,2730
2,2,1820,35265562,Guest suite in Vancouver · ★4.85 · 2 bedrooms ...,Beautiful neighbourhood close to prosperous Ma...,265504225,Alex,within an hour,100%,98%,...,4.9,4.78,4.97,4.94,4.9,4.75,f,1,3.22,2254
3,3,5346,911948980885194155,Home in Vancouver · ★5.0 · 1 bedroom · 1 bed ·...,We are located in a quiet residential neighbor...,22595056,Raymond,,,92%,...,5.0,5.0,5.0,5.0,4.86,5.0,f,1,1.28,3187
4,4,2484,46069251,Guest suite in Vancouver · ★4.93 · 1 bedroom ·...,Kitsilano at it's best! Short walk to all the ...,65683877,Yendi,within an hour,100%,95%,...,4.93,4.89,4.97,4.97,4.96,4.85,f,1,2.01,3479


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5352 entries, 0 to 5351
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0.1                    5352 non-null   int64  
 1   Unnamed: 0                      5352 non-null   int64  
 2   id                              5352 non-null   int64  
 3   name                            5352 non-null   object 
 4   neighborhood_overview           3476 non-null   object 
 5   host_id                         5352 non-null   int64  
 6   host_name                       5352 non-null   object 
 7   host_response_time              4263 non-null   object 
 8   host_response_rate              4263 non-null   object 
 9   host_acceptance_rate            4620 non-null   object 
 10  host_is_superhost               5315 non-null   object 
 11  host_listings_count             5352 non-null   int64  
 12  host_total_listings_count       53

### Getting Feature Importance

In [5]:
# Extract the first 20 columns and the 43rd column (index 42) from the DataFrame
df_first_selected = df.iloc[:, list(range(20))].copy()

# Display the result
df_first_selected.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates
0,0,879,19792418,Home in Vancouver · ★4.75 · 1 bedroom · 1 bed ...,Everything you need is nearby. <br /><br />Hig...,57488206,Jessi,,,,f,3,3,"Vancouver, British Columbia, Canada",Hastings-Sunrise,49.28357,-123.05649,Entire home,Entire home/apt,2
1,1,6416,1015650685503221866,Guest suite in Vancouver · ★New · 2 bedrooms ·...,,139792573,Daniel,within a few hours,100%,100%,f,1,4,,Sunset,49.22112,-123.09232,Entire guest suite,Entire home/apt,5
2,2,1820,35265562,Guest suite in Vancouver · ★4.85 · 2 bedrooms ...,Beautiful neighbourhood close to prosperous Ma...,265504225,Alex,within an hour,100%,98%,t,1,1,"Vancouver, British Columbia, Canada",Riley Park,49.25214,-123.09817,Entire guest suite,Entire home/apt,6
3,3,5346,911948980885194155,Home in Vancouver · ★5.0 · 1 bedroom · 1 bed ·...,We are located in a quiet residential neighbor...,22595056,Raymond,,,92%,t,1,1,"Vancouver, British Columbia, Canada",Sunset,49.221709,-123.097762,Entire home,Entire home/apt,2
4,4,2484,46069251,Guest suite in Vancouver · ★4.93 · 1 bedroom ·...,Kitsilano at it's best! Short walk to all the ...,65683877,Yendi,within an hour,100%,95%,t,2,3,"Vancouver, British Columbia, Canada",Kitsilano,49.25898,-123.16179,Entire guest suite,Entire home/apt,4


In [6]:

# Extract the first 20 columns and the 43rd column (index 42) from the DataFrame
df_second_selected = df.iloc[:, 22:44].copy()

# Display the result
df_second_selected.head()

Unnamed: 0,price,minimum_nights,maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,number_of_reviews,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,monthly_revenue
0,,3,30,2.7,30.0,0,0,0,0,60,...,4.8,4.82,4.9,4.87,4.69,4.81,f,3,0.77,2108
1,$132.00,1,365,1.0,365.0,27,57,86,86,0,...,,,,,,,f,1,,2730
2,$119.00,1,30,1.0,30.0,0,0,0,9,176,...,4.9,4.78,4.97,4.94,4.9,4.75,f,1,3.22,2254
3,$263.00,2,7,2.0,7.0,30,60,90,179,7,...,5.0,5.0,5.0,5.0,4.86,5.0,f,1,1.28,3187
4,$112.00,2,180,2.4,1125.0,15,32,45,104,73,...,4.93,4.89,4.97,4.97,4.96,4.85,f,1,2.01,3479


In [7]:
df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'id', 'host_id', 'host_name', 'name'], inplace=True)

In [8]:
X_train = df.drop(columns=["monthly_revenue"])
y_train = df["monthly_revenue"]

In [9]:
# Mapping response time
response_time_mapping = {
    "within an hour": 4,
    "within a few hours": 3,
    "within a day": 2,
    "a few days or more": 1,
    None: 0  # For missing values
}

df['host_response_time'] = df['host_response_time'].map(response_time_mapping)
df['host_response_rate'] = df['host_response_rate'].str.rstrip('%').astype(float) / 100
df['host_acceptance_rate'] = df['host_acceptance_rate'].str.rstrip('%').astype(float) / 100
df['price'] = df['price'].str.replace(',', '').str.lstrip('$').astype(float)

df.head()

Unnamed: 0,neighborhood_overview,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood,neighbourhood_cleansed,latitude,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,monthly_revenue
0,Everything you need is nearby. <br /><br />Hig...,,,,f,3,3,"Vancouver, British Columbia, Canada",Hastings-Sunrise,49.28357,...,4.8,4.82,4.9,4.87,4.69,4.81,f,3,0.77,2108
1,,3.0,1.0,1.0,f,1,4,,Sunset,49.22112,...,,,,,,,f,1,,2730
2,Beautiful neighbourhood close to prosperous Ma...,4.0,1.0,0.98,t,1,1,"Vancouver, British Columbia, Canada",Riley Park,49.25214,...,4.9,4.78,4.97,4.94,4.9,4.75,f,1,3.22,2254
3,We are located in a quiet residential neighbor...,,,0.92,t,1,1,"Vancouver, British Columbia, Canada",Sunset,49.221709,...,5.0,5.0,5.0,5.0,4.86,5.0,f,1,1.28,3187
4,Kitsilano at it's best! Short walk to all the ...,4.0,1.0,0.95,t,2,3,"Vancouver, British Columbia, Canada",Kitsilano,49.25898,...,4.93,4.89,4.97,4.97,4.96,4.85,f,1,2.01,3479


In [10]:
numerical_cols = ['minimum_nights', 'maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30', 
                  'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 
                  'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 
                  'review_scores_communication', 'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 
                  'reviews_per_month']

categorical_cols = ['host_response_time', 'neighbourhood', 'neighbourhood_cleansed', 'property_type', 'room_type']
boolean_cols = ['host_is_superhost', 'instant_bookable']


In [22]:
for col in boolean_cols:
    print(col.dtype)

AttributeError: 'str' object has no attribute 'dtype'

In [11]:
# checking the number of unique values in each categorical column
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()} unique values")


host_response_time: 4 unique values
neighbourhood: 7 unique values
neighbourhood_cleansed: 23 unique values
property_type: 46 unique values
room_type: 4 unique values


In [12]:
X_train = df[numerical_cols + categorical_cols + boolean_cols].copy()

# Verify the result
print("Columns in X_train:", X_train.columns)
print("No. of rows in X_train: ", len(X_train))
y_train = df['monthly_revenue']  
print("No. of rows in y_train: ",len(y_train))

Columns in X_train: Index(['minimum_nights', 'maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value',
       'calculated_host_listings_count', 'reviews_per_month',
       'host_response_time', 'neighbourhood', 'neighbourhood_cleansed',
       'property_type', 'room_type', 'host_is_superhost', 'instant_bookable'],
      dtype='object')
No. of rows in X_train:  5352
No. of rows in y_train:  5352


In [19]:
print(X_train.columns)
print(numerical_cols+categorical_cols+boolean_cols)

Index(['minimum_nights', 'maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value',
       'calculated_host_listings_count', 'reviews_per_month',
       'host_response_time', 'neighbourhood', 'neighbourhood_cleansed',
       'property_type', 'room_type', 'host_is_superhost', 'instant_bookable'],
      dtype='object')
['minimum_nights', 'maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'revi

In [13]:
# Create a custom transformer that wraps TargetEncoder to work within the scikit-learn pipeline structure
# receives x and y during fitting, and only transforms x afterward

class TargetEncoderWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        self.encoder = TargetEncoder(cols=cols)
    
    def fit(self, X, y):
        self.encoder.fit(X[self.cols], y)
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.cols] = self.encoder.transform(X[self.cols])
        return X_transformed

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')), 
            ('scaler', StandardScaler())  
        ]), numerical_cols),
        
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')), 
            ('target_encoder', TargetEncoderWrapper(cols=categorical_cols)) 
        ]), categorical_cols),
    
        ('bool', Pipeline([
            ('boolean_to_int', FunctionTransformer(lambda x: x.replace({'t': 1, 'f': 0}), validate=False))
        ]), boolean_cols)
    ]
)

In [15]:
# X_train_transformed = preprocessor.fit_transform(X_train, y_train)
# print(X_train_transformed)
# print(X_train_transformed.shape[1])

In [16]:
from sklearn.ensemble import RandomForestRegressor
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  
    ('model', RandomForestRegressor(random_state=42)) 
])

In [17]:
model_pipeline.fit(X_train, y_train)

print(len(X_train))  # Should return something like (num_samples, num_features)
print(y_train.shape)  # Should return something like (num_samples,)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
rf_model = model_pipeline.named_steps['model']

# Get the feature importances
feature_importance = rf_model.feature_importances_
print(len(X_train.columns))  # Should match the length of feature_importance
print(len(feature_importance))


# Create a DataFrame to display feature importance
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

test_df = pd.read_csv("effects-of-policy-on-the-housing-market/test.csv")

predictions = model_pipeline.predict(test_df)

# Create a new DataFrame with only 'id' and 'monthly_Revenue' prediction
result_df = pd.DataFrame({
    'id': test_df['id'],  # Assuming 'id' is a column in your test_df
    'monthly_Revenue': predictions
})

result_df.to_csv('monthly_revenue_predictions.csv', index=False)



# mse = mean_squared_error(y_val, y_pred)
# mae = mean_absolute_error(y_val, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f'Mean Squared Error (MSE): {mse}')
# print(f'Mean Absolute Error (MAE): {mae}')
# print(f'R² Score: {r2}')