In [1]:
# import dependencies
import pandas as pd
import numpy as np  

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

### **Import the CSVs and build one feature matrix per target**

In [2]:
# Filepaths of the two processed feature matrices (relative to this notebook)
price_filepath = "../../../data/processed/listings_feature_matrix_scaled_price.csv"
occupancy_filepath = "../../../data/processed/listings_feature_matrix_scaled_occupancy.csv"

# Read each CSV exactly once
price_csv = pd.read_csv(price_filepath)
occupancy_csv = pd.read_csv(occupancy_filepath)

# Build one frame per target: drop the listing id and every *other* target column,
# so each frame holds the predictors plus exactly the column it will be trained on.
# NOTE: occupancy_df keeps `price` on purpose, so it is used as a predictor there.
price_df = price_csv.drop(columns=['id', 'estimated_occupancy_l365d', 'estimated_revenue_l365d']).copy()
revenue_df = price_csv.drop(columns=['id', 'price', 'estimated_occupancy_l365d']).copy()
occupancy_df = occupancy_csv.drop(columns=['id', 'estimated_revenue_l365d']).copy()

In [3]:
# Check if each feature matrix is correct: show every frame under its own title
for title, matrix in (
    ("Price Feature Matrix", price_df),
    ("Revenue Feature Matrix", revenue_df),
    ("Occupancy Feature Matrix", occupancy_df),
):
    print(title)
    display(matrix)


Price Feature Matrix


Unnamed: 0,latitude,longitude,accommodates,bedrooms,beds,bathrooms_count,price,number_of_reviews_ly,host_is_superhost,host_listings_count,...,amenities_Paid_parking_on_premises,amenities_Paid_street_parking_off_premises,amenities_Patio_or_balcony,amenities_Pets_allowed,amenities_Pool,amenities_Private_entrance,amenities_Security_camera,amenities_Single_level_home,amenities_Sound_system,amenities_Washer
0,-0.250004,0.024253,-0.161239,-0.492537,-0.698373,-0.474890,450.0,-0.688757,0,0.197793,...,0,0,0,0,0,0,0,0,0,1
1,-0.555148,-0.040161,-0.645561,-0.492537,-0.698373,-0.474890,78.0,-0.688757,1,0.290279,...,0,0,0,0,0,0,0,0,0,0
2,-0.694044,-0.057544,-0.161239,-0.492537,-0.698373,-0.474890,132.0,-0.575721,1,-0.357123,...,0,1,0,1,0,1,0,0,1,1
3,-1.429381,-1.950083,-0.645561,-0.492537,0.178501,-0.474890,81.0,-0.123575,0,-0.357123,...,0,0,0,0,0,1,0,1,0,0
4,1.049792,-1.142835,1.291729,0.618537,1.055375,2.081840,236.0,0.045979,0,-0.357123,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9589,-0.754792,0.053015,0.323084,-0.492537,0.178501,-0.474890,170.0,-0.801794,1,-0.357123,...,0,0,1,0,0,1,0,1,0,1
9590,-0.678759,0.113648,1.291729,0.618537,1.055375,-0.474890,145.0,-0.801794,0,-0.357123,...,0,0,1,1,0,1,0,0,0,1
9591,-0.603958,0.002416,-0.645561,-0.492537,-0.698373,-0.474890,114.0,-0.801794,1,0.012821,...,0,0,0,0,0,0,1,0,0,1
9592,0.435821,1.701619,-0.645561,-0.492537,-0.698373,1.229597,55.0,-0.801794,0,-0.264637,...,0,0,0,0,0,0,1,0,0,0


Revenue Feature Matrix


Unnamed: 0,latitude,longitude,accommodates,bedrooms,beds,bathrooms_count,number_of_reviews_ly,host_is_superhost,host_listings_count,estimated_revenue_l365d,...,amenities_Paid_parking_on_premises,amenities_Paid_street_parking_off_premises,amenities_Patio_or_balcony,amenities_Pets_allowed,amenities_Pool,amenities_Private_entrance,amenities_Security_camera,amenities_Single_level_home,amenities_Sound_system,amenities_Washer
0,-0.250004,0.024253,-0.161239,-0.492537,-0.698373,-0.474890,-0.688757,0,0.197793,25200.0,...,0,0,0,0,0,0,0,0,0,1
1,-0.555148,-0.040161,-0.645561,-0.492537,-0.698373,-0.474890,-0.688757,1,0.290279,8736.0,...,0,0,0,0,0,0,0,0,0,0
2,-0.694044,-0.057544,-0.161239,-0.492537,-0.698373,-0.474890,-0.575721,1,-0.357123,29568.0,...,0,1,0,1,0,1,0,0,1,1
3,-1.429381,-1.950083,-0.645561,-0.492537,0.178501,-0.474890,-0.123575,0,-0.357123,4860.0,...,0,0,0,0,0,1,0,1,0,0
4,1.049792,-1.142835,1.291729,0.618537,1.055375,2.081840,0.045979,0,-0.357123,21240.0,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9589,-0.754792,0.053015,0.323084,-0.492537,0.178501,-0.474890,-0.801794,1,-0.357123,4080.0,...,0,0,1,0,0,1,0,1,0,1
9590,-0.678759,0.113648,1.291729,0.618537,1.055375,-0.474890,-0.801794,0,-0.357123,870.0,...,0,0,1,1,0,1,0,0,0,1
9591,-0.603958,0.002416,-0.645561,-0.492537,-0.698373,-0.474890,-0.801794,1,0.012821,684.0,...,0,0,0,0,0,0,1,0,0,1
9592,0.435821,1.701619,-0.645561,-0.492537,-0.698373,1.229597,-0.801794,0,-0.264637,330.0,...,0,0,0,0,0,0,1,0,0,0


Occupancy Feature Matrix


Unnamed: 0,latitude,longitude,accommodates,bedrooms,beds,bathrooms_count,price,number_of_reviews_ly,host_is_superhost,host_listings_count,...,amenities_Paid_parking_on_premises,amenities_Paid_street_parking_off_premises,amenities_Patio_or_balcony,amenities_Pets_allowed,amenities_Pool,amenities_Private_entrance,amenities_Security_camera,amenities_Single_level_home,amenities_Sound_system,amenities_Washer
0,-0.250004,0.024253,-0.161239,-0.492537,-0.698373,-0.474890,2.144775,-0.688757,0,0.197793,...,0,0,0,0,0,0,0,0,0,1
1,-0.555148,-0.040161,-0.645561,-0.492537,-0.698373,-0.474890,-0.625100,-0.688757,1,0.290279,...,0,0,0,0,0,0,0,0,0,0
2,-0.694044,-0.057544,-0.161239,-0.492537,-0.698373,-0.474890,-0.223022,-0.575721,1,-0.357123,...,0,1,0,1,0,1,0,0,1,1
3,-1.429381,-1.950083,-0.645561,-0.492537,0.178501,-0.474890,-0.602763,-0.123575,0,-0.357123,...,0,0,0,0,0,1,0,1,0,0
4,1.049792,-1.142835,1.291729,0.618537,1.055375,2.081840,0.551352,0.045979,0,-0.357123,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9589,-0.754792,0.053015,0.323084,-0.492537,0.178501,-0.474890,0.059923,-0.801794,1,-0.357123,...,0,0,1,0,0,1,0,1,0,1
9590,-0.678759,0.113648,1.291729,0.618537,1.055375,-0.474890,-0.126225,-0.801794,0,-0.357123,...,0,0,1,1,0,1,0,0,0,1
9591,-0.603958,0.002416,-0.645561,-0.492537,-0.698373,-0.474890,-0.357048,-0.801794,1,0.012821,...,0,0,0,0,0,0,1,0,0,1
9592,0.435821,1.701619,-0.645561,-0.492537,-0.698373,1.229597,-0.796356,-0.801794,0,-0.264637,...,0,0,0,0,0,0,1,0,0,0


### **Linear Exploration functions**

In [None]:
# Function to run model training
def train_linear_model(dataframe, target):
    """Fit a linear regression that predicts `target` and report hold-out scores.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Feature matrix that also contains the `target` column.
    target : str
        Name of the column to predict; every other column is used as a predictor.

    Returns
    -------
    tuple
        ``(model, X, r2, rmse)``: the fitted ``LinearRegression``, the full
        predictor frame (before splitting), and the R2 and RMSE computed on the
        held-out test partition.

    The R2 (as a percentage) and the RMSE are also printed, rounded to 2 decimals.
    """
    # Prepare X and y features
    predictors = dataframe.drop(columns=[target])
    response = dataframe[target]

    # Hold out a test partition; the fixed seed keeps the split identical across runs
    split = train_test_split(predictors, response, random_state=42)
    predictors_train, predictors_test, response_train, response_test = split

    # Fit on the training partition only
    model = LinearRegression()
    model.fit(predictors_train, response_train)

    # Score the model on rows it has not seen
    predicted = model.predict(predictors_test)
    r2 = r2_score(response_test, predicted)
    rmse = np.sqrt(mean_squared_error(response_test, predicted))

    # Report: R2 as a percentage, RMSE in the units of the target column
    r2_percent = round((r2 * 100), 2)
    rmse_rounded = round(rmse, 2)
    print(f"R2 score: {r2_percent}%")
    print(f"RMSE score: {rmse_rounded}")

    return model, predictors, r2, rmse

### **Train each model**

In [5]:
# Price model: keep the fitted estimator, its predictor frame and hold-out scores
price_model, price_X, price_r2, price_rmse = train_linear_model(
    dataframe=price_df,
    target='price',
)

R2 score: 53.69%
RMSE score: 84.49


In [6]:
# Revenue model: keep the fitted estimator, its predictor frame and hold-out scores
revenue_model, revenue_X, revenue_r2, revenue_rmse = train_linear_model(
    dataframe=revenue_df,
    target='estimated_revenue_l365d',
)

R2 score: 43.44%
RMSE score: 17776.82


In [7]:
# Occupancy model: keep the fitted estimator, its predictor frame and hold-out scores
occupancy_model, occupancy_X, occupancy_r2, occupancy_rmse = train_linear_model(
    dataframe=occupancy_df,
    target='estimated_occupancy_l365d',
)

R2 score: 45.17%
RMSE score: 62.1


### **Linear Feature Coefficients**

In [8]:
# Predictor names of the price model and the coefficient fitted for each of them
features = price_X.columns
coefficients = price_model.coef_

# Assemble the feature/coefficient table one column at a time
feature_importance_linear = pd.DataFrame({'feature': features})
feature_importance_linear['coefficient'] = coefficients

# Check Dataframe
feature_importance_linear

Unnamed: 0,feature,coefficient
0,latitude,-6.767994
1,longitude,-0.651849
2,accommodates,29.603249
3,bedrooms,15.966355
4,beds,7.476492
...,...,...
69,amenities_Private_entrance,-10.892330
70,amenities_Security_camera,0.171011
71,amenities_Single_level_home,-7.033487
72,amenities_Sound_system,22.524218


In [9]:
# Rank features by the magnitude of their coefficient (sign ignored).
# The sort is ascending, so the largest-magnitude coefficients end up at the bottom.
coefficient_magnitude = feature_importance_linear['coefficient'].abs()
feature_importance_linear['abs_coef'] = coefficient_magnitude
feature_importance_linear = feature_importance_linear.sort_values(by='abs_coef', ascending=True)

feature_importance_linear

Unnamed: 0,feature,coefficient,abs_coef
55,amenities_Host_greets_you,0.005171,0.005171
35,standardized_property_type_vacation,0.080051,0.080051
70,amenities_Security_camera,0.171011,0.171011
45,amenities_Cleaning_available_during_stay,0.205683,0.205683
73,amenities_Washer,-0.214414,0.214414
...,...,...,...
31,standardized_property_type_Other,37.670098,37.670098
33,standardized_property_type_budget,-43.094048,43.094048
24,neighbourhood_Waterfront Communities-The Island,43.403135,43.403135
13,neighbourhood_Islington-City Centre West,-44.184017,44.184017
