In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [11]:
with open('C:/Users/areem/cleaned_reviews_w_lang.pkl', 'rb') as file:
    reviews = pickle.load(file)

In [12]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,cleaned_comments,language
0,1419,38924112,2015-07-19,11308465,Marcela,Having the opportunity of arriving to Alexandr...,Having the opportunity of arriving to Alexandr...,en
1,1419,44791978,2015-08-29,9580285,Marco,We have no enough words to describe how beauty...,We have no enough words to describe how beauty...,en
2,1419,45957133,2015-09-07,38394721,Andrea,The listing was exceptional and an even better...,The listing was exceptional and an even better...,en
3,1419,67295154,2016-03-28,3515044,Shaun,Alexandra's home was amazing and in such a nea...,Alexandra's home was amazing and in such a nea...,en
4,1419,177702208,2017-08-03,13987100,Kate,Beautiful home. Very comfortable and clean. Pe...,Beautiful home. Very comfortable and clean. Pe...,en


In [13]:
reviews.fillna('none', inplace=True)

In [14]:
import re
import nltk.data
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [15]:
stop_words = set(stopwords.words('english'))

def clean(reviews):
    letters_only=BeautifulSoup(review).get_text()
    letters_only = re.sub('[^a-zA-Z0-9]', ' ', letters_only)
    letters_only = letters_only.lower()
    words= letters_only.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

In [16]:
all_reviews = []
for review in tqdm(reviews['comments']):
    all_reviews.append(clean(review))

  letters_only=BeautifulSoup(review).get_text()
100%|██████████| 459355/459355 [02:28<00:00, 3086.29it/s]


In [17]:
all_reviews[5]

'alexandra incredible host stay beautiful home checked us offered help anyway could house described great neighbourhood everything easy access friends thoroughly enjoyed would recommend everyone say thank'

In [18]:
reviews['comments'][5]

'Alexandra was such an incredible host during our stay in her beautiful home. She checked up on us and offered to help in anyway she could. The house was just as described and in such a great neighbourhood, everything was very easy to access. My friends and I thoroughly enjoyed ourselves and would recommend it to everyone. All we can say is thank you!'

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [20]:
tfidfvectorizer = TfidfVectorizer(analyzer='word', max_features=500)
reviews_tfidf = tfidfvectorizer.fit_transform(all_reviews)
reviews_tfidf = reviews_tfidf.toarray()
reviews_tfidf = pd.DataFrame(data=reviews_tfidf, columns=['tfidf_' + str(i) for i in range(500)])
reviews_tfidf['listing_id'] = reviews['listing_id']

reviews_tfidf.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_491,tfidf_492,tfidf_493,tfidf_494,tfidf_495,tfidf_496,tfidf_497,tfidf_498,tfidf_499,listing_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1419.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1419.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144606,...,0.0,0.155968,0.0,0.131945,0.0,0.0,0.210229,0.0,0.088418,1419.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1419.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1419.0


In [21]:
# Group by listing and get average per listing

# Average the features for each listings
reviews_tfidf_avg= reviews_tfidf.groupby('listing_id').mean().reset_index()
reviews_tfidf_avg.head()

Unnamed: 0,listing_id,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,...,tfidf_490,tfidf_491,tfidf_492,tfidf_493,tfidf_494,tfidf_495,tfidf_496,tfidf_497,tfidf_498,tfidf_499
0,1419.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035742,0.0,...,0.0,0.0,0.025995,0.0,0.021991,0.0,0.0,0.035038,0.0,0.035336
1,8077.0,0.00666,0.0,0.00385,0.000734,0.0,0.008198,0.008867,0.01383,0.004177,...,0.0,0.005644,0.006118,0.004677,0.047658,0.000608,0.0,0.001355,0.005286,0.025924
2,26654.0,0.008015,0.0,0.004169,0.0,0.0,0.0,0.008113,0.006933,0.0,...,0.00641,0.0,0.003784,0.005866,0.030423,0.0,0.0,0.0,0.0,0.055716
3,27423.0,0.004981,0.010609,0.009874,0.010113,0.0,0.01213,0.002758,0.008087,0.009691,...,0.0,0.0,0.010411,0.010304,0.014296,0.0,0.003552,0.0,0.007691,0.031207
4,30931.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164793


In [22]:
vectorizer1 = CountVectorizer(analyzer='word', ngram_range=(1,2), max_features=500)
reviews_count1 = vectorizer1.fit_transform(all_reviews)
reviews_count1= reviews_count1.toarray()

In [23]:
reviews_count1 = pd.DataFrame(data=reviews_count1)

In [24]:
reviews_count1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
reviews_count1['listing_id'] = reviews['listing_id'].values

In [26]:
reviews_count1.columns=['count_' + str(i) for i in range(reviews_count1.shape[1]-1)]+['listing_id']

In [27]:
reviews_count1.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_491,count_492,count_493,count_494,count_495,count_496,count_497,count_498,count_499,listing_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1419
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1419
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1419
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1419
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1419


In [28]:
# Average the features per listing
reviews_count_avg = reviews_count1.groupby('listing_id').mean().reset_index()

reviews_count_avg.head()

Unnamed: 0,listing_id,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,...,count_490,count_491,count_492,count_493,count_494,count_495,count_496,count_497,count_498,count_499
0,1419,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,...,0.166667,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.166667,0.0
1,8077,0.017964,0.0,0.011976,0.005988,0.053892,0.053892,0.05988,0.005988,0.023952,...,0.275449,0.0,0.035928,0.005988,0.023952,0.197605,0.023952,0.023952,0.035928,0.011976
2,26654,0.02381,0.0,0.02381,0.0,0.02381,0.02381,0.047619,0.02381,0.095238,...,0.02381,0.02381,0.0,0.047619,0.02381,0.309524,0.095238,0.047619,0.0,0.0
3,27423,0.074074,0.037037,0.037037,0.0,0.074074,0.0,0.074074,0.111111,0.074074,...,0.148148,0.037037,0.0,0.111111,0.037037,0.555556,0.111111,0.074074,0.074074,0.0
4,30931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
print(all_reviews[1:6])

['enough words describe beautyful cozy alexandra house every detail tasteful functional kids stunned seen toys room tree house loved home thank', 'listing exceptional even better experience person house beautiful accommodating group anything could needed available experience working alexandra wonderful neighborhood easy get around ton great restaurants coffee shops stores within walking distance definitely one nicest houses ever chance stay anyone would lucky get experience great home great city', 'alexandra home amazing neat neighbourhood everything described great condition rooms great sizes third floor room sleeping nook outdoor deck fantastic anyone looking quiet neighbourhood short streetcar ride car trip away downtown core stay thanks alexandra', 'beautiful home comfortable clean perfect family families traveling together close amazing restaurents alexandra gave us complete list sorts useful information including nice running routes sporti ones highly highly recommend house kate'

Next, we will vectorize the reviews. We will create functions to convert reviews into their vector representations by averaging the vectors of the words they contain.

In [30]:
import time

Next, we will convert the review text to vectors

In [31]:
reviews_tfidf_avg.info()
reviews_tfidf_avg.to_csv('reviews_tfidf.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10769 entries, 0 to 10768
Columns: 501 entries, listing_id to tfidf_499
dtypes: float64(501)
memory usage: 41.2 MB


In [32]:
reviews_count_avg.info()
reviews_count_avg.to_csv('reviews_countvec.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14866 entries, 0 to 14865
Columns: 501 entries, listing_id to count_499
dtypes: float64(500), int64(1)
memory usage: 56.8 MB


Now we will create the predictive model using these features from reviews 

In [37]:
# Combine the review features into a single DataFrame using inner join to ensure only listings present in both datasets are kept
reviews_features_combined = reviews_tfidf_avg.merge(reviews_count_avg, on="listing_id", how="inner")
                                     

# Merge the combined features dataframe with the `review_scores_rating` column from the `listings` dataframe using inner join
reviews_data_for_modeling = reviews_features_combined.merge(listings[['listing_id', 'review_scores_rating']], on="listing_id", how="inner")

In [38]:
reviews_features_combined.info()
reviews_data_for_modeling.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10769 entries, 0 to 10768
Columns: 1001 entries, listing_id to count_499
dtypes: float64(1001)
memory usage: 82.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10367 entries, 0 to 10366
Columns: 1002 entries, listing_id to review_scores_rating
dtypes: float64(1002)
memory usage: 79.3 MB


In [39]:
# Prepare Data For Modeling

from sklearn.model_selection import train_test_split

# Define reviews features and target variable
X_reviews = reviews_data_for_modeling.drop(['listing_id', 'review_scores_rating'], axis=1)
y_reviews = reviews_data_for_modeling['review_scores_rating']

# Split the reviews data into training and testing sets
X_reviews_train, X_reviews_test, y_reviews_train, y_reviews_test = train_test_split(X_reviews, y_reviews, test_size=0.2, random_state=42)

# Capture feature names from the training data
feature_names_reviews = X_reviews_train.columns

In [41]:
# XGB Model Training
import xgboost as xgb
import time

# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)

# Start timing XGBoost training
start_xgb_train_time = time.time()

# Fit the XGB model to the reviews training data
xgb_model.fit(X_reviews_train, y_reviews_train)

# End timing XGBoost model
end_xgb_train_time = time.time()
xgb_training_time = end_xgb_train_time - start_xgb_train_time
print(f"XGBoost training time: {xgb_training_time: .2f} seconds")

# Start timing XGBoost prediction(testing)
start_xgb_test_time = time.time()

# Predict ratings on the test set
xgb_predictions_reviews = xgb_model.predict(X_reviews_test)

# End XGBoost prediction(testing)
end_xgb_test_time = time.time()
xgb_testing_time = end_xgb_test_time - start_xgb_test_time
print(f"XGBoost testing time: {xgb_testing_time:.2f} seconds")

XGBoost training time:  10.00 seconds
XGBoost testing time: 0.16 seconds


In [42]:
# XGB Model Evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Calculate the performance metrics
mse_xgb_reviews = mean_squared_error(y_reviews_test, xgb_predictions_reviews)
r2_xgb_reviews = r2_score(y_reviews_test, xgb_predictions_reviews)
rmse_xgb_reviews = np.sqrt(mse_xgb_reviews)

print(f"XGB Mean Squared Error (MSE): {mse_xgb_reviews}")
print(f"XGB R-squared (R2): {r2_xgb_reviews}")
print(f"XGB Test RMSE: {rmse_xgb_reviews:.2f}")

XGB Mean Squared Error (MSE): 0.08877656203231299
XGB R-squared (R2): 0.22162347143626204
XGB Test RMSE: 0.30


In [43]:
#  xgb_model is the trained XGBoost model
xgb_feature_importances_reviews = xgb_model.feature_importances_

# Create a pandas Series to hold the feature importances with the feature names as the index
xgb_importances_reviews = pd.Series(xgb_feature_importances_reviews, index=feature_names_reviews)

# print the sorted xgb features based on importance
print(xgb_importances_reviews.sort_values(ascending=False))

count_213    0.020698
count_13     0.018728
count_112    0.015958
count_113    0.012877
count_61     0.011612
               ...   
count_103    0.000000
count_104    0.000000
count_105    0.000000
tfidf_297    0.000000
count_0      0.000000
Length: 1000, dtype: float32


Now I will build a Random Forest Regressor

In [46]:
# Random Forest Model

from sklearn.ensemble import RandomForestRegressor

# Random Forest Model Training

# Initialize Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Start training Random Forest training
start_rf_train_time = time.time()

# Fir the Random Forest model to the reviews training data
rf_model.fit(X_reviews_train, y_reviews_train)

# end timing Random Forest training
end_rf_train_time = time.time()
rf_training_time = end_rf_train_time - start_rf_train_time
print(f"Random Forest training time {rf_training_time: .2f} seconds")

# Random Forest Model Testing

# Start timing Random Forest Testing
start_rf_test_time = time.time()

# predict ratings on the test set
rf_predictions_reviews = rf_model.predict(X_reviews_test)

# End timing Random Forest testing
end_rf_test_time=time.time()
rf_testing_time = end_rf_test_time - start_rf_test_time
print(f"Random Forest testing time: {rf_testing_time: .2f} seconds")

Random Forest training time  1169.86 seconds
Random Forest testing time:  0.19 seconds


In [48]:
# Random Forest Model Evaluation
mse_rf_reviews = mean_squared_error(y_reviews_test, rf_predictions_reviews)
r2_rf_reviews = r2_score(y_reviews_test, rf_predictions_reviews)
rmse_rf_reviews = np.sqrt(mse_rf_reviews)

print(f"Random Forest Mean Squared Error (MSE): {mse_rf_reviews}")
print(f"Random Firest R-squared (R2): {r2_rf_reviews}")
print(f"Random Forest Test RMSE: {rmse_rf_reviews:.2f}")

Random Forest Mean Squared Error (MSE): 0.09439652561716488
Random Firest R-squared (R2): 0.17234866685169814
Random Forest Test RMSE: 0.31


In [49]:
# Analyzing Random Forest feature Importances
rf_feature_importances_reviews = rf_model.feature_importances_
rf_importances_reviews = pd.Series(rf_feature_importances_reviews, index=feature_names_reviews).sort_values(ascending=False)

print(rf_importances_reviews)

count_213    0.053668
count_13     0.027943
count_61     0.023659
count_226    0.021601
tfidf_192    0.020579
               ...   
count_90     0.000025
count_331    0.000025
tfidf_391    0.000023
tfidf_322    0.000021
count_91     0.000020
Length: 1000, dtype: float64


Now I will use an lgb model

In [51]:
!pip install lightgbm
import lightgbm as lgb

# Initialize the LGBM regressor
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)

# Start timing LGBM training
start_lgb_train_time = time.time()

# Fit the LGBM model to the reviews training data
lgb_model.fit(X_reviews_train, y_reviews_train)

# End timing LGBM training
end_lgb_train_time = time.time()
lgb_training_time = end_lgb_train_time - start_lgb_train_time
print(f"LGBM training time: {lgb_training_time:.2f} seconds")

# Start timing LGBM prediction (testing)
start_lgb_test_time = time.time()

# Predict ratings on the test set
lgb_predictions_reviews = lgb_model.predict(X_reviews_test)

# End timing LGBM prediction (testing)
end_lgb_test_time = time.time()
lgb_testing_time = end_lgb_test_time - start_lgb_test_time
print(f"LGBM testing time: {lgb_testing_time:.2f} seconds")

Collecting lightgbm
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
    --------------------------------------- 0.0/1.3 MB 262.6 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.3 MB 525.1 kB/s eta 0:00:03
   ------ --------------------------------- 0.2/1.3 MB 1.1 MB/s eta 0:00:02
   --------------- ------------------------ 0.5/1.3 MB 2.0 MB/s eta 0:00:01
   --------------------------------- ------ 1.1/1.3 MB 3.6 MB/s eta 0:00:01
   ---------------------------------------  1.3/1.3 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 1.3/1.3 MB 3.7 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0
[LightGBM] [Info] Auto-choosing

In [52]:
# LGBM Model Evaluation
mse_lgb_reviews = mean_squared_error(y_reviews_test, lgb_predictions_reviews)
r2_lgb_reviews = r2_score(y_reviews_test, lgb_predictions_reviews)
rmse_lgb_reviews = np.sqrt(mse_lgb_reviews)

print(f"LGBM Mean Squared Error (MSE): {mse_lgb_reviews}")
print(f"LGBM R-squared (R2): {r2_lgb_reviews}")
print(f"LGBM Test RMSE: {rmse_lgb_reviews:.2f}")

LGBM Mean Squared Error (MSE): 0.08528812688050662
LGBM R-squared (R2): 0.25220942770019616
LGBM Test RMSE: 0.29


In [53]:
# Analyzing LGBM Feature Importances
lgb_feature_importances_reviews = lgb_model.feature_importances_
lgb_importances_reviews = pd.Series(lgb_feature_importances_reviews, index=feature_names_reviews).sort_values(ascending=False)

print(lgb_importances_reviews)

count_190    64
count_226    56
count_144    42
count_402    35
count_362    33
             ..
tfidf_357     0
tfidf_358     0
tfidf_359     0
tfidf_360     0
count_499     0
Length: 1000, dtype: int32


Now we will prepare the listings dataset (feature extraction, encoding etc)

In [54]:
listings = pd.read_csv("http://data.insideairbnb.com/canada/on/toronto/2024-02-14/data/listings.csv.gz")

In [55]:
listings.rename(columns={'id': 'listing_id'}, inplace=True)
listings.rename(columns={'name': 'listing_name'}, inplace=True)

In [56]:
#  filter the listings and reviews datasets to include only common listings
listings = listings[listings['listing_id'].isin(reviews['listing_id'])]
reviews = reviews[reviews['listing_id'].isin(listings['listing_id'])]

Drop unnecessary columns

In [57]:
# listings.drop('listing_url', axis=1, inplace=True)
listings.drop('picture_url', axis=1, inplace=True)

listings.drop('host_name', axis=1, inplace=True)
listings.drop('host_neighbourhood', axis=1, inplace=True)
listings.drop('host_picture_url', axis=1, inplace=True)

listings.drop('host_thumbnail_url', axis=1, inplace=True)
listings.drop('host_url', axis=1, inplace=True)

listings.drop('scrape_id', axis=1, inplace=True)
listings.drop('last_scraped', axis=1, inplace=True)
listings.drop('source', axis=1, inplace=True)
listings.drop('calendar_last_scraped', axis=1, inplace=True)
listings.drop('calendar_updated', axis=1, inplace=True)

In [58]:
listings.drop('host_location', axis=1, inplace=True)
listings.drop('neighbourhood_group_cleansed', axis=1, inplace=True)
listings.drop('neighbourhood', axis=1, inplace=True)
listings.drop('license', axis=1, inplace=True)

In [59]:
listings.drop('listing_url', axis=1, inplace=True)

Handle Missing Values

In [60]:
# Display count of listings with missing 'review_scores_rating' before any operations
print(f"Initial count of listings with missing 'review_scores_rating': {listings['review_scores_rating'].isnull().sum()}")

# Drop listings with missing 'review_scores_rating'
listings.dropna(subset=['review_scores_rating'], inplace=True)

# Display shape of the DataFrame after dropping listings with missing 'review_scores_rating'
print(f"Shape of listings after dropping those missing 'review_scores_rating': {listings.shape}")

# Now, check and drop listings with any missing review score details
review_score_columns = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
                        'review_scores_communication', 'review_scores_location', 'review_scores_value']

# Drop listings with any missing review scores
listings.dropna(subset=review_score_columns, inplace=True)

# Display final shape of the DataFrame after all drop operations
print(f"Final shape of listings after dropping any with missing review scores: {listings.shape}")

# Display the count of missing values for each review score column as a final check
missing_review_scores_final_check = listings[review_score_columns].isnull().sum()
print("Final check - missing review scores:")
print(missing_review_scores_final_check)

Initial count of listings with missing 'review_scores_rating': 2
Shape of listings after dropping those missing 'review_scores_rating': (14136, 59)
Final shape of listings after dropping any with missing review scores: (14131, 59)
Final check - missing review scores:
review_scores_rating           0
review_scores_accuracy         0
review_scores_cleanliness      0
review_scores_checkin          0
review_scores_communication    0
review_scores_location         0
review_scores_value            0
dtype: int64


In [61]:
# Directly fill missing 'bathrooms_text' for the identified rows
listings.loc[listings['bathrooms'] == 0.0, 'bathrooms_text'] = listings.loc[listings['bathrooms'] == 0.0, 'bathrooms_text'].fillna('0 baths')
listings.loc[listings['bathrooms'] == 1.0, 'bathrooms_text'] = listings.loc[listings['bathrooms'] == 1.0, 'bathrooms_text'].fillna('1 bath')

# Check the previously missing values to ensure they've been filled
print(listings.loc[[1260, 2609, 13723], ['bathrooms', 'bathrooms_text']])

       bathrooms bathrooms_text
1260         1.0         1 bath
2609         1.0  1 shared bath
13723        2.0        2 baths


In [62]:
# Identify the index of the row with the missing 'host_total_listings_count'
missing_index = listings[listings['host_total_listings_count'].isnull()].index

# Sum the specific listings counts for that index
total_calculated = listings.loc[missing_index, ['calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms']].sum(axis=1)

# Impute the missing 'host_total_listings_count' with the calculated total
listings.loc[missing_index, 'host_total_listings_count'] = total_calculated

# Verify the operation
print(listings.loc[missing_index, 'host_total_listings_count'])

829    1.0
Name: host_total_listings_count, dtype: float64


Get rid of special characters($,%) /convert to correct datatype

In [63]:
# Convert price to numeric
listings['price'] = listings['price'].replace('[\$,]', '', regex=True).astype(float)
listings['price'].head()

0      NaN
1      NaN
2    164.0
3      NaN
4      NaN
Name: price, dtype: float64

In [64]:
# host acceptance rate is a percentage, lets fix that
print(listings['host_acceptance_rate'].head())

# convert to percentage(float)
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].str.replace('%','').astype(float)
print(listings['host_acceptance_rate'].head())

# host response rate is a percentage, it seems to be read as text so lets fix that
print(listings['host_response_rate'].head())

#convert host response rate to percentage(float)
listings['host_response_rate'] = listings['host_response_rate'].str.replace('%','').astype(float)
print(listings['host_response_rate'].head())

0    NaN
1    NaN
2    41%
3    75%
4    NaN
Name: host_acceptance_rate, dtype: object
0     NaN
1     NaN
2    41.0
3    75.0
4     NaN
Name: host_acceptance_rate, dtype: float64
0     NaN
1     NaN
2    100%
3    100%
4     NaN
Name: host_response_rate, dtype: object
0      NaN
1      NaN
2    100.0
3    100.0
4      NaN
Name: host_response_rate, dtype: float64


Impute missing values

In [65]:
# For numerical columns
listings['host_response_rate'] = listings['host_response_rate'].fillna(listings['host_response_rate'].median())
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].fillna(listings['host_acceptance_rate'].median())

# For categorical columns
listings['host_response_time'] = listings['host_response_time'].fillna(listings['host_response_time'].mode()[0])

# Confirm the imputation
print(listings[['host_response_time', 'host_response_rate', 'host_acceptance_rate']].isnull().sum())

host_response_time      0
host_response_rate      0
host_acceptance_rate    0
dtype: int64


In [66]:
# Attempt to impute missing prices based on the median price for listings with the same 'room_type' and 'accommodates' number
listings['price'] = listings.groupby(['room_type', 'accommodates'])['price'] \
                             .transform(lambda x: x.fillna(x.median()))

# For listings where the price is still missing, use the median price of their specific room type as a fallback
listings['price'] = listings.groupby(['room_type'])['price'] \
                             .transform(lambda x: x.fillna(x.median()))
# If there are still missing values after these steps, use the overall median price as a last resort
overall_median_price = listings['price'].median()
listings['price'].fillna(overall_median_price, inplace=True)

# Check if there are still any missing values in 'price'
print("Missing values in 'price' after imputation:", listings['price'].isna().sum())

Missing values in 'price' after imputation: 0


In [67]:
print(listings['bathrooms'].isnull().sum())
listings['bathrooms'].fillna(1, inplace=True)

3560


In [68]:
print('Number of listings with null values for bedrooms:', listings['bedrooms'].isnull().sum())
print('Median: ', listings['bedrooms'].median())

# Filter to listings with null 'bedrooms'
listings_with_null_bedrooms = listings[listings['bedrooms'].isnull()]

# to see the count of each property type
print("\nCount of Each Property Type for Listings with Null Bedrooms:")
print(listings_with_null_bedrooms['property_type'].value_counts())

listings['bedrooms'].fillna(1, inplace=True)

Number of listings with null values for bedrooms: 1103
Median:  1.0

Count of Each Property Type for Listings with Null Bedrooms:
property_type
Private room in home                 428
Private room in rental unit          197
Entire rental unit                   108
Private room in condo                101
Private room in townhouse             68
Entire condo                          51
Private room in bungalow              45
Entire guest suite                    20
Private room in villa                 19
Shared room in rental unit            11
Entire loft                           11
Private room in guest suite            9
Entire home                            8
Shared room in home                    5
Shared room in condo                   3
Entire bungalow                        3
Entire townhouse                       3
Private room in cottage                2
Shared room in bungalow                2
Private room in guesthouse             2
Private room in bed and breakfast   

In [69]:
print('Number of listings with null beds values: ',  listings['beds'].isnull().sum())
print('Median: ', listings['beds'].median())
listings['beds'].fillna(1, inplace=True)

Number of listings with null beds values:  3575
Median:  1.0


Extract number of (amenities, verfications etc)

In [70]:
listings['number_of_amenities'] = listings['amenities'].apply(lambda x: len(x.split(',')))

# listings.drop('amenities', axis=1, inplace=True)

In [71]:
print(listings['host_verifications'].isnull().sum())
listings.dropna(subset=['host_verifications'], inplace=True)

listings['number_of_host_verifications'] = listings['host_verifications'].apply(lambda x: len(x.split(',')))

listings.drop('host_verifications', axis=1, inplace=True)

1


In [72]:
# Identify the listing with missing 'host_since'
missing_host_since_listing = listings[listings['host_since'].isna()]

# Assuming 'listing_id' is the common column between 'listings' and 'reviews'
# and 'date' is the column in 'reviews' that contains the review dates
earliest_review_date = reviews[reviews['listing_id'].isin(missing_host_since_listing['listing_id'])]['date'].min()
print(earliest_review_date)

# Impute the missing 'host_since' value
if pd.notnull(earliest_review_date):
    listings.loc[listings['host_since'].isna(), 'host_since'] = earliest_review_date

NaT


In [73]:
# Ensure 'first_review' is in datetime format
listings['first_review'] = pd.to_datetime(listings['first_review'])

# Choose a reference date as current date, for example, today's date or a specific recent date
current_date = pd.to_datetime('2024-04-01')  # Example current date, adjust as needed

# Calculate the difference in days between the current date and the first review
listings['Days_since_first_review'] = (current_date - listings['first_review']).dt.days

# Verify the new column
print(listings[['first_review', 'Days_since_first_review']].head())

  first_review  Days_since_first_review
0   2015-07-19                     3179
1   2009-08-20                     5338
2   2011-01-05                     4835
3   2012-01-26                     4449
4   2012-07-05                     4288


In [74]:
# Ensure 'last_review' is in datetime format
listings['last_review'] = pd.to_datetime(listings['last_review'])

# Use a reference date as the current date, adjust as necessary
current_date = pd.to_datetime('2024-04-01')  # Example current date, adjust as needed

# Calculate the difference in days between the current date and the last review
listings['Days_since_last_review'] = (current_date - listings['last_review']).dt.days

# Verify the new column
print(listings[['last_review', 'Days_since_last_review']].head())

  last_review  Days_since_last_review
0  2017-08-07                    2429
1  2013-08-27                    3870
2  2023-09-01                     213
3  2022-05-28                     674
4  2023-01-31                     426


Encode

In [75]:
print(listings['has_availability'].isnull().sum())

print(listings['has_availability'].value_counts())

listings['has_availability'].fillna(0, inplace=True)
listings['has_availability'].loc[listings['has_availability'] == 't'] = 1
listings['has_availability'].loc[listings['has_availability'] == 'f'] = 0

14
has_availability
t    14102
f       14
Name: count, dtype: int64


In [76]:
# List of columns to convert
columns_to_encode = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'instant_bookable'
]

# Convert 't' to 1 and 'f' to 0
for column in columns_to_encode:
    listings[column] = listings[column].apply(lambda x: 1 if x == 't' else 0)

In [77]:
import re

# Assuming 'listings' is your DataFrame and it already contains 'bathrooms_text' and 'room_type'

# Function to extract bathroom count
def extract_bathroom_count(text):
    if pd.isna(text):
        return None
    if 'half-bath' in text.lower() or 'shared half-bath' in text.lower() or 'private half-bath' in text.lower():
        return 0.5
    match = re.search(r'(\d+(\.\d+)?)', text)  # Match a decimal number
    return float(match.group()) if match else None

# Modified function to extract bathroom privacy and infer from room type if not specified, with direct encoding
def extract_bathroom_privacy_encoded(row):
    text = row['bathrooms_text']
    room_type = row['room_type']
    privacy_encoded = 0  # Default to 0 for 'Not Specified'
    if pd.isna(text):
        return privacy_encoded
    if 'private' in text.lower():
        privacy_encoded = 1  # Encode 'Private' as 1
    elif 'shared' in text.lower():
        privacy_encoded = 2  # Encode 'Shared' as 2

    # Infer privacy based on room type if not explicitly mentioned
    if privacy_encoded == 0:
        if room_type in ['Private room', 'Shared room']:
            privacy_encoded = 2  # Shared
        elif room_type == 'Entire home/apt':
            privacy_encoded = 1  # Private

    return privacy_encoded

# Apply the functions
listings['bathroom_count'] = listings['bathrooms_text'].apply(extract_bathroom_count)
listings['bathroom_privacy_encoded'] = listings.apply(extract_bathroom_privacy_encoded, axis=1)

# Now you can drop the original 'bathrooms_text' column if you wish, as its information has been encoded into two new columns
# listings.drop('bathrooms_text', axis=1, inplace=True)

# Verify the changes
print(listings[['room_type', 'bathroom_count', 'bathroom_privacy_encoded']].head())

         room_type  bathroom_count  bathroom_privacy_encoded
0  Entire home/apt             3.0                         1
1     Private room             1.5                         2
2  Entire home/apt             1.0                         1
3  Entire home/apt             1.0                         1
4     Private room             1.0                         2


In [78]:
host_response_time_mapping = {
    'within an hour': 1,
    'within a few hours': 2,
    'within a day': 3,
    'a few days or more': 4
}
listings['host_response_time_encoded'] = listings['host_response_time'].map(host_response_time_mapping)

In [79]:
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
room_type_encoder = LabelEncoder()

# Fit and transform the data
listings['room_type_encoded'] = room_type_encoder.fit_transform(listings['room_type'])

print(listings['room_type_encoded'].head())

0    0
1    2
2    0
3    0
4    2
Name: room_type_encoded, dtype: int32


In [80]:
def categorize_property_type_numeric(property_type):
    unique_luxury = ['treehouse', 'island', 'shipping container', 'tiny home', 'camper', 'castle', 'hut', 'barn', 'boat']
    house_villa = ['house', 'villa', 'cottage', 'bungalow', 'townhouse']
    apartment_condo = ['apartment', 'condo', 'loft', 'serviced apartment']

    # Ensure the property type is in lowercase and strip any leading/trailing whitespace
    property_type_lower = property_type.lower().strip()

    # Numeric categories
    category_mapping = {
        'Unique/Luxury Stay': 0,
        'House/Villa': 1,
        'Apartment/Condo': 2,
        'Others': 3
    }

    # Check for unique/luxury stays
    if any(unique in property_type_lower for unique in unique_luxury):
        return category_mapping['Unique/Luxury Stay']

    # Check for house/villa
    elif any(house in property_type_lower for house in house_villa):
        return category_mapping['House/Villa']

    # Check for apartment/condo
    elif any(apartment in property_type_lower for apartment in apartment_condo):
        return category_mapping['Apartment/Condo']

    # Default to Others
    else:
        return category_mapping['Others']

# Apply the numeric categorization function
listings['property_category_numeric'] = listings['property_type'].apply(categorize_property_type_numeric)

# Check the distribution of the new property_category_numeric column
print(listings['property_category_numeric'].value_counts())


property_category_numeric
3    9543
2    3416
1    1154
0      17
Name: count, dtype: int64


Create new features based on means

In [81]:
# Calculate mean review scores for each 'accommodates' value
temp = listings.groupby('accommodates')[['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value']].mean().reset_index()

# Rename columns to indicate these are mean values (adapted for clarity)
temp.columns = ['accommodates'] + [('mean_accommodates_' + c) for c in temp.columns[1:]]

# Merge these mean scores back into the main 'listings' DataFrame
listings = listings.merge(temp, on='accommodates', how='left')

In [82]:
# Calculate mean review scores grouped by 'host_response_time'
temp = listings.groupby('host_response_time')[['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value']].mean().reset_index()

# Rename columns to indicate these are mean values, adapting for clarity
temp.columns = ['host_response_time'] + [('mean_host_response_time_' + c) for c in temp.columns[1:]]

# Merge these mean scores back into the main 'listings' DataFrame
listings = listings.merge(temp, on='host_response_time', how='left')

Sentiment score for text columns 

In [83]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
tqdm.pandas()

In [84]:
print(listings['listing_name'].isnull().sum())

listings['listing_name_sentiment'] = listings['listing_name'].progress_apply(lambda x: sid.polarity_scores(x))

listings['name_compound'] = listings['listing_name_sentiment'].progress_apply(lambda x: x['compound'])
listings['name_neg'] = listings['listing_name_sentiment'].progress_apply(lambda x: x['neg'])
listings['name_neu'] = listings['listing_name_sentiment'].progress_apply(lambda x: x['neu'])
listings['name_pos'] = listings['listing_name_sentiment'].progress_apply(lambda x: x['pos'])

print(listings['name_compound'].head())

listings.drop('listing_name', axis=1, inplace=True)
listings.drop('listing_name_sentiment', axis=1, inplace=True)

0


100%|██████████| 14130/14130 [00:01<00:00, 9331.48it/s] 
100%|██████████| 14130/14130 [00:00<00:00, 571807.07it/s]
100%|██████████| 14130/14130 [00:00<00:00, 577113.49it/s]
100%|██████████| 14130/14130 [00:00<00:00, 706787.14it/s]
100%|██████████| 14130/14130 [00:00<00:00, 570079.99it/s]

0    0.8398
1    0.0000
2    0.0000
3    0.2732
4    0.0000
Name: name_compound, dtype: float64





In [85]:
print(listings['description'].isnull().sum())
listings['description'].fillna('none', inplace=True)

listings['description_sentiment'] = listings['description'].progress_apply(lambda x: sid.polarity_scores(x))

listings['description_compound'] = listings['description_sentiment'].progress_apply(lambda x: x['compound'])
listings['description_neg'] = listings['description_sentiment'].progress_apply(lambda x: x['neg'])
listings['description_neu'] = listings['description_sentiment'].progress_apply(lambda x: x['neu'])
listings['description_pos'] = listings['description_sentiment'].progress_apply(lambda x: x['pos'])

listings.drop('description', axis=1, inplace=True)
listings.drop('description_sentiment', axis=1, inplace=True)

219


100%|██████████| 14130/14130 [00:13<00:00, 1051.98it/s]
100%|██████████| 14130/14130 [00:00<00:00, 430749.38it/s]
100%|██████████| 14130/14130 [00:00<00:00, 637702.46it/s]
100%|██████████| 14130/14130 [00:00<00:00, 706273.35it/s]
100%|██████████| 14130/14130 [00:00<00:00, 694764.73it/s]


In [86]:
print(listings['host_about'].isnull().sum())
listings['host_about'].fillna('none', inplace=True)

listings['host_about_sentiment'] = listings['host_about'].progress_apply(lambda x: sid.polarity_scores(x))

listings['hostabout_compound'] = listings['host_about_sentiment'].progress_apply(lambda x: x['compound'])
listings['hostabout_neg'] = listings['host_about_sentiment'].progress_apply(lambda x: x['neg'])
listings['hostabout_neu'] = listings['host_about_sentiment'].progress_apply(lambda x: x['neu'])
listings['hostabout_pos'] = listings['host_about_sentiment'].progress_apply(lambda x: x['pos'])

listings.drop('host_about', axis=1, inplace=True)
listings.drop('host_about_sentiment', axis=1, inplace=True)

6813


100%|██████████| 14130/14130 [00:06<00:00, 2324.00it/s]
100%|██████████| 14130/14130 [00:00<00:00, 740162.05it/s]
100%|██████████| 14130/14130 [00:00<00:00, 781186.77it/s]
100%|██████████| 14130/14130 [00:00<00:00, 780991.18it/s]
100%|██████████| 14130/14130 [00:00<00:00, 795339.46it/s]


In [87]:
print(listings['neighborhood_overview'].isnull().sum())
listings['neighborhood_overview'].fillna('none', inplace=True)

listings['neighborhood_overview_sentiment'] = listings['neighborhood_overview'].progress_apply(lambda x: sid.polarity_scores(x))

listings['neighborhood_overview_compound'] = listings['neighborhood_overview_sentiment'].progress_apply(lambda x: x['compound'])
listings['neighborhood_overview_neg'] = listings['neighborhood_overview_sentiment'].progress_apply(lambda x: x['neg'])
listings['neighborhood_overview_neu'] = listings['neighborhood_overview_sentiment'].progress_apply(lambda x: x['neu'])
listings['neighborhood_overview_pos'] = listings['neighborhood_overview_sentiment'].progress_apply(lambda x: x['pos'])

listings.drop('neighborhood_overview', axis=1, inplace=True)
listings.drop('neighborhood_overview_sentiment', axis=1, inplace=True)

5110


100%|██████████| 14130/14130 [00:07<00:00, 1853.54it/s]
100%|██████████| 14130/14130 [00:00<00:00, 532666.28it/s]
100%|██████████| 14130/14130 [00:00<00:00, 486137.56it/s]
100%|██████████| 14130/14130 [00:00<00:00, 642096.59it/s]
100%|██████████| 14130/14130 [00:00<00:00, 532283.55it/s]


Now we will create a model using the features from reviews and 