In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import shap

In [2]:
#Loading and preprocessing data
data = pd.read_csv('/kaggle/input/homestays/Homestays_Data(in).csv')

In [3]:
data.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null

In [5]:
data.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009,0.0,1.0


 # **1. Feature Engineering**

In [6]:
#Creating host_tenure feature, providing a measure of host experience. 
data['host_since'] = pd.to_datetime(data['host_since'])
data['Host_Tenure'] = (datetime.now() - data['host_since']).dt.days / 365.00

In [7]:
#Creating Host_Tenure, providing a measure of host experience
data['host_since'] = pd.to_datetime(data['host_since'])
data['Host_Tenure'] = (datetime.now() - data['host_since']).dt.days / 365.25

In [8]:
#Amenities_count, quantify the property offerings
data['Amenities_count'] = data['amenities'].apply(lambda x: len(x.split(',')))

In [9]:
#Adding Days_Since_Last_Review in order to assess listing activity and relevance
data['last_review'] = pd.to_datetime(data['last_review'])
data['Days_Since_Last_Review'] = (datetime.now() - data['last_review']).dt.days

In [10]:
data.dropna()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds,Host_Tenure,Amenities_count,Days_Since_Last_Review
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019,3.0,3.0,6.882957,15,2418.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027,1.0,3.0,7.531828,19,2427.0
5,12422935,4.442651,Apartment,Private room,"{TV,""Wireless Internet"",Heating,""Smoke detecto...",2,1.0,Real Bed,strict,True,...,Noe Valley,3,100.0,https://a0.muscache.com/im/pictures/82509143-4...,94131,1.0,1.0,6.915811,10,2436.0
7,13971273,4.787492,Condominium,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Wheelchair...",2,1.0,Real Bed,moderate,True,...,Downtown,9,93.0,https://a0.muscache.com/im/pictures/61bd05d5-c...,90015,1.0,1.0,10.970568,26,2582.0
8,180792,4.787492,House,Private room,"{TV,""Cable TV"",""Wireless Internet"",""Pets live ...",2,1.0,Real Bed,moderate,True,...,Richmond District,159,99.0,https://a0.muscache.com/im/pictures/0ed6c128-7...,94121,1.0,1.0,8.925394,21,2417.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74102,7935934,4.110874,Villa,Shared room,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",10,3.0,Real Bed,flexible,False,...,West Hills,1,80.0,https://a0.muscache.com/im/pictures/27fea634-a...,91307,1.0,10.0,10.850103,22,2683.0
74103,11829011,5.135798,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",2,1.0,Real Bed,moderate,True,...,Union Square,24,98.0,https://a0.muscache.com/im/pictures/9a99e2de-c...,94108,1.0,1.0,12.613279,24,3098.0
74107,13281809,5.043425,Apartment,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,2.0,Real Bed,moderate,True,...,Hermosa Beach,16,93.0,https://a0.muscache.com/im/pictures/2b86560b-a...,90254,2.0,4.0,8.010951,16,2579.0
74108,18688039,5.220356,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",5,1.0,Real Bed,moderate,True,...,Williamsburg,43,94.0,https://a0.muscache.com/im/pictures/7fbe448c-5...,11206,2.0,2.0,12.336756,31,2431.0


# **2. Exploratory Data Analysis (EDA)**

In [11]:
data.columns

Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
       'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
       'cleaning_fee', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'latitude',
       'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds',
       'Host_Tenure', 'Amenities_count', 'Days_Since_Last_Review'],
      dtype='object')

In [12]:
# Boxplot for log_price across different room types
plt.figure(figsize=(10, 6))
sns.boxplot(x='room_type_Shared room', y='log_price', data=data)
plt.title("Boxplot of log_price by room_type: Shared room")
plt.show()

# Boxplot for log_price across different property types
plt.figure(figsize=(14, 6))
sns.boxplot(x='property_type_Guesthouse', y='log_price', data=data)
plt.title("Boxplot of log_price by property_type: Guest House")
plt.xticks(rotation=45)
plt.show()


ValueError: Could not interpret input 'room_type_Shared room'

<Figure size 1000x600 with 0 Axes>

In [None]:
# Correlation Matrix for the modified dataset
correlation_matrix = data[numerical_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix (Numerical Features)')
plt.show()


In [None]:
# Scatter plots to explore relationships between variables.

for feature in numerical_cols[:-1]: 
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=data, x=feature, y='log_price')
    plt.title(f'{feature} vs Log Price')
    plt.xlabel(feature)
    plt.ylabel('Log Price')
    plt.show()


In [None]:
# Histogram for distribution analysis
plt.figure(figsize=(10, 6))
sns.histplot(data['log_price'], kde=True)
plt.title('Distribution of Log Price')
plt.xlabel('Log Price')
plt.ylabel('Frequency')
plt.show()

# **3. Geospatial Analysis**

In [None]:
import folium
from folium.plugins import HeatMap

In [None]:
mean_lat = data['latitude'].mean()
mean_lon = data['longitude'].mean()
homestay_map = folium.Map(location=[mean_lat, mean_lon], zoom_start=12)

# Adding markers for each listing
for index, row in data.iterrows():
    folium.Marker([row['latitude'], row['longitude']], popup=row['name']).add_to(homestay_map)

# Save the map as an HTML file
homestay_map.save("homestay_listings_map.html")

# Visually assessing price distribution
plt.figure(figsize=(10, 8))
plt.scatter(data['longitude'], data['latitude'], c=data['log_price'], cmap='coolwarm', alpha=0.5)
plt.colorbar(label='Log Price')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Price Distribution by Geographical Location')
plt.grid(True)
plt.show()

# **4. Sentiment Analysis on Textual Data**

In [None]:
from textblob import TextBlob
def calculate_sentiment(text):
    blob = TextBlob(str(text))
    return blob.sentiment.polarity

# Apply sentiment analysis to the description column
data['description_sentiment'] = data['description'].apply(calculate_sentiment)


In [None]:
data[['description', 'description_sentiment']].head()

In [None]:
positive_sentiments = data[data['description_sentiment'] >= 0]
positive_sentiments[['description_sentiment','description']]

In [None]:
negative_sentiments = data[data['description_sentiment']< 0]
negative_sentiments[['description_sentiment','description']]

# **5. Amenities Analysis**

In [None]:
# # Convert 'amenities' column to string format
# data['amenities'] = data['amenities'].astype(str)

# # Split the amenities into individual items
# data['amenities'] = data['amenities'].apply(lambda x: x.replace('{', '').replace('}', '').replace('"', '').split(','))

# # Create a new DataFrame with amenities as binary features
# amenities_df = pd.get_dummies(data['amenities'].apply(pd.Series).stack()).sum(axis=0)

# # Concatenate the new DataFrame with the original data
# data = pd.concat([data, amenities_df], axis=1)

# # Drop the original 'amenities' column
# data.drop(columns=['amenities'], inplace=True)

# # Identify non-numeric columns
# non_numeric_cols = data.select_dtypes(exclude=['float', 'int']).columns

# # Drop non-numeric columns from the correlation analysis
# data_numeric = data.drop(columns=non_numeric_cols)

# # Compute correlation between amenities and log_price
# amenities_corr = data_numeric.corr()['log_price'].sort_values(ascending=False)


In [None]:
# print("Top 10 amenities positively correlated with log_price:")
# amenities_corr.head(10)

# print("\nTop 10 amenities negatively correlated with log_price:")
# amenities_corr.tail(10)

# **6. Categorical Data Encoding**

In [None]:
room_type_cols = data.filter(regex='^room_Type').columns
property_type_cols = data.filter(regex='^property_type').columns

# Perform one-hot encoding for categorical variables
data = pd.get_dummies(data, columns=room_type_cols)
data = pd.get_dummies(data, columns=property_type_cols)

# Display the encoded DataFrame
print(data.head())

# **7. Model Development and Training**

In [None]:
data.info()
data.head()

In [None]:
data.info()

In [None]:
# Drop irrelevant columns if necessary
# Convert date column to datetime format
data['host_since'] = pd.to_datetime(data['host_since'])

# Extract relevant information from the date column (e.g., year, month, day)
data['hyear'] = data['host_since'].dt.year
data['hmonth'] = data['host_since'].dt.month
data['hday'] = data['host_since'].dt.day

# Drop the original date column
data.drop('host_since', axis=1, inplace=True)

# Convert date column to datetime format
data['last_review'] = pd.to_datetime(data['last_review'])

# Extract relevant information from the date column (e.g., year, month, day)
data['ryear'] = data['last_review'].dt.year
data['rmonth'] = data['last_review'].dt.month
data['rday'] = data['last_review'].dt.day

# Drop the original date column
data.drop('last_review', axis=1, inplace=True)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['log_price','bed_type','city','cancellation_policy'])
y = data['log_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
# 1. Simple Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate the model
lr_predictions = lr.predict(X_test)
lr_rmse = mean_squared_error(y_test, lr_predictions, squared=False)
print("Linear Regression RMSE:", lr_rmse)

# 2. Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
rf_predictions = rf.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_predictions, squared=False)
print("Random Forest RMSE:", rf_rmse)

# **8. Model Optimization and Validation**

In [None]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform grid search with k-fold cross-validation for RandomForestRegressor
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)
rf_best_model = rf_grid_search.best_estimator_
rf_predictions = rf_best_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
print("Random Forest Best Model Mean Squared Error:", rf_mse)

# Perform grid search with k-fold cross-validation for GradientBoostingRegressor
gb_grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), gb_param_grid, cv=5, scoring='neg_mean_squared_error')
gb_grid_search.fit(X_train, y_train)
gb_best_model = gb_grid_search.best_estimator_
gb_predictions = gb_best_model.predict(X_test)
gb_mse = mean_squared_error(y_test, gb_predictions)
print("Gradient Boosting Best Model Mean Squared Error:", gb_mse)

# **9. Feature Importance and Model Insights**

In [None]:
import pandas as pd
import shap
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Load the dataset
data = pd.read_csv("Modified_data.csv")

# Define features and target variable
features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'description_sentiment']
target = 'log_price'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)

# Train RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importance scores for RandomForestRegressor
rf_feature_importances = pd.DataFrame({'Feature': features, 'Importance': rf_model.feature_importances_})
print("Random Forest Feature Importances:")
print(rf_feature_importances)

# Train GradientBoostingRegressor
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Get feature importance scores for GradientBoostingRegressor
gb_feature_importances = pd.DataFrame({'Feature': features, 'Importance': gb_model.feature_importances_})
print("\nGradient Boosting Feature Importances:")
print(gb_feature_importances)

# Use SHAP values for RandomForestRegressor
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train)

# Visualize SHAP values
shap.summary_plot(shap_values, X_train, plot_type='bar', feature_names=features)


# **10. Predictive Performance Assessment**

In [None]:
# Make predictions on the test set using the final model
final_model_predictions = rf_model.predict(X_test)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, final_model_predictions))
print("Root Mean Squared Error (RMSE):", rmse)

# Calculate R-squared (R²)
r2 = r2_score(y_test, final_model_predictions)
print("R-squared (R²):", r2)

# Analyze residuals
residuals = y_test - final_model_predictions

# Plot residuals
plt.figure(figsize=(10, 6))
plt.scatter(final_model_predictions, residuals, alpha=0.5)
plt.title('Residual Plot')
plt.xlabel('Predicted Log Price')
plt.ylabel('Residuals')
plt.axhline(y=0, color='r', linestyle='--')
plt.show()