<a href="https://colab.research.google.com/github/Sameer103/Homestays_Data_analysis_and_price_prediction/blob/main/Homestays_Data_analysis_and_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

In [None]:
from datetime import datetime, timedelta

In [None]:
# Colab-specific upload helper; this cell only works inside Google Colab.
from google.colab import files
# The result is indexed by filename in the next cell to obtain the uploaded file's bytes.
uploaded=files.upload()

In [None]:
# Load the uploaded workbook into the working DataFrame.
# Prefer the expected filename, but if the upload arrived under a different name
# use the single uploaded file instead of failing with a bare KeyError.
expected_name = 'Homestays_Data.xlsx'
if expected_name in uploaded:
    data_name = expected_name
elif len(uploaded) == 1:
    data_name = next(iter(uploaded))
else:
    raise FileNotFoundError(
        f"Expected an upload named {expected_name!r}; got: {sorted(uploaded)}"
    )
df= pd.read_excel(io.BytesIO(uploaded[data_name]))

In [None]:
# Display the freshly loaded DataFrame as the cell output for a first visual check.
df

In [None]:
# Print the DataFrame summary (pandas `DataFrame.info`) before any feature engineering.
df.info()

Task: Enhance the dataset by creating actionable and insightful features. Calculate `Host_Tenure` by determining the number of years from `host_since` to the current date, providing a measure of host experience. Generate `Amenities_Count` by counting the items listed in the `amenities` array to quantify property offerings. Determine `Days_Since_Last_Review` by calculating the days between `last_review` and today to assess listing activity and relevance.

In [None]:
# Host tenure calculation
# Parse `host_since`; values that do not match the format become NaT (errors='coerce').
# NOTE(review): this cell parses month-first ('%m-%d-%Y') while the `last_review` cell
# parses day-first ('%d-%m-%Y') -- confirm which layout the workbook really uses.
df['host_since'] = pd.to_datetime(df['host_since'], format='%m-%d-%Y', errors='coerce')

# One shared "now" for every row; kept as a column because the last-review cell reads it.
df['current_date'] = datetime.now()

# Whole years hosted (365-day years). Rows without a usable `host_since` get 0.
host_tenure = ((df['current_date'] - df['host_since']).dt.days // 365).fillna(0).astype(int)

# Store the feature on the DataFrame so later cells and models can actually use it.
df['Host_Tenure'] = host_tenure
df['Host_Tenure']

In [None]:
def _count_amenities(raw):
    """Count the distinct, non-empty entries of a '{a,b,c}' style amenities string.

    An empty listing ('{}') yields 0: splitting an empty string produces [''],
    which would otherwise be counted as one amenity.
    """
    entries = {entry.strip() for entry in str(raw).strip('{}').split(',')}
    entries.discard('')
    return len(entries)


# Keep the column as text for the amenity-parsing cell further down. Missing values
# are written as an empty amenity list rather than the literal string 'nan'.
df['amenities'] = df['amenities'].fillna('{}').astype('str')
df['Amenities_Count'] = df['amenities'].apply(_count_amenities)
df["Amenities_Count"]

In [None]:
# Parse `last_review`; values that do not match the format become NaT (errors='coerce').
df['last_review'] = pd.to_datetime(df['last_review'], format='%d-%m-%Y', errors='coerce')

# Days between the shared `current_date` column (set in the host-tenure cell) and the last review.
# NOTE(review): rows without a review date are filled with 0, which reads as
# "reviewed today" -- confirm that this is the intended encoding for missing reviews.
days_since_last_review = ((df['current_date'] - df['last_review']).dt.days).fillna(0).astype(int)

# Store the feature on the DataFrame so later cells and models can actually use it.
df['Days_Since_Last_Review'] = days_since_last_review
df['Days_Since_Last_Review']

In [None]:
# Replace the two categorical columns with their integer category codes (in place),
# so they can take part in the numeric correlation below.
for categorical_col in ('room_type', 'property_type'):
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Correlation analysis
corr_features = ['log_price', 'accommodates', 'number_of_reviews', 'room_type', 'property_type']
corr_matrix = df[corr_features].corr()
print(corr_matrix)

# Visualise correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(data=corr_matrix, annot=True, cmap='YlOrRd')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Histogram: one 30-bin panel per numeric column, side by side.
plt.figure(figsize=(12, 4))
for panel, column in enumerate(['log_price', 'accommodates', 'number_of_reviews'], start=1):
    plt.subplot(1, 3, panel)
    df[column].hist(bins=30)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.show()

In [None]:

# Scatter plot analysis: log_price against each predictor, one panel per predictor.
scatter_panels = [
    ('accommodates', 'Accommodates'),
    ('number_of_reviews', 'Number of Reviews'),
    ('room_type', 'Room Type'),
]

plt.figure(figsize=(12, 4))
for position, (column, axis_label) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(df[column], df['log_price'])
    plt.xlabel(axis_label)
    plt.ylabel('log_price')
plt.tight_layout()
plt.show()

Task: Investigate the geographical data to understand regional pricing trends. Plot listings on a map using `latitude` and `longitude` data to visually assess price distribution. Examine if certain neighbourhoods or proximity to city centres influence pricing, providing a spatial perspective to the pricing strategy.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap

In [None]:
# Plot listings on a map
map_centre = [df['latitude'].mean(), df['longitude'].mean()]
m = folium.Map(location=map_centre, zoom_start=12)

# Add markers for each listing. Rows without coordinates are skipped because a
# marker cannot be placed at a missing location.
# NOTE(review): one marker per row makes a very heavy HTML file on large datasets.
located = df.dropna(subset=['latitude', 'longitude'])
for lat, lon, price in zip(located['latitude'], located['longitude'], located['log_price']):
    folium.Marker([lat, lon], popup=f"Price: {price}").add_to(m)

m.save('listings_map.html')

# Heatmap of price distribution, drawn on its own map so that
# 'price_heatmap.html' shows only the heat layer and not the markers above.
heat_points = df[['latitude', 'longitude', 'log_price']].dropna()
heat_map = folium.Map(location=map_centre, zoom_start=12)
HeatMap(data=heat_points.values.tolist(), radius=10).add_to(heat_map)
heat_map.save('price_heatmap.html')



In [None]:
# Analyze price distribution by neighborhood
# NOTE(review): the grouping key is simply the `city` value rendered as text, so the
# chart below is effectively a per-city comparison.
# An element-wise map over one column replaces the much slower row-wise
# DataFrame.apply(axis=1) and yields the same strings.
df['neighborhood'] = df['city'].map(str)
neighborhood_prices = df.groupby('neighborhood')['log_price'].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
neighborhood_prices.plot(kind='bar')
plt.title('Average log_price by Neighborhood')
plt.xlabel('Neighborhood')
plt.ylabel('log_price')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()



In [None]:
# Analyze price distribution by distance to city centre
# sqrt(lat**2 + lon**2) would measure the distance from (0 deg, 0 deg) in degrees, not
# the distance to any city. Each city's centre is approximated here by the mean
# coordinates of its own listings, and the great-circle (haversine) distance to that
# point is reported in kilometres.
EARTH_RADIUS_KM = 6371.0088

centre_lat = df.groupby('city')['latitude'].transform('mean')
centre_lon = df.groupby('city')['longitude'].transform('mean')

listing_lat = np.radians(df['latitude'])
listing_lon = np.radians(df['longitude'])
centre_lat_rad = np.radians(centre_lat)
centre_lon_rad = np.radians(centre_lon)

haversine_term = (
    np.sin((centre_lat_rad - listing_lat) / 2) ** 2
    + np.cos(listing_lat) * np.cos(centre_lat_rad) * np.sin((centre_lon_rad - listing_lon) / 2) ** 2
)
# Clip guards against rounding pushing the term marginally outside [0, 1].
df['distance_to_city'] = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(haversine_term.clip(0, 1)))

plt.figure(figsize=(12, 6))
plt.scatter(df['distance_to_city'], df['log_price'])
plt.title('log_price vs Distance to City Centre')
plt.xlabel('Distance to City Centre (km)')
plt.ylabel('log_price')
plt.show()

Task: Apply advanced natural language processing techniques to the `description` texts to extract sentiment scores. Use sentiment analysis tools to determine whether positive or negative descriptions influence listing prices, incorporating these findings into the predictive model being trained as a feature.

In [None]:
!pip install vaderSentiment

In [None]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression


In [None]:
# Sentiment analysis using TextBlob
# Render every description as text once (missing values become the string 'nan').
description_texts = df['description'].map(str)

# Build each TextBlob a single time and read both scores from the same result,
# instead of analysing every description twice.
textblob_sentiments = [TextBlob(text).sentiment for text in description_texts]
df['description_polarity'] = [sentiment.polarity for sentiment in textblob_sentiments]
df['description_subjectivity'] = [sentiment.subjectivity for sentiment in textblob_sentiments]

# Sentiment analysis using VADER
vader = SentimentIntensityAnalyzer()
df['description_vader_score'] = description_texts.map(lambda text: vader.polarity_scores(text)['compound'])

sentiment_panels = [
    ('description_polarity', 'TextBlob Polarity'),
    ('description_subjectivity', 'TextBlob Subjectivity'),
    ('description_vader_score', 'VADER Sentiment Score'),
]

# Explore the sentiment scores
plt.figure(figsize=(12, 4))
for position, (column, label) in enumerate(sentiment_panels, start=1):
    plt.subplot(1, 3, position)
    df[column].hist(bins=30)
    plt.title(label)
plt.tight_layout()
plt.show()

# Analyze the relationship between sentiment and log_price
plt.figure(figsize=(12, 4))
for position, (column, label) in enumerate(sentiment_panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(df[column], df['log_price'])
    plt.xlabel(label)
    plt.ylabel('log_price')
plt.tight_layout()
plt.show()



In [None]:
# Incorporate sentiment features into the predictive model:
# an ordinary least-squares fit on the full dataset, used only to inspect coefficients.
baseline_features = [
    'accommodates',
    'number_of_reviews',
    'description_polarity',
    'description_subjectivity',
    'description_vader_score',
]
X = df[baseline_features]
y = df['log_price']
model = LinearRegression().fit(X, y)
print('Model Coefficients:', model.coef_)

Task: Thoroughly parse and analyse the `amenities` provided in the listings. Identify which amenities are most associated with higher or lower prices by applying statistical tests to determine correlations, thereby informing both pricing strategy and model inputs.

In [None]:
from scipy.stats import pearsonr, chi2_contingency


In [None]:
# Parse the amenities
df['amenities'] = df['amenities'].apply(lambda x: set(x.strip('{}').split(',')))
all_amenities = set([item for row in df['amenities'] for item in row])

# Create a binary amenity matrix
amenity_matrix = pd.DataFrame(0, index=df.index, columns=all_amenities)
for i, row in df.iterrows():
    for amenity in row['amenities']:
        amenity_matrix.at[i, amenity] = 1

# Analyze the relationship between amenities and log_price
plt.figure(figsize=(12, 6))
for amenity in all_amenities:
    pearson_r, p_value = pearsonr(amenity_matrix[amenity], df['log_price'])
    if p_value < 0.05:
        plt.scatter(amenity_matrix[amenity], df['log_price'], label=amenity, alpha=0.5)
plt.xlabel('Amenity Presence')
plt.ylabel('log_price')
plt.title('Amenities vs. log_price')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()

# Identify the most influential amenities
corr_matrix = amenity_matrix.corrwith(df['log_price'])
top_amenities = corr_matrix.abs().sort_values(ascending=False).head(10)
print('Top 10 Amenities Correlated with log_price:')
print(top_amenities)

# Perform chi-square test for association
for amenity in all_amenities:
    contingency_table = pd.crosstab(df['log_price'].apply(lambda x: 'High' if x > df['log_price'].mean() else 'Low'), amenity_matrix[amenity])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    if p_value < 0.05:
        print(f"Amenity '{amenity}' is significantly associated with high/low log_price (p-value: {p_value:.4f})")

Task: Convert categorical data into a format suitable for machine learning analysis. Apply one-hot encoding to variables like `room_type`, `city`, and `property_type`, ensuring that the model can interpret these as distinct features without any ordinal implication.

In [None]:
from sklearn.preprocessing import OneHotEncoder


In [None]:
encoder = OneHotEncoder()
categorical_columns = ['room_type', 'city', 'property_type']

# Encode each categorical column into its own block of indicator columns named
# '<column>_<category>'. The blocks reuse df's index so the column-wise concat
# below lines rows up correctly even when df's index is not 0..n-1.
encoded_frames = {}
for column in categorical_columns:
    encoded = encoder.fit_transform(df[column].to_numpy().reshape(-1, 1)).toarray()
    encoded_frames[column] = pd.DataFrame(
        encoded,
        index=df.index,
        columns=[f'{column}_{category}' for category in encoder.categories_[0]],
    )

room_type_df = encoded_frames['room_type']
city_df = encoded_frames['city']
property_type_df = encoded_frames['property_type']

# Drop the original categorical columns and append the indicator blocks in one step
df = pd.concat([df.drop(categorical_columns, axis=1), room_type_df, city_df, property_type_df], axis=1)

# Inspect the new DataFrame
print(df.head())

Task: Design and train predictive models to estimate `log_price`. Begin with a simple linear regression to establish a baseline, then explore more complex models such as RandomForest and GradientBoosting to better capture non-linear relationships and interactions between features. Document (briefly within Jupyter notebook itself) the model-building process, specifying the choice of algorithms and rationale.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def _fit_and_report(label, estimator, X_train, X_test, y_train, y_test):
    """Fit `estimator` and print its train/test MSE and R-squared.

    Each split is predicted once and the prediction is reused for both metrics.

    Returns:
        (train_mse, test_mse, train_r2, test_r2)
    """
    estimator.fit(X_train, y_train)
    train_pred = estimator.predict(X_train)
    test_pred = estimator.predict(X_test)

    train_mse = mean_squared_error(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)

    print(f'{label} Train MSE: {train_mse:.4f}')
    print(f'{label} Test MSE: {test_mse:.4f}')
    print(f'{label} Train R-squared: {train_r2:.4f}')
    print(f'{label} Test R-squared: {test_r2:.4f}')
    return train_mse, test_mse, train_r2, test_r2


# Preprocess the data
X = df[['accommodates', 'number_of_reviews', 'description_polarity', 'description_subjectivity', 'description_vader_score']]
y = df['log_price']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model (baseline)
print('Training a Linear Regression model...')
linear_reg = LinearRegression()
(linear_reg_train_mse, linear_reg_test_mse,
 linear_reg_train_r2, linear_reg_test_r2) = _fit_and_report(
    'Linear Regression', linear_reg, X_train, X_test, y_train, y_test)

# Train a Random Forest Regressor
print('\nTraining a Random Forest Regressor model...')
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
(rf_reg_train_mse, rf_reg_test_mse,
 rf_reg_train_r2, rf_reg_test_r2) = _fit_and_report(
    'Random Forest Regressor', rf_reg, X_train, X_test, y_train, y_test)

# Train a Gradient Boosting Regressor
print('\nTraining a Gradient Boosting Regressor model...')
gb_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
(gb_reg_train_mse, gb_reg_test_mse,
 gb_reg_train_r2, gb_reg_test_r2) = _fit_and_report(
    'Gradient Boosting Regressor', gb_reg, X_train, X_test, y_train, y_test)

Task: Analyze the trained models to identify which features most significantly impact `log_price`. Utilize model-specific methods like feature importance scores for tree-based models and SHAP values for an in-depth understanding of feature contributions.

In [None]:
!pip install scikit-learn-extra

In [None]:
# Import the necessary modules
#import plot_partial_dependence
import shap

In [None]:
# The one-hot encoding cell replaced `room_type`, `city` and `property_type` with
# indicator columns named '<column>_<category>', so the raw names no longer exist
# in df. Select the indicator columns by prefix instead.
one_hot_prefixes = ('room_type_', 'city_', 'property_type_')
one_hot_features = [col for col in df.columns if str(col).startswith(one_hot_prefixes)]
feature_columns = (
    ['accommodates', 'number_of_reviews']
    + one_hot_features
    + ['description_polarity', 'description_subjectivity', 'description_vader_score']
)

X = df[feature_columns]
y = df['log_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the models
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

gb_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_reg.fit(X_train, y_train)

# Feature importance for Random Forest Regressor, most important first
plt.figure(figsize=(12, 6))
feature_importances = rf_reg.feature_importances_
feature_names = X.columns
sorted_idx = np.argsort(feature_importances)[::-1]

plt.bar(range(X.shape[1]), feature_importances[sorted_idx])
plt.xticks(range(X.shape[1]), [feature_names[i] for i in sorted_idx], rotation=90)
plt.title('Random Forest Feature Importances')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.tight_layout()
plt.show()

# SHAP values for Gradient Boosting Regressor
explainer = shap.TreeExplainer(gb_reg)
shap_values = explainer.shap_values(X_test)

plt.figure(figsize=(12, 6))
# show=False keeps the figure open so the title below lands on the SHAP plot
# rather than on a new, empty figure.
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title('SHAP Values for Gradient Boosting Regressor')
plt.tight_layout()
plt.show()

Task: Critically evaluate the performance of the final model on a reserved test set. Use metrics such as Root Mean Squared Error (RMSE) and R-squared to assess accuracy and goodness of fit. Provide a detailed analysis of the residuals to check for any patterns that might suggest model biases or misfit.


In [None]:
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Features and target for the final model.
final_features = [
    'accommodates',
    'number_of_reviews',
    'description_polarity',
    'description_subjectivity',
    'description_vader_score',
]
X = df[final_features]
y = df['log_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the final model (Gradient Boosting Regressor)
gb_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_reg.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = gb_reg.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

print(f'Test RMSE: {test_rmse:.4f}')
print(f'Test R-squared: {test_r2:.4f}')

# Analyze the residuals
plt.figure(figsize=(12, 6))

# Left panel: predictions against actual values, with the y = x reference line.
fit_ax = plt.subplot(1, 2, 1)
sns.scatterplot(x=y_test, y=y_pred, ax=fit_ax)
lowest, highest = y_test.min(), y_test.max()
fit_ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
fit_ax.set_xlabel('Actual log_price')
fit_ax.set_ylabel('Predicted log_price')
fit_ax.set_title('Actual vs. Predicted log_price')

# Right panel: distribution of the test-set residuals (actual minus predicted).
resid_ax = plt.subplot(1, 2, 2)
residuals = y_test - y_pred
sns.histplot(residuals, bins=30, ax=resid_ax)
resid_ax.set_xlabel('Residuals')
resid_ax.set_ylabel('Frequency')
resid_ax.set_title('Histogram of Residuals')

plt.tight_layout()
plt.show()