In [None]:
1. Data Exploration and Preprocessing
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# NOTE(review): `data` is not created in any visible cell; it must be loaded
# (e.g. with pd.read_csv) before this cell for Restart & Run All to work.

# Display basic dataset information.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" line.
data.info()
print(data.describe())

# Per-column count of missing values
print("Missing Values:")
print(data.isnull().sum())

# Visualize the distribution of the target variable (log_price)
plt.figure(figsize=(8, 6))
sns.histplot(data['log_price'], kde=True)
plt.title('Distribution of Airbnb Listing Prices (log-transformed)')
plt.xlabel('Log Price')
plt.ylabel('Frequency')
plt.show()

# Visualize missing values (optional): one row per listing, one column per field
plt.figure(figsize=(10, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

# Missing values are not filled here; the model-development cell imputes them
# per feature type with SimpleImputer inside the preprocessing pipelines.

In [None]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   log_price               74111 non-null  float64
 2   property_type           74111 non-null  object 
 3   room_type               74111 non-null  object 
 4   amenities               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null  object 
 15  host_response_rate      55812 non-null  object 
 16  host_since              73923 non-null  object 
 17  instant_bookable        74111 non-null  object 
 18  last_review             58284 non-null  object 
 19  latitude                74111 non-null  float64
 20  longitude               74111 non-null  float64
 21  name                    74111 non-null  object 
 22  neighbourhood           67239 non-null  object 
 23  number_of_reviews       74111 non-null  int64  
 24  review_scores_rating    57389 non-null  float64
 25  thumbnail_url           65895 non-null  object 
 26  zipcode                 73143 non-null  object 
 27  bedrooms                74020 non-null  float64
 28  beds                    73980 non-null  float64
dtypes: bool(1), float64(7), int64(3), object(18)
memory usage: 15.9+ MB
None
                 id     log_price  accommodates     bathrooms      latitude  \
count  7.411100e+04  74111.000000  74111.000000  73911.000000  74111.000000   
mean   1.126662e+07      4.782069      3.155146      1.235263     38.445958   
std    6.081735e+06      0.717394      2.153589      0.582044      3.080167   
min    3.440000e+02      0.000000      1.000000      0.000000     33.338905   
25%    6.261964e+06      4.317488      2.000000      1.000000     34.127908   
50%    1.225415e+07      4.709530      2.000000      1.000000     40.662138   
75%    1.640226e+07      5.220356      4.000000      1.000000     40.746096   
max    2.123090e+07      7.600402     16.000000      8.000000     42.390437   

          longitude  number_of_reviews  review_scores_rating      bedrooms  \
count  74111.000000       74111.000000          57389.000000  74020.000000   
mean     -92.397525          20.900568             94.067365      1.265793   
std       21.705322          37.828641              7.836556      0.852143   
min     -122.511500           0.000000             20.000000      0.000000   
25%     -118.342374           1.000000             92.000000      1.000000   
50%      -76.996965           6.000000             96.000000      1.000000   
75%      -73.954660          23.000000            100.000000      1.000000   
max      -70.985047         605.000000            100.000000     10.000000   

               beds  
count  73980.000000  
mean       1.710868  
std        1.254142  
min        0.000000  
25%        1.000000  
50%        1.000000  
75%        2.000000  
max       18.000000  
Missing Values:
id                            0
log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                   200
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
description                   0
first_review              15864
host_has_profile_pic        188
host_identity_verified      188
host_response_rate        18299
host_since                  188
instant_bookable              0
last_review               15827
latitude                      0
longitude                     0
name                          0
neighbourhood              6872
number_of_reviews             0
review_scores_rating      16722
thumbnail_url              8216
zipcode                     968
bedrooms                     91
beds                        131
dtype: int64

In [None]:
2. Model Development

In [None]:
# Build, tune and select the random-forest pricing model.
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Separate features and target variable
a = data.drop(columns=['log_price'])
b = data['log_price']

# Split data into training and test sets.
# train_test_split returns (features_train, features_test, target_train, target_test).
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=42)

# The evaluation cell refers to the splits by their X_/y_ names.
X_train, X_test, y_train, y_test = a_train, a_test, b_train, b_test

# Use a smaller subset (10%) of the data for quicker testing.
# It is drawn from the training split only, so no test row takes part in tuning.
a_train_subset, _, b_train_subset, _ = train_test_split(
    a_train, b_train, test_size=0.9, random_state=42
)

# Define preprocessing for numerical and categorical features
# NOTE(review): columns of any other dtype (e.g. bool) match neither selection
# and are dropped by ColumnTransformer's default remainder='drop' - confirm intended.
numeric_features = a.select_dtypes(include=['float64', 'int64']).columns
categorical_features = a.select_dtypes(include=['object']).columns

# Numerical pipeline: Impute missing values and scale features
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: Impute missing values and apply one-hot encoding
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both pipelines into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Define the model pipeline with preprocessing and RandomForestRegressor
# NOTE(review): warm_start presumably has no effect under RandomizedSearchCV,
# which clones the pipeline for every candidate - confirm and consider removing.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, max_samples=0.5, warm_start=True)),  # Warm start and max_samples
])

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    'regressor__n_estimators': randint(50, 100),  # Limit n_estimators between 50 and 100
    'regressor__max_depth': [10, 20, 30],  # Limiting max depth
    'regressor__min_samples_split': randint(2, 5),
    'regressor__min_samples_leaf': randint(1, 5),
}

# Randomized search over the pipeline.
# NOTE(review): n_iter, cv and scoring are reviewer-chosen defaults because the
# original search definition was missing from the cell; adjust as needed.
random_search_rf = RandomizedSearchCV(
    estimator=pipeline_rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)

# Fit on the quick subset; pass a_train / b_train instead for a full run.
random_search_rf.fit(a_train_subset, b_train_subset)

# Best model after tuning (refit on the data passed to fit above)
best_rf_model = random_search_rf.best_estimator_

In [None]:
3. Model Evaluation

In [None]:
# Score the tuned pipeline on both splits and plot predicted vs actual values.
from sklearn.metrics import mean_squared_error, r2_score

# Predictions for each split
y_train_pred = best_rf_model.predict(X_train)
y_test_pred = best_rf_model.predict(X_test)

# Training-split metrics
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Test-split metrics
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Report errors first, then R^2
print(f"Train Mean Squared Error: {train_mse}")
print(f"Test Mean Squared Error: {test_mse}")
print(f"Train R^2: {train_r2}")
print(f"Test R^2: {test_r2}")

# Predicted vs actual on the test split
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_test_pred, ax=ax)
ax.set_title('Predicted vs Actual Log Price (Test Set)')
ax.set_xlabel('Actual Log Price')
ax.set_ylabel('Predicted Log Price')
plt.show()

In [None]:
Evaluation Metrics in Machine Learning

Classification Metrics
In a classification task, the goal is to predict a target variable that takes discrete values. The metrics below are used to evaluate the
performance of such a model:

Classification Accuracy
Classification accuracy is a fundamental metric for evaluating a classification model, providing a quick snapshot of how often the model's
predictions are correct. It is calculated as the ratio of the number of correct predictions to the total number of input samples.
$$\text{Accuracy} = \frac{\text{Number of correct predictions}}{\text{Total number of input samples}}$$
Accuracy works well when every class has roughly the same number of samples. For example, suppose the training set is 90% class A and 10% class B.
A model that predicts class A for every sample then reaches 90% training accuracy without learning anything about class B. If the same model is
tested on a set that is 60% class A and 40% class B, its accuracy falls to 60%.

In [None]:
Improvements:
# Plotting feature importance
import numpy as np

# Maximum number of features to show; fewer are plotted if the model has fewer.
TOP_N = 10

# Get feature importances from the best model
feature_importances = best_rf_model.named_steps['regressor'].feature_importances_

# Get feature names: numeric columns first, then the one-hot columns.
# This mirrors the ('num', 'cat') order of the ColumnTransformer, whose second
# fitted transformer (index 1) is the categorical pipeline.
onehot = best_rf_model.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot']
feature_names = numeric_features.tolist() + list(onehot.get_feature_names_out(categorical_features))

# Guard against labels drifting out of step with the importances.
assert len(feature_names) == len(feature_importances), "feature names do not match importances"

# Sort feature importances (descending)
sorted_idx = np.argsort(feature_importances)[::-1]
top_idx = sorted_idx[:TOP_N]
positions = range(len(top_idx))

# Plot the most important features
plt.figure(figsize=(10, 6))
plt.barh(positions, feature_importances[top_idx], align='center')
plt.yticks(positions, np.array(feature_names)[top_idx])
plt.gca().invert_yaxis()  # barh draws position 0 at the bottom; put the top feature first
plt.title(f'Top {len(top_idx)} Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


In [None]:
SUMMARY:
Your model's performance metrics indicate a strong fit to the data with good generalization to the test set. Visualizations like the Predicted 
vs Actual plot can further clarify how well the model is performing.