## Import Packages

In [2]:
# Standard Operational Packages
import pandas as pd
import numpy as np

# Visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical Modelling Packages
from scipy import stats
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
pd.set_option('display.max_columns', None)

## Data Loading

In [4]:
# Load the Data and display a few rows
airbnb_data_final = pd.read_csv('airbnb_data_final.csv')
airbnb_data_final.head(5)

Unnamed: 0.1,Unnamed: 0,listing_url,name,host_id,host_since,host_tenure_days,host_since_readable,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_verifications,verifications_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bathrooms_number,bedrooms,beds,amenities,amenities_count,price,log_price,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,availability_eoy,estimated_occupancy_l365d,estimated_revenue_l365d,log_estimated_revenue_l365d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,has_reviews,has_revenue,verification_status,superhost_status
0,0,https://www.airbnb.com/rooms/264776,Huge Four Bedroom Apartment,1389063,2011-11-09,5056,"November 09, 2011",within an hour,0.86,1.0,0,12,"['email', 'phone']",2,1,1,Lewisham,51.44306,-0.01948,Entire rental unit,Entire home/apt,10,2 baths,2,4,8,"[""Cooking basics"", ""Washer"", ""Iron"", ""Smoke al...",35,297.0,5.697093,3,365,13,15,31,293,68,12,1,134,110,32670.0,10.394243,4.68,4.65,4.72,4.84,4.74,4.62,4.72,1,11,11,0,0,0.51,1,1,Yes,No
1,1,https://www.airbnb.com/rooms/264777,One Bedroom Apartment,1389063,2011-11-09,5056,"November 09, 2011",within an hour,0.86,1.0,0,12,"['email', 'phone']",2,1,1,Lewisham,51.44284,-0.01997,Entire rental unit,Entire home/apt,2,1 bath,1,1,2,"[""Cooking basics"", ""Wine glasses"", ""Iron"", ""Sm...",44,98.0,4.59512,3,365,0,13,43,318,24,4,0,157,37,3626.0,8.196161,4.58,4.42,4.54,4.58,4.25,4.54,4.42,1,11,11,0,0,0.22,1,1,Yes,No
2,2,https://www.airbnb.com/rooms/264778,Two Bedroom Newly Refurbished Apartment,1389063,2011-11-09,5056,"November 09, 2011",within an hour,0.86,1.0,0,12,"['email', 'phone']",2,1,1,Lewisham,51.44359,-0.02275,Entire rental unit,Entire home/apt,4,1 bath,1,2,3,"[""Cooking basics"", ""Iron"", ""Smoke alarm"", ""Fir...",27,148.0,5.003946,3,365,3,6,27,302,58,6,0,141,55,8140.0,9.004668,4.5,4.5,4.45,4.72,4.52,4.36,4.38,1,11,11,0,0,0.43,1,1,Yes,No
3,3,https://www.airbnb.com/rooms/264779,Refurbished Two Bedroom Apartment,1389063,2011-11-09,5056,"November 09, 2011",within an hour,0.86,1.0,0,12,"['email', 'phone']",2,1,1,Lewisham,51.44355,-0.02309,Entire rental unit,Entire home/apt,5,1 bath,1,2,5,"[""Cooking basics"", ""Iron"", ""Smoke alarm"", ""Fir...",29,144.0,4.976734,3,365,11,33,53,328,36,7,0,167,64,9216.0,9.128805,4.64,4.67,4.58,4.83,4.61,4.5,4.47,1,11,11,0,0,0.3,1,1,Yes,No
4,4,https://www.airbnb.com/rooms/264780,Spacious refurbished 2 bedroom apt with balcony,1389063,2011-11-09,5056,"November 09, 2011",within an hour,0.86,1.0,0,12,"['email', 'phone']",2,1,1,Lewisham,51.44333,-0.02307,Entire rental unit,Entire home/apt,4,2 baths,2,2,4,"[""Cooking basics"", ""Washer"", ""Iron"", ""Smoke al...",35,157.0,5.062595,3,365,0,9,11,255,54,4,0,101,37,5809.0,8.667336,4.68,4.78,4.65,4.81,4.74,4.37,4.59,1,11,11,0,0,0.35,1,1,Yes,No


## Descriptive Statistics

In [5]:
# Calculate Descriptive Statistics
airbnb_data_final.describe()


Unnamed: 0.1,Unnamed: 0,host_id,host_tenure_days,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,verifications_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bathrooms_number,bedrooms,beds,amenities_count,price,log_price,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,availability_eoy,estimated_occupancy_l365d,estimated_revenue_l365d,log_estimated_revenue_l365d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,has_reviews,has_revenue
count,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0,57388.0
mean,28991.998101,262447900.0,2469.652506,0.813363,0.756898,0.237088,108.99601,2.001464,0.942967,0.921482,51.509557,-0.129435,3.553879,1.291838,1.561337,1.958667,31.508887,207.964365,4.948478,4.852861,411.465254,11.858263,28.560919,48.569422,205.367533,22.062104,7.689865,0.706228,120.425385,61.229125,10608.29,5.867262,3.82004,3.848968,3.801532,3.901506,3.920462,3.85952,3.756171,0.368997,21.375409,18.803879,2.507162,0.027706,0.973967,0.816094,0.656792
std,16724.469074,225569900.0,1429.002142,0.355732,0.34615,0.4253,705.792887,0.466149,0.231907,0.268988,0.049126,0.104885,2.200979,0.710558,1.044392,1.4303,15.325537,466.572076,0.786459,17.439291,359.164361,10.501927,20.270578,29.367108,120.040585,49.242958,13.567352,1.415312,65.903596,81.314951,44444.99,4.355443,1.869079,1.88026,1.862883,1.896038,1.903165,1.871082,1.844894,0.482537,61.992007,61.718321,9.42199,0.641526,1.336465,0.387411,0.474784
min,0.0,2594.0,96.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,51.295937,-0.496075,1.0,0.0,0.0,0.0,0.0,30.0,3.433987,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14527.75,42653210.0,1090.0,0.89,0.6175,0.0,2.0,2.0,1.0,1.0,51.486129,-0.191195,2.0,1.0,1.0,1.0,20.0,78.0,4.369448,1.0,90.0,3.0,11.0,24.0,89.0,1.0,0.0,0.0,64.0,0.0,0.0,0.0,4.04,4.2,4.0,4.36,4.44,4.27,4.0,0.0,1.0,1.0,0.0,0.0,0.08,1.0,0.0
50%,29011.5,201867400.0,2617.0,1.0,0.96,0.0,5.0,2.0,1.0,1.0,51.51321,-0.133244,3.0,1.0,1.0,2.0,32.0,137.0,4.927254,2.0,365.0,9.0,26.0,49.0,220.0,5.0,2.0,0.0,135.0,20.0,2826.0,7.946971,4.75,4.79,4.71,4.86,4.9,4.77,4.65,0.0,3.0,1.0,0.0,0.0,0.51,1.0,1.0
75%,43468.25,479975200.0,3663.0,1.0,1.0,0.0,24.0,2.0,1.0,1.0,51.536753,-0.069998,4.0,1.0,2.0,2.0,42.0,229.0,5.438079,3.0,365.0,21.0,47.0,75.0,319.0,22.0,9.0,1.0,179.0,92.0,12180.0,9.407633,4.96,4.99,4.95,5.0,5.0,4.97,4.87,1.0,14.0,10.0,1.0,0.0,1.29,1.0,1.0
max,57940.0,700129800.0,6224.0,1.0,1.0,1.0,8723.0,3.0,1.0,1.0,51.68263,0.27896,16.0,30.0,50.0,50.0,104.0,30000.0,10.308986,999.0,9011.0,30.0,60.0,90.0,365.0,1855.0,355.0,41.0,205.0,255.0,5100000.0,15.444751,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,495.0,495.0,116.0,25.0,38.41,1.0,1.0


## Hypothesis Testing

### 1. Is there a statistically significant difference in the mean prices of hosts with superhost status and hosts without?
- H0 (Null Hypothesis): The mean log price of Superhosts equals that of Non-Superhosts.
- Ha (Alternative Hypothesis): The mean log price of Superhosts is NOT equal to that of Non-Superhosts.


In [6]:
# --- 0. HYPOTHESIS DEFINITION ---
# H0 (Null Hypothesis): The mean log price of Superhosts equals that of Non-Superhosts.
# Ha (Alternative Hypothesis): The mean log price of Superhosts is NOT equal to that of Non-Superhosts.
alpha = 0.05 # Significance level



print("--- Hypothesis Test for Mean LOG PRICE Difference (Superhost vs. Non-Superhost) ---\n")

# --- 1. DATA PREPARATION ---
try:
    # Separate the log_price data into two groups

    superhosts_listings = airbnb_data_final[airbnb_data_final['superhost_status'] == 'Yes']
    non_superhosts_listings = airbnb_data_final[airbnb_data_final['superhost_status'] == 'No']

    superhost_prices = superhosts_listings['log_price']
    non_superhost_prices = non_superhosts_listings['log_price']

    # Handle cases where one group might be empty (unlikely but safe)
    if superhost_prices.empty or non_superhost_prices.empty:
        print("Error: One or both host groups are empty. Cannot perform test.")
    else:
        print(f"Sample Size (Superhost): {len(superhost_prices)}")
        print(f"Sample Size (Non-Superhost): {len(non_superhost_prices)}")
        print(f"Mean log price (Superhost): {superhost_prices.mean():.4f}")
        print(f"Mean log price (Non-Superhost): {non_superhost_prices.mean():.4f}")
        print("-" * 50)

        # --- 2. ASSUMPTION CHECK: LEVENE'S TEST (Variance Equality) ---
        levene_stat, levene_p_value = stats.levene(
            superhost_prices,
            non_superhost_prices,
            center='mean'
        )

        # We will use Welch's t-test (equal_var=False) for robustness, regardless of this result.
        # But we report the Levene's test for completeness.
        print(f"Levene's Test p-value (Variance Equality): {levene_p_value:.4f}")
        if levene_p_value < alpha:
            print("Decision: Reject H0. Variances are NOT equal. (Welch's t-test is appropriate).")
        else:
            print("Decision: Fail to Reject H0. Variances are equal. (Standard t-test is acceptable).")
        print("-" * 50)

        # --- 3. HYPOTHESIS TEST: WELCH'S T-TEST ---
        t_stat, t_p_value = stats.ttest_ind(
            superhost_prices,
            non_superhost_prices,
            equal_var=False # Forces Welch's t-test for robustness
        )

        print(f"Welch's t-statistic: {t_stat:.4f}")
        print(f"p-value: {t_p_value:.10f}")
        print("-" * 50)

        # --- 4. INTERPRETATION ---
        print("FINAL CONCLUSION:")
        if t_p_value < alpha:
            print(f"Since the p-value ({t_p_value:.10f}) is less than alpha ({alpha}), we **REJECT the Null Hypothesis**.")
            print("There IS a **statistically significant difference** in the mean log price.")

            # Determine direction of difference
            mean_super = superhost_prices.mean()
            mean_non_super = non_superhost_prices.mean()

            if mean_super > mean_non_super:
                print(f"Superhosts have a higher mean log price (e^{mean_super:.2f}) than Non-Superhosts (e^{mean_non_super:.2f}).")
            else:
                print(f"Non-Superhosts have a higher mean log price (e^{mean_non_super:.2f}) than Superhosts (e^{mean_super:.2f}).")
        else:
            print(f"Since the p-value ({t_p_value:.10f}) is greater than alpha ({alpha}), we **FAIL TO REJECT the Null Hypothesis**.")
            print("There is **NO statistically significant difference** in the mean log price.")

except NameError:
    print("\nError: The DataFrame 'airbnb_data_cleaned' is not defined. Please ensure the data is loaded.")
except KeyError as e:
    print(f"\nError: One or more required columns are missing from the DataFrame: {e}. Please check 'log_price' and 'host_is_superhost'.")

--- Hypothesis Test for Mean LOG PRICE Difference (Superhost vs. Non-Superhost) ---

Sample Size (Superhost): 13606
Sample Size (Non-Superhost): 43782
Mean log price (Superhost): 5.0085
Mean log price (Non-Superhost): 4.9298
--------------------------------------------------
Levene's Test p-value (Variance Equality): 0.0000
Decision: Reject H0. Variances are NOT equal. (Welch's t-test is appropriate).
--------------------------------------------------
Welch's t-statistic: 9.9565
p-value: 0.0000000000
--------------------------------------------------
FINAL CONCLUSION:
Since the p-value (0.0000000000) is less than alpha (0.05), we **REJECT the Null Hypothesis**.
There IS a **statistically significant difference** in the mean log price.
Superhosts have a higher mean log price (e^5.01) than Non-Superhosts (e^4.93).


## 2.  Is there a statistically significant difference in the mean revenue of hosts with superhost status and hosts without?
- H0 (Null Hypothesis): The mean log revenue of Superhosts equals that of Non-Superhosts.
- Ha (Alternative Hypothesis): The mean log revenue of Superhosts is NOT equal to that of Non-Superhosts.

In [7]:
# --- 0. HYPOTHESIS DEFINITION ---
# H0 (Null Hypothesis): The mean log price of Superhosts equals that of Non-Superhosts.
# Ha (Alternative Hypothesis): The mean log price of Superhosts is NOT equal to that of Non-Superhosts.
alpha = 0.05 # Significance level

print("--- Hypothesis Test for Mean LOG Revenue Difference (Superhost vs. Non-Superhost) ---\n")

# --- 1. DATA PREPARATION ---
try:
    # Filter data to include listings with revenue > 0
    airbnb_data_final_nonzero_revenue = airbnb_data_final[airbnb_data_final['has_revenue'] == 1]
    # Separate the log_price data into two groups
    superhost_revenue = airbnb_data_final_nonzero_revenue[
        airbnb_data_final_nonzero_revenue['superhost_status'] == 'Yes'
    ]['log_estimated_revenue_l365d']

    non_superhost_revenue = airbnb_data_final_nonzero_revenue[
        airbnb_data_final_nonzero_revenue['superhost_status'] == 'No'
    ]['log_estimated_revenue_l365d']

    # Handle cases where one group might be empty (unlikely but safe)
    if superhost_revenue.empty or non_superhost_revenue.empty:
        print("Error: One or both host groups are empty. Cannot perform test.")
    else:
        print(f"Sample Size (Superhost): {len(superhost_revenue)}")
        print(f"Sample Size (Non-Superhost): {len(non_superhost_revenue)}")
        print(f"Mean log price (Superhost): {superhost_revenue.mean():.4f}")
        print(f"Mean log price (Non-Superhost): {non_superhost_revenue.mean():.4f}")
        print("-" * 50)

        # --- 2. ASSUMPTION CHECK: LEVENE'S TEST (Variance Equality) ---
        levene_stat, levene_p_value = stats.levene(
            superhost_revenue,
            non_superhost_revenue,
            center='mean'
        )

        # We will use Welch's t-test (equal_var=False) for robustness, regardless of this result.
        # But we report the Levene's test for completeness.
        print(f"Levene's Test p-value (Variance Equality): {levene_p_value:.4f}")
        if levene_p_value < alpha:
            print("Decision: Reject H0. Variances are NOT equal. (Welch's t-test is appropriate).")
        else:
            print("Decision: Fail to Reject H0. Variances are equal. (Standard t-test is acceptable).")
        print("-" * 50)

        # --- 3. HYPOTHESIS TEST: WELCH'S T-TEST ---
        t_stat, t_p_value = stats.ttest_ind(
            superhost_revenue,
            non_superhost_revenue,
            equal_var=False # Forces Welch's t-test for robustness
        )

        print(f"Welch's t-statistic: {t_stat:.4f}")
        print(f"p-value: {t_p_value:.10f}")
        print("-" * 50)

        # --- 4. INTERPRETATION ---
        print("FINAL CONCLUSION:")
        if t_p_value < alpha:
            print(f"Since the p-value ({t_p_value:.10f}) is less than alpha ({alpha}), we **REJECT the Null Hypothesis**.")
            print("There IS a **statistically significant difference** in the mean log revenue.")

            # Determine direction of difference
            mean_super = superhost_revenue.mean()
            mean_non_super = non_superhost_revenue.mean()

            if mean_super > mean_non_super:
                print(f"Superhosts have a higher mean log revenue (e^{mean_super:.2f}) than Non-Superhosts (e^{mean_non_super:.2f}).")
            else:
                print(f"Non-Superhosts have a higher mean log revenue (e^{mean_non_super:.2f}) than Superhosts (e^{mean_super:.2f}).")
        else:
            print(f"Since the p-value ({t_p_value:.10f}) is greater than alpha ({alpha}), we **FAIL TO REJECT the Null Hypothesis**.")
            print("There is **NO statistically significant difference** in the mean log revenue.")

except NameError:
    print("\nError: The DataFrame 'airbnb_data_cleaned' is not defined. Please ensure the data is loaded.")
except KeyError as e:
    print(f"\nError: One or more required columns are missing from the DataFrame: {e}. Please check 'log_price' and 'host_is_superhost'.")

--- Hypothesis Test for Mean LOG Revenue Difference (Superhost vs. Non-Superhost) ---

Sample Size (Superhost): 11842
Sample Size (Non-Superhost): 25850
Mean log price (Superhost): 9.4691
Mean log price (Non-Superhost): 8.6618
--------------------------------------------------
Levene's Test p-value (Variance Equality): 0.0000
Decision: Reject H0. Variances are NOT equal. (Welch's t-test is appropriate).
--------------------------------------------------
Welch's t-statistic: 61.3052
p-value: 0.0000000000
--------------------------------------------------
FINAL CONCLUSION:
Since the p-value (0.0000000000) is less than alpha (0.05), we **REJECT the Null Hypothesis**.
There IS a **statistically significant difference** in the mean log revenue.
Superhosts have a higher mean log revenue (e^9.47) than Non-Superhosts (e^8.66).


## Model Building
Finding the most significant factors affecting Revenue

In [12]:
# Preparing the data for the model
airbnb_data_model = airbnb_data_final_nonzero_revenue[['log_price', 'log_estimated_revenue_l365d', 'host_tenure_days',	'accommodates',	'number_of_reviews',
'host_response_rate',	'bathrooms_number',	'reviews_per_month',
'host_acceptance_rate',	'bedrooms',	'review_scores_rating',
'host_total_listings_count',	'amenities_count',	'review_scores_accuracy',
	'minimum_nights',	'review_scores_cleanliness',
'maximum_nights',	'review_scores_communication',
'availability_365',	'review_scores_value',
'latitude',  'longitude',
'calculated_host_listings_count']]


# --- Configuration ---
TARGET_COL = 'log_estimated_revenue_l365d' # Your dependent variable
STATUS_COL = 'superhost_status' # Include this as a feature in the overall model
RANDOM_STATE = 42

print("--- Running Random Forest Regressor on the FULL Dataset ---")

try:
    # --- 1. DEFINE FEATURES (X) AND TARGET (Y) ---
    # Assume all columns except the target are features.
    X_full = airbnb_data_model.drop(columns=[TARGET_COL, 'log_price',  ])
    y_full = airbnb_data_model[TARGET_COL]

    # --- 2. TRAIN/TEST SPLIT ---
    # Split the data to evaluate model performance later
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, test_size=0.2, random_state=RANDOM_STATE
    )

    print(f"Total training listings: {len(X_train)}")
    print(f"Total testing listings: {len(X_test)}")
    print("-" * 55)

    # --- 3. TRAIN THE MODEL ---
    # n_estimators=500 is a good starting point for a stable model
    rf_model_overall = RandomForestRegressor(
        n_estimators=500,
        random_state=RANDOM_STATE,
        n_jobs=-1 # Use all available cores
    )

    # Fit the model to the training data
    rf_model_overall.fit(X_train, y_train)

    # --- 4. EVALUATE MODEL PERFORMANCE (on Test Set) ---
    y_pred = rf_model_overall.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Model Evaluation (Test Set):")
    print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
    print(f"R^2 Score (Variance Explained): {r2:.4f}")
    print("-" * 55)

    # --- 5. EXTRACT FEATURE IMPORTANCE ---
    # This is the crucial step for your analysis
    importance_overall = pd.Series(
        rf_model_overall.feature_importances_,
        index=X_full.columns
    ).sort_values(ascending=False)

    print("Top 10 Feature Importance (Overall Market):")
    print(importance_overall.head(10))

except NameError:
    print("\nError: The DataFrame 'airbnb_data_final' is not defined. Please ensure your data is loaded.")
except KeyError as e:
    print(f"\nError: A required column is missing. Check if '{TARGET_COL}' or features are correctly named: {e}.")
except ValueError as e:
    if "could not convert string to float" in str(e):
        print(f"\nError: Features must be numeric. Ensure all columns in X_full have been properly one-hot encoded or transformed (especially '{STATUS_COL}' if it's still a string).")
    else:
        print(f"\nAn error occurred during model training: {e}")

--- Running Random Forest Regressor on the FULL Dataset ---
Total training listings: 30153
Total testing listings: 7539
-------------------------------------------------------
Model Evaluation (Test Set):
RMSE (Root Mean Squared Error): 0.6082
R^2 Score (Variance Explained): 0.7814
-------------------------------------------------------
Top 10 Feature Importance (Overall Market):
number_of_reviews    0.363589
reviews_per_month    0.169294
accommodates         0.137391
minimum_nights       0.051855
longitude            0.044298
latitude             0.038927
bathrooms_number     0.025063
availability_365     0.020750
amenities_count      0.017144
host_tenure_days     0.015687
dtype: float64


Finding the most significant factors affecting Price

In [13]:
# Preparing the data for the model
airbnb_data_model = airbnb_data_final[['log_price', 'log_estimated_revenue_l365d', 'host_tenure_days',	'accommodates',	'number_of_reviews',
'host_response_rate',	'bathrooms_number',	'reviews_per_month',
'host_acceptance_rate',	'bedrooms',	'review_scores_rating',
'host_total_listings_count',	'amenities_count',	'review_scores_accuracy',
	'minimum_nights',	'review_scores_cleanliness',
'maximum_nights',	'review_scores_communication',
'availability_365',	'review_scores_value',
'latitude',  'longitude',
'calculated_host_listings_count']]


# --- Configuration ---
TARGET_COL = 'log_price' # Your dependent variable
STATUS_COL = 'superhost_status' # Include this as a feature in the overall model
RANDOM_STATE = 42

print("--- Running Random Forest Regressor on the FULL Dataset ---")

try:
    # --- 1. DEFINE FEATURES (X) AND TARGET (Y) ---
    # Assume all columns except the target are features.
    X_full = airbnb_data_model.drop(columns=[TARGET_COL, 'log_estimated_revenue_l365d',  ])
    y_full = airbnb_data_model[TARGET_COL]

    # --- 2. TRAIN/TEST SPLIT ---
    # Split the data to evaluate model performance later
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, test_size=0.2, random_state=RANDOM_STATE
    )

    print(f"Total training listings: {len(X_train)}")
    print(f"Total testing listings: {len(X_test)}")
    print("-" * 55)

    # --- 3. TRAIN THE MODEL ---
    # n_estimators=500 is a good starting point for a stable model
    rf_model_overall = RandomForestRegressor(
        n_estimators=500,
        random_state=RANDOM_STATE,
        n_jobs=-1 # Use all available cores
    )

    # Fit the model to the training data
    rf_model_overall.fit(X_train, y_train)

    # --- 4. EVALUATE MODEL PERFORMANCE (on Test Set) ---
    y_pred = rf_model_overall.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Model Evaluation (Test Set):")
    print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
    print(f"R^2 Score (Variance Explained): {r2:.4f}")
    print("-" * 55)

    # --- 5. EXTRACT FEATURE IMPORTANCE ---
    # This is the crucial step for your analysis
    importance_overall = pd.Series(
        rf_model_overall.feature_importances_,
        index=X_full.columns
    ).sort_values(ascending=False)

    print("Top 10 Feature Importance (Overall Market):")
    print(importance_overall.head(10))

except NameError:
    print("\nError: The DataFrame 'airbnb_data_final' is not defined. Please ensure your data is loaded.")
except KeyError as e:
    print(f"\nError: A required column is missing. Check if '{TARGET_COL}' or features are correctly named: {e}.")
except ValueError as e:
    if "could not convert string to float" in str(e):
        print(f"\nError: Features must be numeric. Ensure all columns in X_full have been properly one-hot encoded or transformed (especially '{STATUS_COL}' if it's still a string).")
    else:
        print(f"\nAn error occurred during model training: {e}")

--- Running Random Forest Regressor on the FULL Dataset ---
Total training listings: 45910
Total testing listings: 11478
-------------------------------------------------------
Model Evaluation (Test Set):
RMSE (Root Mean Squared Error): 0.4052
R^2 Score (Variance Explained): 0.7364
-------------------------------------------------------
Top 10 Feature Importance (Overall Market):
accommodates                 0.386185
longitude                    0.102689
latitude                     0.098801
bathrooms_number             0.086040
host_total_listings_count    0.049666
availability_365             0.036622
reviews_per_month            0.030798
amenities_count              0.029986
host_tenure_days             0.028270
bedrooms                     0.023025
dtype: float64


## Superhost Status Deep-Dive
How does other factors affect the price of a listing hosted by a host with superhost status?


In [16]:
# Preparing the data for the model
airbnb_data_model = airbnb_data_final.loc[airbnb_data_final['superhost_status'] == 'Yes', ['log_price', 'log_estimated_revenue_l365d', 'host_tenure_days',	'accommodates',	'number_of_reviews',
'host_response_rate',	'bathrooms_number',	'reviews_per_month',
'host_acceptance_rate',	'bedrooms',	'review_scores_rating',
'host_total_listings_count',	'amenities_count',	'review_scores_accuracy',
	'minimum_nights',	'review_scores_cleanliness',
'maximum_nights',	'review_scores_communication',
'availability_365',	'review_scores_value',
'latitude',  'longitude',
'calculated_host_listings_count']]


# --- Configuration ---
TARGET_COL = 'log_price' # Your dependent variable
STATUS_COL = 'superhost_status' # Include this as a feature in the overall model
RANDOM_STATE = 42

print("--- Running Random Forest Regressor on the Superhost Listings Dataset ---")

try:
    # --- 1. DEFINE FEATURES (X) AND TARGET (Y) ---
    # Assume all columns except the target are features.
    X_full = airbnb_data_model.drop(columns=[TARGET_COL, 'log_estimated_revenue_l365d'])
    y_full = airbnb_data_model[TARGET_COL]

    # --- 2. TRAIN/TEST SPLIT ---
    # Split the data to evaluate model performance later
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, test_size=0.2, random_state=RANDOM_STATE
    )

    print(f"Total training listings: {len(X_train)}")
    print(f"Total testing listings: {len(X_test)}")
    print("-" * 55)

    # --- 3. TRAIN THE MODEL ---
    # n_estimators=500 is a good starting point for a stable model
    rf_model_overall = RandomForestRegressor(
        n_estimators=500,
        random_state=RANDOM_STATE,
        n_jobs=-1 # Use all available cores
    )

    # Fit the model to the training data
    rf_model_overall.fit(X_train, y_train)

    # --- 4. EVALUATE MODEL PERFORMANCE (on Test Set) ---
    y_pred = rf_model_overall.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Model Evaluation (Test Set):")
    print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
    print(f"R^2 Score (Variance Explained): {r2:.4f}")
    print("-" * 55)

    # --- 5. EXTRACT FEATURE IMPORTANCE ---
    # This is the crucial step for your analysis
    importance_overall = pd.Series(
        rf_model_overall.feature_importances_,
        index=X_full.columns
    ).sort_values(ascending=False)

    print("Top 10 Feature Importance (Overall Market):")
    print(importance_overall.head(10))

except NameError:
    print("\nError: The DataFrame 'airbnb_data_final' is not defined. Please ensure your data is loaded.")
except KeyError as e:
    print(f"\nError: A required column is missing. Check if '{TARGET_COL}' or features are correctly named: {e}.")
except ValueError as e:
    if "could not convert string to float" in str(e):
        print(f"\nError: Features must be numeric. Ensure all columns in X_full have been properly one-hot encoded or transformed (especially '{STATUS_COL}' if it's still a string).")
    else:
        print(f"\nAn error occurred during model training: {e}")

--- Running Random Forest Regressor on the Superhost Listings Dataset ---
Total training listings: 10884
Total testing listings: 2722
-------------------------------------------------------
Model Evaluation (Test Set):
RMSE (Root Mean Squared Error): 0.3287
R^2 Score (Variance Explained): 0.8366
-------------------------------------------------------
Top 10 Feature Importance (Overall Market):
accommodates                      0.474591
bathrooms_number                  0.110934
longitude                         0.085888
latitude                          0.084651
host_total_listings_count         0.029257
reviews_per_month                 0.025478
calculated_host_listings_count    0.024222
availability_365                  0.023781
amenities_count                   0.019824
bedrooms                          0.019689
dtype: float64


How does other factors affect the revenue of a listing hosted by a host with superhost status?

In [17]:
# Preparing the data for the model
airbnb_data_model = airbnb_data_final.loc[airbnb_data_final['superhost_status'] == 'Yes', ['log_price', 'log_estimated_revenue_l365d', 'host_tenure_days',	'accommodates',	'number_of_reviews',
'host_response_rate',	'bathrooms_number',	'reviews_per_month',
'host_acceptance_rate',	'bedrooms',	'review_scores_rating',
'host_total_listings_count',	'amenities_count',	'review_scores_accuracy',
	'minimum_nights',	'review_scores_cleanliness',
'maximum_nights',	'review_scores_communication',
'availability_365',	'review_scores_value',
'latitude',  'longitude',
'calculated_host_listings_count']]


# --- Configuration ---
TARGET_COL = 'log_estimated_revenue_l365d' # Your dependent variable
RANDOM_STATE = 42

print("--- Running Random Forest Regressor on the Superhost Listings Dataset ---")

try:
    # --- 1. DEFINE FEATURES (X) AND TARGET (Y) ---
    # Assume all columns except the target are features.
    X_full = airbnb_data_model.drop(columns=[TARGET_COL, 'log_price'])
    y_full = airbnb_data_model[TARGET_COL]

    # --- 2. TRAIN/TEST SPLIT ---
    # Split the data to evaluate model performance later
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, test_size=0.2, random_state=RANDOM_STATE
    )

    print(f"Total training listings: {len(X_train)}")
    print(f"Total testing listings: {len(X_test)}")
    print("-" * 55)

    # --- 3. TRAIN THE MODEL ---
    # n_estimators=500 is a good starting point for a stable model
    rf_model_overall = RandomForestRegressor(
        n_estimators=500,
        random_state=RANDOM_STATE,
        n_jobs=-1 # Use all available cores
    )

    # Fit the model to the training data
    rf_model_overall.fit(X_train, y_train)

    # --- 4. EVALUATE MODEL PERFORMANCE (on Test Set) ---
    y_pred = rf_model_overall.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("Model Evaluation (Test Set):")
    print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")
    print(f"R^2 Score (Variance Explained): {r2:.4f}")
    print("-" * 55)

    # --- 5. EXTRACT FEATURE IMPORTANCE ---
    # This is the crucial step for your analysis
    importance_overall = pd.Series(
        rf_model_overall.feature_importances_,
        index=X_full.columns
    ).sort_values(ascending=False)

    print("Top 10 Feature Importance (Overall Market):")
    print(importance_overall.head(10))

except NameError:
    print("\nError: The DataFrame 'airbnb_data_final' is not defined. Please ensure your data is loaded.")
except KeyError as e:
    print(f"\nError: A required column is missing. Check if '{TARGET_COL}' or features are correctly named: {e}.")
except ValueError as e:
    if "could not convert string to float" in str(e):
        print(f"\nError: Features must be numeric. Ensure all columns in X_full have been properly one-hot encoded or transformed (especially '{STATUS_COL}' if it's still a string).")
    else:
        print(f"\nAn error occurred during model training: {e}")

--- Running Random Forest Regressor on the Superhost Listings Dataset ---
Total training listings: 10884
Total testing listings: 2722
-------------------------------------------------------
Model Evaluation (Test Set):
RMSE (Root Mean Squared Error): 1.4735
R^2 Score (Variance Explained): 0.8008
-------------------------------------------------------
Top 10 Feature Importance (Overall Market):
number_of_reviews      0.552292
reviews_per_month      0.254647
accommodates           0.024487
longitude              0.019687
latitude               0.017954
availability_365       0.016458
minimum_nights         0.014554
host_tenure_days       0.012042
amenities_count        0.009707
review_scores_value    0.008433
dtype: float64


In [21]:

STATUS_COL = 'superhost_status'
STATUS_YES = 'Yes'
STATUS_NO = 'No'
ALPHA = 0.05

# Features to test for statistically significant differences
FEATURES_TO_TEST = [
    'review_scores_value',   # Service Quality / Value Perception
    'amenities_count',       # Quantifiable Investment / Comfort
    'reviews_per_month'      # Demand Velocity / Activity
]

print("--- Statistical Validation: Superhost vs. Non-Superhost Means ---\n")
results_summary = []

try:
    # 1. Separate the two populations for efficiency
    df_super = airbnb_data_final[airbnb_data_final[STATUS_COL] == STATUS_YES].copy()
    df_non_super = airbnb_data_final[airbnb_data_final[STATUS_COL] == STATUS_NO].copy()

    if df_super.empty or df_non_super.empty:
        print("Error: One or both host groups are empty after filtering.")
    else:
        # 2. Loop through each feature and perform the Welch's t-test
        for feature in FEATURES_TO_TEST:
            print(f"\nTesting Feature: {feature.upper()}")

            # Extract feature values, dropping NaNs if any (t-test requires complete data)
            super_values = df_super[feature].dropna()
            non_super_values = df_non_super[feature].dropna()

            # Calculate means for interpretation
            mean_super = super_values.mean()
            mean_non_super = non_super_values.mean()

            # Perform Welch's t-test (assumes unequal variances for robustness)
            t_stat, p_value = stats.ttest_ind(
                super_values,
                non_super_values,
                equal_var=False
            )

            # Determine significance and direction
            is_significant = p_value < ALPHA
            direction = "Higher" if mean_super > mean_non_super else "Lower"

            # Prepare summary row
            results_summary.append({
                'Feature': feature,
                'Mean Superhost': f'{mean_super:.4f}',
                'Mean Non-Superhost': f'{mean_non_super:.4f}',
                't-statistic': f'{t_stat:.4f}',
                'p-value': f'{p_value:.10f}',
                'Significant': is_significant,
                'Direction': direction
            })

            print(f"  Mean Superhost: {mean_super:.4f} | Mean Non-Superhost: {mean_non_super:.4f}")
            print(f"  p-value: {p_value:.10f}")
            print(f"  Result: {'REJECT H0 (Significant)' if is_significant else 'FAIL TO REJECT H0'}")

        # --- 3. Final Summary Table (Optional, for clear display) ---
        print("\n" + "=" * 60)
        print("FINAL STATISTICAL VALIDATION SUMMARY")
        print("=" * 60)

        summary_df = pd.DataFrame(results_summary)
        print(summary_df[['Feature', 'Mean Superhost', 'Mean Non-Superhost', 'p-value', 'Significant', 'Direction']].to_markdown(index=False))

except NameError:
    print("\nError: The DataFrame 'airbnb_data_final' is not defined. Please ensure your data is loaded.")
except KeyError as e:
    print(f"\nError: A required column is missing: {e}. Check if the features and status column are correctly named.")

--- Statistical Validation: Superhost vs. Non-Superhost Means ---


Testing Feature: REVIEW_SCORES_VALUE
  Mean Superhost: 4.4744 | Mean Non-Superhost: 3.5330
  p-value: 0.0000000000
  Result: REJECT H0 (Significant)

Testing Feature: AMENITIES_COUNT
  Mean Superhost: 38.7533 | Mean Non-Superhost: 29.2575
  p-value: 0.0000000000
  Result: REJECT H0 (Significant)

Testing Feature: REVIEWS_PER_MONTH
  Mean Superhost: 1.5577 | Mean Non-Superhost: 0.7925
  p-value: 0.0000000000
  Result: REJECT H0 (Significant)

FINAL STATISTICAL VALIDATION SUMMARY
| Feature             |   Mean Superhost |   Mean Non-Superhost |   p-value | Significant   | Direction   |
|:--------------------|-----------------:|---------------------:|----------:|:--------------|:------------|
| review_scores_value |           4.4744 |               3.533  |         0 | True          | Higher      |
| amenities_count     |          38.7533 |              29.2575 |         0 | True          | Higher      |
| reviews_per_mon

## Conclusion and Actionable Insights

Based on the analysis conducted in this notebook, the following conclusions and actionable insights can be drawn:

**Key Findings:**

*   **Superhosts have higher log prices and estimated revenue:** The hypothesis tests clearly demonstrate a statistically significant difference in both the mean log price and the mean log estimated revenue between Superhosts and Non-Superhosts. Superhosts tend to have higher prices and generate more revenue.
*   **Factors influencing revenue for Superhosts:** The Random Forest Regressor model trained on Superhost listings identified the number of reviews and reviews per month as the most significant factors influencing log estimated revenue for Superhosts. This suggests that a strong review profile is crucial for Superhost success in terms of revenue generation.
*   **Factors influencing price for Superhosts:** For Superhosts, the model indicated that `accommodates`, `bathrooms_number`, `longitude`, and `latitude` are the most important factors influencing log price. This highlights the importance of listing characteristics and location for pricing among Superhosts.
*   **Superhosts excel in key areas:** Statistical tests revealed that Superhosts have significantly higher mean values for `review_scores_value`, `amenities_count`, and `reviews_per_month` compared to Non-Superhosts. This suggests that Superhosts are generally providing better value, offering more amenities, and generating more reviews.
*   **Factors influencing price and revenue for the full dataset:**
    *   For **revenue**, the most significant factors were `number_of_reviews`, `reviews_per_month`, and `accommodates`, highlighting the importance of popularity, recent activity, and listing size.
    *   For **price**, the most impactful features were `accommodates`, `longitude`, `latitude`, and `bathrooms_number`, emphasizing the role of listing size, location, and amenities.

**Actionable Insights:**

*   **For Hosts aspiring to be Superhosts:** Focus on improving guest experience to increase review scores (especially value), enhance the number and quality of amenities offered, and actively encourage guests to leave reviews to boost `reviews_per_month`. These factors are strongly associated with both Superhost status and higher revenue.
*   **For Existing Superhosts:** Continue to prioritize obtaining positive reviews and maintaining a high volume of reviews per month. Additionally, optimize pricing strategies based on the capacity (`accommodates`), number of bathrooms, and location of your listings, as these are the primary drivers of price for Superhosts. Consider investing in amenities and ensuring the value offered aligns with the price.
*   **For All Hosts:** Regardless of Superhost status, increasing the number and frequency of reviews is crucial for boosting revenue. Additionally, understanding the impact of listing characteristics (`accommodates`, `bathrooms_number`) and location on pricing is essential for setting competitive prices.
*   **For the Platform (Airbnb):** The findings support the value of the Superhost program as an indicator of higher-performing listings. The platform could potentially use these insights to provide targeted recommendations to Non-Superhosts on areas for improvement to achieve Superhost status and increase their earnings. Highlighting listings with strong review profiles and ample amenities could also benefit guests.

This analysis provides a foundation for understanding the impact of Superhost status and the factors influencing price and revenue. Further analysis could delve deeper into the interactions between these variables and explore other potential drivers of success on the platform.