In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import joblib  # For saving models and scalers

# Load the DataFrame
df = pd.read_csv(r"C:\Users\prabh\Downloads\airbnb_project\data\listing_1_With_Amenities.csv")

# Define the bins and labels for price ranges
bins = [0, 150, 300, 500, 1000, df['price'].max()]
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Create a new column 'Price_Range' in the DataFrame
df['Price_Range'] = pd.cut(df['price'], bins=bins, labels=labels, include_lowest=True)

# Create DataFrames based on the price range
df_price_ranges = {
    label: df[df['Price_Range'] == label].copy()
    for label in labels
}

# Define the columns to scale
standard_scaler_columns = [
    'minimum_nights', 'maximum_nights', 'availability_30', 'number_of_reviews',
    'review_scores_rating', 'calculated_host_listings_count', 'count_amenities',
    'data_year', 'data_month'
]

min_max_scaler_columns = [
    'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 'host_response_score',
    'host_score', 'property_description_score'
]

# Define the categorical columns
categorical_columns = ['neighbourhood_cleansed', 'property_type']

# Function to scale and split the data, and return scalers
def scale_and_split(df, standard_scaler_columns, min_max_scaler_columns, categorical_columns):
    # Remove rows with missing target
    df = df.dropna(subset=['price'])

    # Handle 'bathrooms_text' if it's a string (convert to numeric)
    if 'bathrooms_text' in df.columns and df['bathrooms_text'].dtype == object:
        df['bathrooms_text'] = df['bathrooms_text'].str.extract(r'(\d+\.?\d*)').astype(float)

    # Fill missing numerical values with median
    df[standard_scaler_columns] = df[standard_scaler_columns].fillna(df[standard_scaler_columns].median())
    df[min_max_scaler_columns] = df[min_max_scaler_columns].fillna(df[min_max_scaler_columns].median())

    # Standard Scaler
    standard_scaler = StandardScaler()
    df[standard_scaler_columns] = standard_scaler.fit_transform(df[standard_scaler_columns])

    # MinMax Scaler
    min_max_scaler = MinMaxScaler()
    df[min_max_scaler_columns] = min_max_scaler.fit_transform(df[min_max_scaler_columns])

    # Convert categorical columns to 'category' dtype
    for col in categorical_columns:
        df[col] = df[col].astype('category')

    # Split the data into features and target
    X = df.drop(['price', 'Price_Range', 'id'], axis=1)  # Exclude 'id' if not used
    y = df['price']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, standard_scaler, min_max_scaler

# Function to train and evaluate the model
def train_and_evaluate(X_train, X_test, y_train, y_test, price_range_label):
    # Define the model
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist',
        enable_categorical=True,
        eval_metric='mae',
        random_state=42
    )

    # Define the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200, 300, 500],
        'max_depth': [3, 5, 7, 9, 12],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3],
        'min_child_weight': [1, 3, 5, 7],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [1, 1.5, 2, 3],
    }

    # Define the RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=50,
        scoring='neg_mean_absolute_error',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    # Fit the random search model
    random_search.fit(X_train, y_train)

    # Get the best model
    best_model = random_search.best_estimator_

    # Predict on test data
    y_pred = best_model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Extract and print detailed results
    results_df = pd.DataFrame(random_search.cv_results_)
    results_df['mean_absolute_error'] = -results_df['mean_test_score']
    results_df = results_df.sort_values('rank_test_score')

    print(f"\nDetailed CV Results for Price Range '{price_range_label}':")
    for index, row in results_df.iterrows():
        print(f"Rank: {row['rank_test_score']}")
        print(f"Parameters: {row['params']}")
        print(f"Mean MAE: {row['mean_absolute_error']}")
        print("-" * 50)

    return best_model, mae, rmse, mse, r2, results_df

# Train and evaluate the model for each price range
results = []
detailed_results = []
models = {}    # Dictionary to store models
scalers = {}   # Dictionary to store scalers

for label in labels:
    print(f"\nTraining model for Price Range: {label}")
    df_price_range = df_price_ranges[label]
    
    # Check if the DataFrame is empty
    if df_price_range.empty:
        print(f"No data available for price range '{label}'. Skipping.")
        continue

    # Scale and split the data, and get scalers
    X_train, X_test, y_train, y_test, standard_scaler, min_max_scaler = scale_and_split(
        df_price_range,
        standard_scaler_columns,
        min_max_scaler_columns,
        categorical_columns
    )

    # Train and evaluate the model
    model, mae, rmse, mse, r2, cv_results = train_and_evaluate(
        X_train, X_test, y_train, y_test, label
    )

    # Save the results
    results.append({
        'Price_Range': label,
        'MAE': mae,
        'RMSE': rmse,
        'MSE': mse,
        'R2 Score': r2
    })
    cv_results['Price_Range'] = label
    detailed_results.append(cv_results)

    # Save the model to a file
    model_filename = f'xgboost_model_{label}.pkl'
    joblib.dump(model, model_filename)
    print(f"Model saved to {model_filename}")

    # Save the scalers to files
    standard_scaler_filename = f'standard_scaler_{label}.pkl'
    min_max_scaler_filename = f'min_max_scaler_{label}.pkl'
    joblib.dump(standard_scaler, standard_scaler_filename)
    joblib.dump(min_max_scaler, min_max_scaler_filename)
    print(f"Scalers saved to {standard_scaler_filename} and {min_max_scaler_filename}")

    # Store the model and scalers in dictionaries
    models[label] = model
    scalers[label] = {
        'standard_scaler': standard_scaler,
        'min_max_scaler': min_max_scaler
    }

# Create a DataFrame with the evaluation results
evaluation_results = pd.DataFrame(results)
print("\nFinal Evaluation Results:")
print(evaluation_results)

# Combine detailed results from all price ranges
all_cv_results = pd.concat(detailed_results, ignore_index=True)



Training model for Price Range: Very Low
Fitting 3 folds for each of 50 candidates, totalling 150 fits

Detailed CV Results for Price Range 'Very Low':
Rank: 1
Parameters: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 12, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Mean MAE: 8.498119193013403
--------------------------------------------------
Rank: 2
Parameters: {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0.5, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 12, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1.0}
Mean MAE: 8.637193362637673
--------------------------------------------------
Rank: 3
Parameters: {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 1.0}
Mean MAE: 8.667590991405179
--------------------------------------------------
Rank: 4
Parameters: {'subsampl

### Final Evaluation Results:
  Price_Range         MAE        RMSE            MSE  R2 Score      
0    Very Low    7.529744   12.101891     146.455757  0.867153      
1         Low   15.196366   23.467307     550.714509  0.687373      
2      Medium   25.555053   38.561967    1487.025313  0.556018      
3        High   52.029769   81.869432    6702.603948  0.705965      
4   Very High  276.478711  781.792367  611199.304735  0.853135      

In [17]:
# Save the feature columns used during training
feature_columns = X_train.columns.tolist()  # Assuming the last X_train contains all features
joblib.dump(feature_columns, 'feature_columns.pkl')
print("Feature columns saved to 'feature_columns.pkl'")

Feature columns saved to 'feature_columns.pkl'


In [23]:
# Load the DataFrame
df = pd.read_csv(r"C:\Users\prabh\Downloads\airbnb_project\data\listing_1_With_Amenities.csv")
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246186 entries, 0 to 246185
Data columns (total 125 columns):
 #    Column                                 Dtype  
---   ------                                 -----  
 0    id                                     int64  
 1    neighbourhood_cleansed                 object 
 2    property_type                          object 
 3    accommodates                           int64  
 4    bathrooms_text                         float64
 5    bedrooms                               float64
 6    beds                                   float64
 7    price                                  float64
 8    minimum_nights                         int64  
 9    maximum_nights                         int64  
 10   minimum_nights_avg_ntm                 float64
 11   maximum_nights_avg_ntm                 float64
 12   availability_30                        int64  
 13   number_of_reviews                      int64  
 14   review_scores_rating              

In [46]:
class AirbnbPricePredictor:
    def __init__(self, feature_columns_path='feature_columns.pkl'):
        self.feature_columns = joblib.load(feature_columns_path)
        self.models = {}
        self.scalers = {}
        for label in labels:
            model_filename = f'xgboost_model_{label}.pkl'
            standard_scaler_filename = f'standard_scaler_{label}.pkl'
            min_max_scaler_filename = f'min_max_scaler_{label}.pkl'
            try:
                self.models[label] = joblib.load(model_filename)
                self.scalers[label] = {
                    'standard_scaler': joblib.load(standard_scaler_filename),
                    'min_max_scaler': joblib.load(min_max_scaler_filename)
                }
            except FileNotFoundError:
                print(f"Model or scalers for '{label}' not found. Skipping.")

    def predict(self, listing_id, availability_30_value, data_year_value, data_month_value):
        # Retrieve the listing data
        df_listing = df[df['id'] == listing_id]
        if df_listing.empty:
            print(f"Listing ID {listing_id} not found.")
            return None, None

        listing_data = df_listing.iloc[0].copy()
        listing_data['availability_30'] = availability_30_value
        listing_data['data_year'] = data_year_value
        listing_data['data_month'] = data_month_value

        # Determine price range
        original_price = listing_data['price']
        price_range_label = pd.cut(
            [original_price],
            bins=[0, 150, 300, 500, 1000, df['price'].max()],
            labels=labels,
            include_lowest=True
        )[0]

        # Load the corresponding model and scalers
        model = self.models.get(price_range_label)
        scalers = self.scalers.get(price_range_label)

        if not model or not scalers:
            print(f"Model or scalers for price range '{price_range_label}' not available.")
            return None, None

        # Prepare input data
        input_data = listing_data.drop(['id']).to_frame().T
        missing_columns = set(self.feature_columns) - set(input_data.columns)
        for col in missing_columns:
            if col in categorical_columns:
                input_data[col] = 'Unknown'
            else:
                input_data[col] = 0
        input_data = input_data.reindex(columns=self.feature_columns, fill_value=0)

        # Convert categorical columns to 'category' dtype
        for col in categorical_columns:
            input_data[col] = input_data[col].astype('category')

        # Identify and convert any remaining object dtype columns to numeric
        object_columns = input_data.select_dtypes(include=['object']).columns.tolist()
        for col in object_columns:
            input_data[col] = pd.to_numeric(input_data[col], errors='coerce').fillna(0).astype(int)

        # Scale numerical features
        input_data[standard_scaler_columns] = scalers['standard_scaler'].transform(input_data[standard_scaler_columns])
        input_data[min_max_scaler_columns] = scalers['min_max_scaler'].transform(input_data[min_max_scaler_columns])

        # Ensure all features are present and correctly ordered
        input_data = input_data[self.feature_columns]

        # Predict the price
        predicted_price = model.predict(input_data)[0]

        return predicted_price, price_range_label


In [48]:
# Initialize the predictor
price_predictor = AirbnbPricePredictor()

# Predict the price for a specific listing
test_listing_id = df['id'].iloc[0]  # Replace with an actual listing ID
availability_30_value = 2
current_year = 2024
current_month = 11

predicted_price, used_price_range = price_predictor.predict(
    test_listing_id,
    availability_30_value,
    current_year,
    current_month
)

if predicted_price is not None:
    print(f"Predicted Optimal Price for Listing ID {test_listing_id}: ${predicted_price:.2f}")
    print(f"Used model for price range: {used_price_range}")


Predicted Optimal Price for Listing ID 1419: $330.66
Used model for price range: Medium
