In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
import gradio as gr
import pickle
warnings.filterwarnings('ignore')

In [56]:
print("=" * 60)
print("STEP 1: LOADING AND EXPLORING DATA")
print("=" * 60)

df = pd.read_csv('/content/listings_clean_v1.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:\n{df.head()}")
print(f"\nData types:\n{df.dtypes}")
print(f"\nMissing values:\n{df.isnull().sum()}")


STEP 1: LOADING AND EXPLORING DATA
Dataset shape: (18539, 21)

First few rows:
      id                                               name  host_id  \
0  10803             Room in Deco Apartment, Brunswick East    38901   
1  43429                 Tranquil Javanese Studio and Pond!   189684   
2  44082                      Queen Room in Beautiful House   193031   
3  47100  Cosy, cute comfortable little home in top loca...   212071   
4  51592                   Central City Warehouse Apartment   190879   

  host_name  neighbourhood_group neighbourhood   latitude   longitude  \
0   Lindsay                  NaN      Moreland -37.766060  144.979510   
1     Allan                  NaN        Monash -37.899830  145.115790   
2     Vicki                  NaN     Frankston -38.147680  145.143640   
3     Loren                  NaN         Yarra -37.818371  145.005005   
4  Michelle                  NaN     Melbourne -37.812660  144.963130   

         room_type  price  ...  number_of_reviews

In [57]:
print("\n" + "=" * 60)
print("STEP 2: DATA CLEANING")
print("=" * 60)

df_clean = df.copy()

# Drop rows where price (target) is missing
if 'price' in df_clean.columns:
    df_clean = df_clean.dropna(subset=['price'])

# For numerical columns: fill with median
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if df_clean[col].isnull().sum() > 0:
        median_val = df_clean[col].median()
        if pd.isna(median_val):
            df_clean[col].fillna(0, inplace=True)
        else:
            df_clean[col].fillna(median_val, inplace=True)

# For categorical columns: fill with mode or 'Unknown'
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_clean[col].isnull().sum() > 0:
        mode_val = df_clean[col].mode()
        if len(mode_val) > 0:
            df_clean[col].fillna(mode_val[0], inplace=True)
        else:
            df_clean[col].fillna('Unknown', inplace=True)

# Drop any remaining rows with NaN
df_clean = df_clean.dropna()

print("Missing values after cleaning:")
print(f"Total NaN values: {df_clean.isnull().sum().sum()}")
print(f"Dataset shape: {df_clean.shape}")

# Remove outliers using IQR method
print("\nRemoving outliers...")
Q1 = df_clean['price'].quantile(0.25)
Q3 = df_clean['price'].quantile(0.75)
IQR = Q3 - Q1
df_clean = df_clean[(df_clean['price'] >= Q1 - 1.5 * IQR) &
                     (df_clean['price'] <= Q3 + 1.5 * IQR)]

# Final check for NaN
df_clean = df_clean.dropna()
print(f"Dataset shape after outlier removal: {df_clean.shape}")
print(f"Final NaN check: {df_clean.isnull().sum().sum()}")



STEP 2: DATA CLEANING
Missing values after cleaning:
Total NaN values: 0
Dataset shape: (18539, 21)

Removing outliers...
Dataset shape after outlier removal: (17106, 21)
Final NaN check: 0


In [58]:
print("\n" + "=" * 60)
print("STEP 3: FEATURE ENGINEERING")
print("=" * 60)

X = df_clean.drop('price', axis=1)
y = df_clean['price']

# Remove non-numeric and problematic columns
for col in X.columns:
    if X[col].dtype == 'object':
        # Try to encode categorical, skip if too many unique values
        if X[col].nunique() > 100:
            X = X.drop(col, axis=1)

# Encode categorical variables
le_dict = {}
categorical_features = X.select_dtypes(include=['object']).columns

for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le

# Final NaN check in X
X = X.replace([np.inf, -np.inf], np.nan)
X = X.dropna()
y = y[X.index]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Final NaN in X: {X.isnull().sum().sum()}")
print(f"Final NaN in y: {y.isnull().sum()}")
print(f"Encoded categorical features: {list(categorical_features)}")

# Store feature names and min-max for UI
feature_names = X.columns.tolist()
feature_ranges = {col: (X[col].min(), X[col].max()) for col in feature_names}


STEP 3: FEATURE ENGINEERING
Features shape: (17106, 17)
Target shape: (17106,)
Final NaN in X: 0
Final NaN in y: 0
Encoded categorical features: ['neighbourhood', 'room_type']


In [59]:
print("\n" + "=" * 60)
print("STEP 4: CREATING VISUALIZATIONS")
print("=" * 60)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

axes[0, 0].hist(y, bins=50, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Price Distribution', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Price')
axes[0, 0].set_ylabel('Frequency')

corr_matrix = X.corr()
top_features = corr_matrix.iloc[:, 0].abs().nlargest(6).index
sns.heatmap(X[top_features].corr(), annot=True, cmap='coolwarm', ax=axes[0, 1], fmt='.2f')
axes[0, 1].set_title('Feature Correlation Matrix', fontsize=12, fontweight='bold')

feature_corr = X.corrwith(y).abs().sort_values(ascending=False).head(10)
axes[1, 0].barh(range(len(feature_corr)), feature_corr.values, color='green')
axes[1, 0].set_yticks(range(len(feature_corr)))
axes[1, 0].set_yticklabels(feature_corr.index)
axes[1, 0].set_title('Top 10 Feature Correlations with Price', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Absolute Correlation')

axes[1, 1].text(0.5, 0.5, 'Feature Analysis Complete',
                ha='center', va='center', transform=axes[1, 1].transAxes, fontsize=14)
axes[1, 1].set_title('Dataset Overview', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('01_exploratory_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Saved: 01_exploratory_analysis.png")
plt.close()


STEP 4: CREATING VISUALIZATIONS
✓ Saved: 01_exploratory_analysis.png


In [60]:
print("\n" + "=" * 60)
print("STEP 5: TRAIN-TEST SPLIT & FEATURE SCALING")
print("=" * 60)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled using StandardScaler")



STEP 5: TRAIN-TEST SPLIT & FEATURE SCALING
Training set size: (13684, 17)
Test set size: (3422, 17)
Features scaled using StandardScaler


In [62]:
print("\n" + "=" * 60)
print("STEP 6: BUILDING AND TRAINING MODELS")
print("=" * 60)

models = {
    'Linear Regression': LinearRegression(),
    'Ridge (α=100.0)': Ridge(alpha=100.0),
    'Lasso (α=1.0)': Lasso(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.01, max_depth=3, min_samples_leaf=5, subsample=0.8, validation_fraction=0.1, n_iter_no_change=10, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {'model': model, 'rmse': rmse, 'mae': mae, 'r2': r2, 'pred': y_pred}

    print(f"\n{name}:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  R²:   {r2:.4f}")


STEP 6: BUILDING AND TRAINING MODELS

Linear Regression:
  RMSE: 23.2691
  MAE:  16.7375
  R²:   0.9217

Ridge (α=100.0):
  RMSE: 23.3189
  MAE:  16.7813
  R²:   0.9213

Lasso (α=1.0):
  RMSE: 23.4795
  MAE:  17.0245
  R²:   0.9203

Random Forest:
  RMSE: 14.3543
  MAE:  9.3705
  R²:   0.9702

Gradient Boosting:
  RMSE: 11.8810
  MAE:  9.1136
  R²:   0.9796


In [52]:
print("\n" + "=" * 60)
print("STEP 8: MODEL COMPARISON")
print("=" * 60)

comparison_data = {
    'Model': list(results.keys()),
    'RMSE': [f"{results[m]['rmse']:.4f}" for m in results.keys()],
    'MAE': [f"{results[m]['mae']:.4f}" for m in results.keys()],
    'R² Score': [f"{results[m]['r2']:.4f}" for m in results.keys()]
}

comparison_df = pd.DataFrame(comparison_data)
print("\n" + comparison_df.to_string(index=False))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

model_names = list(results.keys())
rmse_vals = [results[m]['rmse'] for m in model_names]
r2_vals = [results[m]['r2'] for m in model_names]

sorted_idx = sorted(range(len(r2_vals)), key=lambda i: r2_vals[i], reverse=True)
sorted_names = [model_names[i] for i in sorted_idx]
sorted_rmse = [rmse_vals[i] for i in sorted_idx]
sorted_r2 = [r2_vals[i] for i in sorted_idx]

axes[0].barh(sorted_names, sorted_rmse, color='coral')
axes[0].set_xlabel('RMSE (Lower is Better)', fontweight='bold')
axes[0].set_title('Model RMSE Comparison', fontsize=12, fontweight='bold')
axes[0].invert_yaxis()

axes[1].barh(sorted_names, sorted_r2, color='lightgreen')
axes[1].set_xlabel('R² Score (Higher is Better)', fontweight='bold')
axes[1].set_title('Model R² Score Comparison', fontsize=12, fontweight='bold')
axes[1].invert_yaxis()

plt.tight_layout()
plt.savefig('02_model_comparison.png', dpi=300, bbox_inches='tight')
print("\n✓ Saved: 02_model_comparison.png")
plt.close()


STEP 8: MODEL COMPARISON

            Model    RMSE     MAE R² Score
Linear Regression 23.2691 16.7375   0.9217
  Ridge (α=100.0) 23.3189 16.7813   0.9213
    Lasso (α=1.0) 23.4795 17.0245   0.9203
    Decision Tree  3.1935  2.6970   0.9985
    Random Forest 14.3543  9.3705   0.9702
Gradient Boosting 11.8810  9.1136   0.9796

✓ Saved: 02_model_comparison.png


In [54]:
print("\n" + "=" * 60)
print("STEP 9: BEST MODEL ANALYSIS")
print("=" * 60)

best_model_name = comparison_df.iloc[0]['Model']
best_model = results[best_model_name]['model']
best_pred = results[best_model_name]['pred']
best_rmse = results[best_model_name]['rmse']
best_r2 = results[best_model_name]['r2']

print(f"\nBest Model: {best_model_name}")
print(f"RMSE: {best_rmse:.4f}")
print(f"R²: {best_r2:.4f}")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].scatter(y_test, best_pred, alpha=0.5, color='blue')
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual Price', fontweight='bold')
axes[0].set_ylabel('Predicted Price', fontweight='bold')
axes[0].set_title('Actual vs Predicted Prices', fontsize=12, fontweight='bold')
axes[0].grid(alpha=0.3)

residuals = y_test - best_pred
axes[1].scatter(best_pred, residuals, alpha=0.5, color='green')
axes[1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1].set_xlabel('Predicted Price', fontweight='bold')
axes[1].set_ylabel('Residuals', fontweight='bold')
axes[1].set_title('Residuals Plot', fontsize=12, fontweight='bold')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('03_best_model_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Saved: 03_best_model_analysis.png")
plt.close()



STEP 9: BEST MODEL ANALYSIS

Best Model: Linear Regression
RMSE: 23.2691
R²: 0.9217
✓ Saved: 03_best_model_analysis.png


In [49]:
print("\n" + "=" * 60)
print("STEP 10: CROSS-VALIDATION & OVERFITTING ANALYSIS")
print("=" * 60)

cv_scores = cross_val_score(best_model, X_train_scaled, y_train,
                            cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Calculate overfitting metric
train_pred = best_model.predict(X_train_scaled)
train_r2 = r2_score(y_train, train_pred)
test_r2 = best_r2

print(f"\n--- Overfitting Check ---")
print(f"Training R²: {train_r2:.4f}")
print(f"Testing R²:  {test_r2:.4f}")
print(f"Difference:  {(train_r2 - test_r2):.4f} (should be < 0.05)")

if (train_r2 - test_r2) < 0.05:
    print("✓ Model is NOT overfitting - Good generalization!")
else:
    print("⚠ Model shows signs of overfitting")

# ============= FINAL SUMMARY =============
print("\n" + "=" * 60)
print("FINAL SUMMARY")
print("=" * 60)
print(f"\n✓ Best Model: {best_model_name}")
print(f"✓ Test RMSE: {best_rmse:.4f}")
print(f"✓ Test R² Score: {best_r2:.4f}")
print(f"✓ Cross-validation R²: {cv_scores.mean():.4f}")
print(f"\nDataset size: {len(df_clean)}")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Number of features: {X.shape[1]}")

# Save model and scaler
pickle.dump(best_model, open('best_model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))
pickle.dump(feature_names, open('feature_names.pkl', 'wb'))
pickle.dump(le_dict, open('le_dict.pkl', 'wb'))
print("\n✓ Models saved for Gradio UI")


STEP 10: CROSS-VALIDATION & OVERFITTING ANALYSIS
Cross-validation R² scores: [0.91633017 0.92097101 0.9210453  0.91908777 0.91804384]
Mean CV R²: 0.9191 (+/- 0.0018)

--- Overfitting Check ---
Training R²: 0.9194
Testing R²:  0.9217
Difference:  -0.0022 (should be < 0.05)
✓ Model is NOT overfitting - Good generalization!

FINAL SUMMARY

✓ Best Model: Linear Regression
✓ Test RMSE: 23.2691
✓ Test R² Score: 0.9217
✓ Cross-validation R²: 0.9191

Dataset size: 17106
Training samples: 13684
Testing samples: 3422
Number of features: 17

✓ Models saved for Gradio UI


In [50]:
print("\n" + "=" * 60)
print("STEP 11: LAUNCHING GRADIO UI")
print("=" * 60)

def predict_price(*args):
    """
    Predict Airbnb price based on input features
    """
    try:
        # Create input array from user inputs
        input_data = np.array(list(args)).reshape(1, -1)

        # Scale the input
        input_scaled = scaler.transform(input_data)

        # Make prediction
        predicted_price = best_model.predict(input_scaled)[0]

        # Format output
        result = f"""
        <div style='text-align: center; padding: 20px; background-color: #f0f0f0; border-radius: 10px;'>
            <h2 style='color: #2c3e50; margin: 0;'>Predicted Airbnb Price</h2>
            <p style='font-size: 14px; color: #7f8c8d; margin-top: 5px;'>{best_model_name}</p>
            <h1 style='color: #27ae60; font-size: 48px; margin: 20px 0;'>${predicted_price:,.2f}</h1>
            <p style='color: #95a5a6; font-size: 12px;'>
                Model Accuracy (R²): {best_r2:.2%} | RMSE: ${best_rmse:.2f}
            </p>
        </div>
        """
        return result
    except Exception as e:
        return f"<div style='color: red; padding: 20px;'><h3>Error:</h3><p>{str(e)}</p></div>"

# Create Gradio input components
inputs = []
for i, feature in enumerate(feature_names):
    min_val, max_val = feature_ranges[feature]
    inputs.append(
        gr.Slider(
            minimum=min_val,
            maximum=max_val,
            value=(min_val + max_val) / 2,
            label=f"{feature} ({min_val:.0f} - {max_val:.0f})",
            step=0.1
        )
    )

# Create Gradio interface
interface = gr.Interface(
    fn=predict_price,
    inputs=inputs,
    outputs=gr.HTML(),
    title="🏠 Airbnb Rental Price Predictor",
    description=f"""
    <div style='text-align: center; padding: 20px;'>
        <h3>Predict Airbnb listing prices using Machine Learning</h3>
        <p><strong>Model:</strong> {best_model_name}</p>
        <p><strong>Accuracy (R² Score):</strong> {best_r2:.2%}</p>
        <p><strong>RMSE:</strong> ${best_rmse:.2f}</p>
        <p style='color: #7f8c8d;'>Adjust the sliders below to predict rental prices based on property features</p>
    </div>
    """,
    theme=gr.themes.Soft(),
    examples=[
        [(feature_ranges[f][0] + feature_ranges[f][1]) / 2 for f in feature_names],
        [(feature_ranges[f][1] * 0.75) for f in feature_names],
        [(feature_ranges[f][1] * 0.25) for f in feature_names],
    ]
)

if __name__ == "__main__":
    print("\n" + "=" * 60)
    print("🚀 Launching Gradio UI...")
    print("=" * 60)
    print("\n📊 All visualizations saved:")
    print("  ✓ 01_exploratory_analysis.png")
    print("  ✓ 02_model_comparison.png")
    print("  ✓ 03_best_model_analysis.png")
    print("\n💾 Model artifacts saved:")
    print("  ✓ best_model.pkl")
    print("  ✓ scaler.pkl")
    print("  ✓ feature_names.pkl")
    print("  ✓ le_dict.pkl")
    print("\n" + "=" * 60)

    interface.launch(share=True)


STEP 11: LAUNCHING GRADIO UI

🚀 Launching Gradio UI...

📊 All visualizations saved:
  ✓ 01_exploratory_analysis.png
  ✓ 02_model_comparison.png
  ✓ 03_best_model_analysis.png

💾 Model artifacts saved:
  ✓ best_model.pkl
  ✓ scaler.pkl
  ✓ feature_names.pkl
  ✓ le_dict.pkl

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://34d07d54f6be570b9a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
