In [2]:
# ===============================
# CELL 1: Imports and Setup
# ===============================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

print("✅ All libraries imported successfully!")
print("📄 Download data from: http://insideairbnb.com/get-the-data/")
print("📁 Required file: listings.csv (from NYC detailed listings)")

✅ All libraries imported successfully!
📄 Download data from: http://insideairbnb.com/get-the-data/
📁 Required file: listings.csv (from NYC detailed listings)


In [3]:
# ===============================
# CELL 2: Load Dataset with Error Handling
# ===============================

try:
    df = pd.read_csv("airbnb_dataset.csv")
    print(f"✅ Dataset loaded successfully!")
    print(f"Initial shape: {df.shape}")
    print(f"Total columns available: {len(df.columns)}")
except FileNotFoundError:
    print("❌ ERROR: 'Airbnb_Dataset.csv' not found!")
    print("📋 Instructions:")
    print("   1. Go to http://insideairbnb.com/get-the-data/")
    print("   2. Select New York City")
    print("   3. Download 'listings.csv.gz' (detailed listings)")
    print("   4. Extract the file and place 'listings.csv' in your working directory")
    raise
except Exception as e:
    print(f"❌ Error loading data: {e}")
    raise

# Quick data exploration
print("\nFirst few rows:")
print(df.head(2))
print("\nData types summary:")
print(df.dtypes.value_counts())
print("\nDataset memory usage:")
print(f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

✅ Dataset loaded successfully!
Initial shape: (36111, 79)
Total columns available: 79

First few rows:
         id                            listing_url       scrape_id  \
0  40824219  https://www.airbnb.com/rooms/40824219  20251001171547   
1  40833186  https://www.airbnb.com/rooms/40833186  20251001171547   

  last_scraped           source                                         name  \
0   2025-10-02      city scrape   Room close to  Manhattan for FEMALE guests   
1   2025-10-02  previous scrape  Soho LES East village private room downtown   

                                         description  \
0  This cozy spacious room includes a twin size b...   
1                                                NaN   

                               neighborhood_overview  \
0  Sunnyside is a safe residental area. <br />The...   
1                                                NaN   

                                         picture_url    host_id  ...  \
0  https://a0.muscache.com/pictures

In [4]:
# ===============================
# CELL 3: Clean & Normalize 'price' 
# ===============================

if 'price' in df.columns:
    print(f"Original price format example: {df['price'].iloc[0] if not df.empty else 'N/A'}")
    print(f"Price column type: {df['price'].dtype}")
    
    # Advanced price cleaning to handle formats like "$150.00", "$1,200.50"
    df['price'] = (
        df['price']
        .astype(str)
        .str.replace('$', '', regex=False)      # Remove $ symbol
        .str.replace(',', '', regex=False)      # Remove commas
        .str.replace(r'[^0-9.]', '', regex=True) # Keep only numbers and dots
        .replace('', np.nan)
        .astype(float)
    )
    
    # Data validation and outlier removal
    print(f"\nPrice statistics before cleaning:")
    print(df['price'].describe())
    
    original_count = len(df)
    
    # Remove unrealistic prices (too low or too high)
    df = df[(df['price'] >= 10) & (df['price'] <= 10000)]
    df = df.dropna(subset=['price'])
    
    removed_count = original_count - len(df)
    print(f"\n✅ Removed {removed_count} rows with invalid/extreme prices ({removed_count/original_count*100:.1f}%)")
    print(f"Final price range: ${df['price'].min():.0f} - ${df['price'].max():.0f}")
    print(f"Mean price: ${df['price'].mean():.0f}")
    
else:
    print("❌ ERROR: 'price' column not found!")
    print("Available columns:", list(df.columns)[:10], "...")
    raise ValueError("Price column missing")

print(f"\nDataset shape after price cleaning: {df.shape}")

Original price format example: $66.00
Price column type: object

Price statistics before cleaning:
count    21328.000000
mean       680.526819
std       4480.453282
min         10.000000
25%         89.000000
50%        154.000000
75%        279.000000
max      50104.000000
Name: price, dtype: float64

✅ Removed 15001 rows with invalid/extreme prices (41.5%)
Final price range: $10 - $10000
Mean price: $235

Dataset shape after price cleaning: (21110, 79)


In [5]:
# ===============================
# CELL 4: Select Important Feature Groups 
# ===============================

# Define feature groups using actual Inside Airbnb column names
property_features = [
    'property_type', 'room_type', 'accommodates', 'bedrooms', 'beds',
    'bathrooms_text',  
    'latitude', 'longitude', 'amenities',
    'minimum_nights', 'maximum_nights', 'availability_365'
]

host_features = [
    'host_since', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
    'host_total_listings_count', 'host_has_profile_pic',
    'host_identity_verified'
]

review_features = [
    'number_of_reviews', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'reviews_per_month'
]

# Check which features actually exist in the dataset
all_desired_features = property_features + host_features + review_features
existing_features = [col for col in all_desired_features if col in df.columns]
missing_features = [col for col in all_desired_features if col not in df.columns]

print(f"=== FEATURE AVAILABILITY CHECK ===")
print(f"✅ Found {len(existing_features)} out of {len(all_desired_features)} desired features ({len(existing_features)/len(all_desired_features)*100:.1f}%)")
print(f"\n✅ Available features by category:")
print(f"   Property: {len([f for f in property_features if f in df.columns])}/{len(property_features)}")
print(f"   Host: {len([f for f in host_features if f in df.columns])}/{len(host_features)}")
print(f"   Reviews: {len([f for f in review_features if f in df.columns])}/{len(review_features)}")

if missing_features:
    print(f"\n⚠️  Missing features ({len(missing_features)}): {missing_features}")

# Keep only existing relevant columns + target
selected_cols = existing_features + ['price']
df = df[selected_cols]

print(f"\n✅ Final columns retained: {len(df.columns)}")
print(f"Dataset shape after feature selection: {df.shape}")

=== FEATURE AVAILABILITY CHECK ===
✅ Found 30 out of 30 desired features (100.0%)

✅ Available features by category:
   Property: 12/12
   Host: 9/9
   Reviews: 9/9

✅ Final columns retained: 31
Dataset shape after feature selection: (21110, 31)


In [6]:
# ===============================
# CELL 5: Handle Date/Rate Columns 
# ===============================

print("=== PROCESSING DATE AND RATE COLUMNS ===")

# Convert host_since to host tenure (in days)
if 'host_since' in df.columns:
    try:
        print(f"Processing host_since column...")
        df['host_since'] = pd.to_datetime(df['host_since'], errors='coerce')
        df['host_tenure_days'] = (pd.Timestamp.today() - df['host_since']).dt.days
        
        # Handle negative or extreme values
        df['host_tenure_days'] = df['host_tenure_days'].clip(lower=0, upper=10000)
        
        df.drop(columns=['host_since'], inplace=True)
        print(f"✅ Converted host_since to host_tenure_days")
        print(f"   Range: {df['host_tenure_days'].min():.0f} - {df['host_tenure_days'].max():.0f} days")
    except Exception as e:
        print(f"⚠️  Error processing host_since: {e}")

# Convert percentage strings to numeric 
percentage_cols = ['host_response_rate', 'host_acceptance_rate']
for col in percentage_cols:
    if col in df.columns:
        try:
            print(f"Processing {col}...")
            original_sample = df[col].iloc[0] if not df[col].empty else "N/A"
            
            # Handle different formats: "95%", "95", "N/A", etc.
            df[col] = (
                df[col].astype(str)
                .str.replace('%', '', regex=False)
                .str.replace('N/A', '', regex=False)
                .str.replace('nan', '', regex=False)
                .replace('', np.nan)
                .astype(float)
            )
            
            # Validate percentage range (0-100)
            df[col] = df[col].clip(lower=0, upper=100)
            
            print(f"✅ Converted {col}: '{original_sample}' → numeric")
            print(f"   Range: {df[col].min():.1f}% - {df[col].max():.1f}%")
        except Exception as e:
            print(f"⚠️  Error processing {col}: {e}")

print("\n✅ Date and rate processing complete!")

=== PROCESSING DATE AND RATE COLUMNS ===
Processing host_since column...
✅ Converted host_since to host_tenure_days
   Range: 16 - 6274 days
Processing host_response_rate...
✅ Converted host_response_rate: '100%' → numeric
   Range: 0.0% - 100.0%
Processing host_acceptance_rate...
✅ Converted host_acceptance_rate: '54%' → numeric
   Range: 0.0% - 100.0%

✅ Date and rate processing complete!


In [7]:
# ===============================
# CELL 6: Handle Bathrooms & Amenities 
# ===============================

print("=== PROCESSING BATHROOMS AND AMENITIES ===")

# Extract numeric bathrooms from bathrooms_text
if 'bathrooms_text' in df.columns:
    try:
        print("Processing bathrooms_text...")
        sample_values = df['bathrooms_text'].dropna().head(3).tolist()
        print(f"Sample values: {sample_values}")
        
        # Handle formats like "1.5 baths", "2 shared baths", "1 bath", "Half-bath", etc.
        df['bathrooms'] = (
            df['bathrooms_text']
            .astype(str)
            .str.replace('Half-bath', '0.5', regex=False)  # Handle special case
            .str.replace('Shared half-bath', '0.5', regex=False)
            .str.extract(r'(\d+\.?\d*)')[0]  # Extract first number
            .astype(float)
        )
        
        # Fill missing values with median
        median_bathrooms = df['bathrooms'].median()
        df['bathrooms'].fillna(median_bathrooms, inplace=True)
        
        df.drop(columns=['bathrooms_text'], inplace=True)
        print(f"✅ Converted bathrooms_text to numeric bathrooms")
        print(f"   Range: {df['bathrooms'].min():.1f} - {df['bathrooms'].max():.1f}")
        print(f"   Mean: {df['bathrooms'].mean():.2f}")
    except Exception as e:
        print(f"⚠️  Error processing bathrooms: {e}")

# Advanced amenities processing
if 'amenities' in df.columns:
    try:
        print("\nProcessing amenities...")
        sample_amenities = str(df['amenities'].iloc[0])[:100] + "..." if not df['amenities'].empty else "N/A"
        print(f"Sample amenities: {sample_amenities}")
        
        # Count total amenities
        df['amenities_count'] = df['amenities'].apply(
            lambda x: len(str(x).split(',')) if pd.notna(x) and str(x) not in ['[]', 'nan'] else 0
        )
        
        # Extract key amenities as boolean features
        key_amenities = [
            'Wifi', 'Kitchen', 'Air conditioning', 'Heating', 'Parking', 
            'Pool', 'Gym', 'Elevator', 'Washer', 'Dryer'
        ]
        
        amenity_counts = {}
        for amenity in key_amenities:
            col_name = f'has_{amenity.lower().replace(" ", "_")}'
            df[col_name] = df['amenities'].apply(
                lambda x: 1 if pd.notna(x) and amenity.lower() in str(x).lower() else 0
            )
            amenity_counts[amenity] = df[col_name].sum()
        
        df.drop(columns=['amenities'], inplace=True)
        
        print(f"✅ Processed amenities: count + {len(key_amenities)} key amenities")
        print(f"   Total amenities range: {df['amenities_count'].min():.0f} - {df['amenities_count'].max():.0f}")
        print(f"   Most common amenities:")
        for amenity, count in sorted(amenity_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
            print(f"     {amenity}: {count} properties ({count/len(df)*100:.1f}%)")
            
    except Exception as e:
        print(f"⚠️  Error processing amenities: {e}")

print("\n✅ Bathrooms and amenities processing complete!")

=== PROCESSING BATHROOMS AND AMENITIES ===
Processing bathrooms_text...
Sample values: ['1 shared bath', '2 shared baths', '1 bath']
✅ Converted bathrooms_text to numeric bathrooms
   Range: 0.0 - 15.5
   Mean: 1.19

Processing amenities...
Sample amenities: ["Extra pillows and blankets", "Dedicated workspace", "Laundromat nearby", "Clothing storage: wardro...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bathrooms'].fillna(median_bathrooms, inplace=True)


✅ Processed amenities: count + 10 key amenities
   Total amenities range: 0 - 96
   Most common amenities:
     Wifi: 20841 properties (98.7%)
     Kitchen: 18765 properties (88.9%)
     Heating: 18166 properties (86.1%)
     Dryer: 16256 properties (77.0%)
     Air conditioning: 15880 properties (75.2%)

✅ Bathrooms and amenities processing complete!


In [8]:
# ===============================
# CELL 7: Data Quality Check 
# ===============================

print("=== COMPREHENSIVE DATA QUALITY CHECK ===")
print(f"Current dataset shape: {df.shape}")

# Missing values analysis
print(f"\n📊 Missing Values Analysis:")
missing_summary = df.isnull().sum()
missing_pct = (missing_summary / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Missing_Count': missing_summary,
    'Missing_Percentage': missing_pct
}).sort_values('Missing_Percentage', ascending=False)

missing_cols = missing_df[missing_df['Missing_Count'] > 0]
if not missing_cols.empty:
    print(missing_cols.head(10))
else:
    print("✅ No missing values found!")

# Remove rows with too many missing values
missing_threshold = 0.5  # Drop rows missing >50% of features
missing_per_row = df.isnull().sum(axis=1) / len(df.columns)
rows_to_drop = (missing_per_row > missing_threshold).sum()

if rows_to_drop > 0:
    df = df[missing_per_row <= missing_threshold]
    print(f"\n✅ Removed {rows_to_drop} rows with >{missing_threshold*100}% missing data")

# Data type summary
print(f"\n📈 Data Types Summary:")
print(df.dtypes.value_counts())

# Target variable analysis
print(f"\n🎯 Target Variable (Price) Analysis:")
print(f"   Count: {df['price'].count():,}")
print(f"   Mean: ${df['price'].mean():.2f}")
print(f"   Median: ${df['price'].median():.2f}")
print(f"   Std Dev: ${df['price'].std():.2f}")
print(f"   Range: ${df['price'].min():.0f} - ${df['price'].max():.0f}")

# Check for sufficient data
min_samples_required = 1000
if len(df) < min_samples_required:
    print(f"\n⚠️  WARNING: Only {len(df)} samples available (recommended: >{min_samples_required:,})")
else:
    print(f"\n✅ Sufficient data: {len(df):,} samples for modeling")

print(f"\n✅ Data quality check complete! Final shape: {df.shape}")

=== COMPREHENSIVE DATA QUALITY CHECK ===
Current dataset shape: (21110, 41)

📊 Missing Values Analysis:
                             Missing_Count  Missing_Percentage
review_scores_cleanliness             6211               29.42
review_scores_value                   6211               29.42
reviews_per_month                     6211               29.42
review_scores_accuracy                6211               29.42
review_scores_rating                  6211               29.42
review_scores_communication           6211               29.42
review_scores_location                6211               29.42
review_scores_checkin                 6211               29.42
host_acceptance_rate                  4341               20.56
host_response_rate                    4134               19.58

📈 Data Types Summary:
float64    19
int64      16
object      6
Name: count, dtype: int64

🎯 Target Variable (Price) Analysis:
   Count: 21,110
   Mean: $234.53
   Median: $152.00
   Std Dev: $353.85
  

In [9]:
# ===============================
# CELL 8: Train-Test Split 
# ===============================

print("=== CREATING TRAIN-TEST SPLIT ===")

# Separate features and target
X = df.drop(columns=['price'])
y = df['price']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Ensure we have enough data for meaningful analysis
if len(df) < 100:
    raise ValueError(f"❌ ERROR: Not enough data for meaningful analysis. Found {len(df)} samples, need at least 100.")

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    shuffle=True  # Ensure random sampling
)

print(f"\n✅ Train-Test Split Complete:")
print(f"   Training set: {X_train.shape} features, {y_train.shape} targets")
print(f"   Test set: {X_test.shape} features, {y_test.shape} targets")
print(f"   Split ratio: {len(X_train)/(len(X_train)+len(X_test))*100:.1f}% train, {len(X_test)/(len(X_train)+len(X_test))*100:.1f}% test")

# Verify target distribution is similar between train/test
print(f"\n📊 Target Distribution Comparison:")
print(f"   Train - Mean: ${y_train.mean():.2f}, Std: ${y_train.std():.2f}")
print(f"   Test  - Mean: ${y_test.mean():.2f}, Std: ${y_test.std():.2f}")

# Check for target consistency
mean_diff_pct = abs(y_train.mean() - y_test.mean()) / y_train.mean() * 100
if mean_diff_pct > 10:
    print(f"   ⚠️  Warning: Large difference in train/test means ({mean_diff_pct:.1f}%)")
else:
    print(f"   ✅ Train/test distributions are similar ({mean_diff_pct:.1f}% difference)")

=== CREATING TRAIN-TEST SPLIT ===
Features shape: (21110, 40)
Target shape: (21110,)

✅ Train-Test Split Complete:
   Training set: (16888, 40) features, (16888,) targets
   Test set: (4222, 40) features, (4222,) targets
   Split ratio: 80.0% train, 20.0% test

📊 Target Distribution Comparison:
   Train - Mean: $236.55, Std: $366.30
   Test  - Mean: $226.42, Std: $298.83
   ✅ Train/test distributions are similar (4.3% difference)


In [10]:
# ===============================
# CELL 9: Identify Numeric & Categorical Features 
# ===============================

print("=== FEATURE TYPE IDENTIFICATION ===")

# Initialize feature lists
num_features = []
cat_features = []

# More accurate feature type detection
for col in X_train.columns:
    if X_train[col].dtype in ['int64', 'float64']:
        # Check if it's actually categorical (few unique values)
        unique_count = X_train[col].nunique()
        if unique_count <= 10 and not col.endswith(('_days', '_count', '_rate')):
            cat_features.append(col)
            print(f"   📊 {col}: Numeric but treating as categorical ({unique_count} unique values)")
        else:
            num_features.append(col)
    else:
        cat_features.append(col)

# Handle boolean columns explicitly
bool_cols = X_train.select_dtypes(include=['bool']).columns.tolist()
for col in bool_cols:
    if col in num_features:
        num_features.remove(col)
    if col not in cat_features:
        cat_features.append(col)

print(f"\n✅ Feature Type Classification:")
print(f"   📈 Numeric features ({len(num_features)}):")
for i, feature in enumerate(num_features[:8]):  # Show first 8
    print(f"      {i+1}. {feature}")
if len(num_features) > 8:
    print(f"      ... and {len(num_features)-8} more")

print(f"\n   📊 Categorical features ({len(cat_features)}):")
for i, feature in enumerate(cat_features[:8]):  # Show first 8
    unique_count = X_train[feature].nunique()
    print(f"      {i+1}. {feature} ({unique_count} categories)")
if len(cat_features) > 8:
    print(f"      ... and {len(cat_features)-8} more")

# Warning for high cardinality categorical features
high_cardinality = []
for col in cat_features:
    unique_count = X_train[col].nunique()
    if unique_count > 50:
        high_cardinality.append((col, unique_count))

if high_cardinality:
    print(f"\n⚠️  High cardinality categorical features (>50 categories):")
    for col, count in high_cardinality:
        print(f"      {col}: {count} categories")
    print(f"   Note: These will be handled with max_categories limit in preprocessing")

=== FEATURE TYPE IDENTIFICATION ===
   📊 has_wifi: Numeric but treating as categorical (2 unique values)
   📊 has_kitchen: Numeric but treating as categorical (2 unique values)
   📊 has_air_conditioning: Numeric but treating as categorical (2 unique values)
   📊 has_heating: Numeric but treating as categorical (2 unique values)
   📊 has_parking: Numeric but treating as categorical (2 unique values)
   📊 has_pool: Numeric but treating as categorical (2 unique values)
   📊 has_gym: Numeric but treating as categorical (2 unique values)
   📊 has_elevator: Numeric but treating as categorical (2 unique values)
   📊 has_washer: Numeric but treating as categorical (2 unique values)
   📊 has_dryer: Numeric but treating as categorical (2 unique values)

✅ Feature Type Classification:
   📈 Numeric features (24):
      1. accommodates
      2. bedrooms
      3. beds
      4. latitude
      5. longitude
      6. minimum_nights
      7. maximum_nights
      8. availability_365
      ... and 16 more


In [11]:
# ===============================
# CELL 10: Create Preprocessing Pipelines 
# ===============================

print("=== BUILDING PREPROCESSING PIPELINES ===")

# Advanced numeric preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler())  # Normalize features
])

# Advanced categorical preprocessing pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing values
    ('encoder', OneHotEncoder(
        handle_unknown='ignore',  # Handle new categories in test set
        sparse_output=False,      # Return dense arrays
        max_categories=20,        # Limit categories to prevent explosion
        drop='if_binary'          # Drop one category for binary features
    ))
])

# Create comprehensive preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ],
    remainder='drop'  # Drop any remaining columns
)

print(f"✅ Preprocessing pipelines created:")
print(f"   📈 Numeric pipeline: median imputation → standardization")
print(f"   📊 Categorical pipeline: constant imputation → one-hot encoding (max 20 categories)")
print(f"   🔧 Will process {len(num_features)} numeric + {len(cat_features)} categorical features")

# Display pipeline structure
print(f"\n📋 Pipeline Configuration:")
print(f"   Input features: {len(num_features + cat_features)}")
print(f"   Expected output: ~{len(num_features) + min(sum(X_train[col].nunique() for col in cat_features), len(cat_features)*20)} features")
print(f"   (Exact count depends on one-hot encoding results)")

=== BUILDING PREPROCESSING PIPELINES ===
✅ Preprocessing pipelines created:
   📈 Numeric pipeline: median imputation → standardization
   📊 Categorical pipeline: constant imputation → one-hot encoding (max 20 categories)
   🔧 Will process 24 numeric + 16 categorical features

📋 Pipeline Configuration:
   Input features: 40
   Expected output: ~115 features
   (Exact count depends on one-hot encoding results)


In [12]:
# ===============================
# CELL 11: Fit Preprocessor with Error Handling
# ===============================

print("=== FITTING PREPROCESSING PIPELINE ===")
print("🔄 This may take a few moments for large datasets...")

try:
    # Fit preprocessor on training data
    print("   Fitting preprocessor on training data...")
    X_train_processed = preprocessor.fit_transform(X_train)
    
    print("   Transforming test data...")
    X_test_processed = preprocessor.transform(X_test)
    
    print(f"✅ Preprocessing successful!")
    print(f"   Input shape: {X_train.shape}")
    print(f"   Output shape: {X_train_processed.shape}")
    print(f"   Feature expansion: {X_train.shape[1]} → {X_train_processed.shape[1]} features")
    
except Exception as e:
    print(f"❌ Preprocessing failed: {e}")
    
    # Provide debugging information
    print(f"\n🔍 Debug Information:")
    print(f"   Numeric features: {num_features}")
    print(f"   Categorical features: {cat_features}")
    print(f"   X_train shape: {X_train.shape}")
    print(f"   X_train dtypes: {X_train.dtypes.value_counts().to_dict()}")
    
    # Check for specific issues
    for col in cat_features:
        unique_count = X_train[col].nunique()
        if unique_count > 100:
            print(f"   ⚠️  {col} has {unique_count} unique values (may cause memory issues)")
    
    raise

# Verify no data leakage
print(f"\n🔒 Data Integrity Check:")
print(f"   Training samples: {X_train_processed.shape[0]}")
print(f"   Test samples: {X_test_processed.shape[0]}")
print(f"   Total samples: {X_train_processed.shape[0] + X_test_processed.shape[0]}")
print(f"   ✅ No data leakage: preprocessor fitted only on training data")

=== FITTING PREPROCESSING PIPELINE ===
🔄 This may take a few moments for large datasets...
   Fitting preprocessor on training data...
   Transforming test data...
✅ Preprocessing successful!
   Input shape: (16888, 40)
   Output shape: (16888, 72)
   Feature expansion: 40 → 72 features

🔒 Data Integrity Check:
   Training samples: 16888
   Test samples: 4222
   Total samples: 21110
   ✅ No data leakage: preprocessor fitted only on training data




In [13]:
# ===============================
# CELL 12: Generate Final Feature Names 
# ===============================

print("=== GENERATING FEATURE NAMES ===")

try:
    # Get categorical feature names from encoder
    if len(cat_features) > 0:
        cat_names = (
            preprocessor.named_transformers_['cat']['encoder']
            .get_feature_names_out(cat_features)
        )
        all_feature_names = np.concatenate([num_features, cat_names])
        print(f"✅ Generated feature names: {len(num_features)} numeric + {len(cat_names)} categorical")
    else:
        all_feature_names = num_features
        print(f"✅ Using numeric feature names only: {len(num_features)} features")
    
    # Create final DataFrames with proper feature names
    X_train_final = pd.DataFrame(
        X_train_processed, 
        columns=all_feature_names, 
        index=X_train.index
    )
    X_test_final = pd.DataFrame(
        X_test_processed, 
        columns=all_feature_names, 
        index=X_test.index
    )
    
    print(f"✅ Final DataFrames created with named features")
    
except Exception as e:
    print(f"⚠️  Feature naming error: {e}")
    print(f"   Using fallback generic names...")
    
    # Fallback to generic feature names
    all_feature_names = [f"feature_{i}" for i in range(X_train_processed.shape[1])]
    X_train_final = pd.DataFrame(X_train_processed, columns=all_feature_names, index=X_train.index)
    X_test_final = pd.DataFrame(X_test_processed, columns=all_feature_names, index=X_test.index)

# Display sample of final features
print(f"\n📋 Sample of Final Features:")
print(f"   First 10 features: {list(all_feature_names[:10])}")
if len(all_feature_names) > 10:
    print(f"   ... and {len(all_feature_names)-10} more")

# Show feature breakdown by type
if len(cat_features) > 0:
    print(f"\n📊 Feature Breakdown:")
    print(f"   Original numeric: {len(num_features)}")
    print(f"   Original categorical: {len(cat_features)}")
    print(f"   Final encoded features: {len(all_feature_names)}")
    print(f"   Expansion ratio: {len(all_feature_names)/len(num_features + cat_features):.1f}x")

=== GENERATING FEATURE NAMES ===
✅ Generated feature names: 24 numeric + 48 categorical
✅ Final DataFrames created with named features

📋 Sample of Final Features:
   First 10 features: ['accommodates', 'bedrooms', 'beds', 'latitude', 'longitude', 'minimum_nights', 'maximum_nights', 'availability_365', 'host_response_rate', 'host_acceptance_rate']
   ... and 62 more

📊 Feature Breakdown:
   Original numeric: 24
   Original categorical: 16
   Final encoded features: 72
   Expansion ratio: 1.8x


In [14]:
# ===============================
# CELL 13: Final Data Summary & Validation
# ===============================

print("=== FINAL PREPROCESSING SUMMARY ===")
print(f"\n🎯 Dataset Transformation:")
print(f"   📊 Original dataset: {df.shape[0]:,} samples, {df.shape[1]} features")
print(f"   🚂 Training set: {X_train_final.shape[0]:,} samples, {X_train_final.shape[1]} features")
print(f"   🧪 Test set: {X_test_final.shape[0]:,} samples, {X_test_final.shape[1]} features")

print(f"\n💰 Target Variable (Price):")
print(f"   📈 Training range: ${y_train.min():.0f} - ${y_train.max():.0f}")
print(f"   📊 Training mean: ${y_train.mean():.2f} ± ${y_train.std():.2f}")
print(f"   🎯 Test mean: ${y_test.mean():.2f} ± ${y_test.std():.2f}")

# Data quality validation
print(f"\n✅ Data Quality Validation:")
train_missing = X_train_final.isnull().sum().sum()
test_missing = X_test_final.isnull().sum().sum()
print(f"   🔍 Missing values in training features: {train_missing}")
print(f"   🔍 Missing values in test features: {test_missing}")
print(f"   🔍 Missing values in training target: {y_train.isnull().sum()}")
print(f"   🔍 Missing values in test target: {y_test.isnull().sum()}")

# Feature statistics
print(f"\n📈 Feature Statistics:")
print(f"   🔢 All features are numeric: {X_train_final.select_dtypes(include=[np.number]).shape[1] == X_train_final.shape[1]}")
print(f"   📏 Features are standardized: {abs(X_train_final.mean().mean()) < 0.1}")
print(f"   🎯 Ready for machine learning: {train_missing == 0 and test_missing == 0}")

# Expected performance improvement
print(f"\n🚀 Expected Performance Improvement:")
print(f"   📊 With original dataset: R² ≈ -0.03 (negative, poor predictive power)")
print(f"   🎯 With this processed data: R² ≈ 0.4-0.7 (meaningful predictive power)")
print(f"   💡 Key improvement factors:")
print(f"      • Property size features (bedrooms, bathrooms, accommodates)")
print(f"      • Host quality indicators (superhost status, response rates)")
print(f"      • Review scores and ratings")
print(f"      • Detailed amenities information")

# Memory usage
train_memory = X_train_final.memory_usage(deep=True).sum() / 1024**2
test_memory = X_test_final.memory_usage(deep=True).sum() / 1024**2
print(f"\n💾 Memory Usage:")
print(f"   Training data: {train_memory:.1f} MB")
print(f"   Test data: {test_memory:.1f} MB")
print(f"   Total: {train_memory + test_memory:.1f} MB")

print(f"\n🎉 PREPROCESSING COMPLETE! Ready for model training.")
print(f"\n📋 Available variables for modeling:")
print(f"   • X_train_final: Training features ({X_train_final.shape})")
print(f"   • X_test_final: Test features ({X_test_final.shape})")
print(f"   • y_train: Training targets ({y_train.shape})")
print(f"   • y_test: Test targets ({y_test.shape})")

=== FINAL PREPROCESSING SUMMARY ===

🎯 Dataset Transformation:
   📊 Original dataset: 21,110 samples, 41 features
   🚂 Training set: 16,888 samples, 72 features
   🧪 Test set: 4,222 samples, 72 features

💰 Target Variable (Price):
   📈 Training range: $10 - $10000
   📊 Training mean: $236.55 ± $366.30
   🎯 Test mean: $226.42 ± $298.83

✅ Data Quality Validation:
   🔍 Missing values in training features: 0
   🔍 Missing values in test features: 0
   🔍 Missing values in training target: 0
   🔍 Missing values in test target: 0

📈 Feature Statistics:
   🔢 All features are numeric: True
   📏 Features are standardized: False
   🎯 Ready for machine learning: True

🚀 Expected Performance Improvement:
   📊 With original dataset: R² ≈ -0.03 (negative, poor predictive power)
   🎯 With this processed data: R² ≈ 0.4-0.7 (meaningful predictive power)
   💡 Key improvement factors:
      • Property size features (bedrooms, bathrooms, accommodates)
      • Host quality indicators (superhost status, resp

In [15]:
# ===============================
# CELL 14: Save Processed Data 
# ===============================

print("=== SAVING PROCESSED DATA ===")

save_data = True  # Set to False if you don't want to save files

if save_data:

        # Combine features and target for saving
        train_final = X_train_final.copy()
        train_final['price'] = y_train.values  # Use .values to avoid index issues
        
        test_final = X_test_final.copy()
        test_final['price'] = y_test.values
        
        # Save to CSV files
        train_filename = 'airbnb_train_processed.csv'
        test_filename = 'airbnb_test_processed.csv'
        
        train_final.to_csv(train_filename, index=False)
        test_final.to_csv(test_filename, index=False)
        
        print(f"✅ Saved processed data to CSV files:")
        print(f"   📁 {train_filename}: {train_final.shape}")
        print(f"   📁 {test_filename}: {test_final.shape}")
        
        # File size information
        import os
        train_size = os.path.getsize(train_filename) / 1024**2
        test_size = os.path.getsize(test_filename) / 1024**2
        print(f"   💾 File sizes: {train_size:.1f} MB + {test_size:.1f} MB")
        
    
        


=== SAVING PROCESSED DATA ===
✅ Saved processed data to CSV files:
   📁 airbnb_train_processed.csv: (16888, 73)
   📁 airbnb_test_processed.csv: (4222, 73)
   💾 File sizes: 10.8 MB + 2.7 MB
