# Airbnb PreProcessing

## Setup and Data Loading 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pip install --quiet seaborn
import seaborn as sns
from datetime import datetime
import re
import json
import ast
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# Display Options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

Matplotlib is building the font cache; this may take a moment.


In [3]:
# Loading the Dataset
listing_data = pd.read_csv('listings.csv')

In [4]:
# Select only the specified columns
selected_columns = [
    'name', 'description', 'picture_url', 'property_type', 'room_type', 
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'amenities', 'price', 
    'number_of_reviews', 'review_scores_rating', 'review_scores_cleanliness', 
    'review_scores_location', 'host_since', 'host_response_time', 'host_is_superhost', 
    'host_total_listings_count', 'host_identity_verified', 'neighbourhood_group_cleansed',
    'review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', 
    'review_scores_value', 'latitude', 'longitude', 'last_scraped'
]

# Filter to only keep columns that exist in the dataset
existing_columns = [col for col in selected_columns if col in listing_data.columns]
missing_columns = [col for col in selected_columns if col not in listing_data.columns]

# Keep only selected columns
listing_data = listing_data[existing_columns]

## Intial Data Exploration

In [5]:
# Dataset Info 
listing_data.info()

# Check for completely empty columns first
completely_empty = listing_data.columns[listing_data.isnull().all()].tolist()
print(f"Completely empty columns: {completely_empty}")

# Missing Values Analysis
missing_data = listing_data.isnull().sum().sort_values(ascending=False)
missing_pct = (missing_data / len(listing_data)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_pct
}).round(2)
print(missing_df[missing_df['Missing Count'] > 0])

# Data Types Summary
print(listing_data.dtypes.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7151 entries, 0 to 7150
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   name                          7151 non-null   object 
 1   description                   6985 non-null   object 
 2   picture_url                   7151 non-null   object 
 3   property_type                 7151 non-null   object 
 4   room_type                     7151 non-null   object 
 5   accommodates                  7151 non-null   int64  
 6   bathrooms                     6293 non-null   float64
 7   bedrooms                      6930 non-null   float64
 8   beds                          6289 non-null   float64
 9   amenities                     7151 non-null   object 
 10  price                         6295 non-null   object 
 11  number_of_reviews             7151 non-null   int64  
 12  review_scores_rating          5677 non-null   float64
 13  rev

In [6]:
listing_data["neighbourhood_group_cleansed"].unique()

array(['Salford', 'Rochdale', 'Manchester', 'Trafford', 'Stockport',
       'Tameside', 'Oldham', 'Bury', 'Bolton', 'Wigan'], dtype=object)

## Data Cleaning

### Missing Values and looking at columns

In [592]:
# Looking at amenities 
sample_amenities = listing_data['amenities'].dropna().head(3)

print("Sample 1:", sample_amenities.iloc[0])
print("Sample 2:", sample_amenities.iloc[1])
print("Sample 3:", sample_amenities.iloc[2])

Sample 1: ["Shampoo", "Iron", "Clothing storage: closet and dresser", "Carbon monoxide alarm", "Room-darkening shades", "City skyline view", "Hangers", "Self check-in", "Conditioner", "Hair dryer", "Bed linens", "Refrigerator", "Dining table", "Shower gel", "First aid kit", "Private backyard \u2013 Fully fenced", "Lockbox", "Free driveway parking on premises", "Smoke alarm", "Essentials", "Exterior security cameras on property", "Bikes", "Breakfast", "Long term stays allowed", "HDTV", "Central heating", "Coffee", "Outdoor furniture", "Dishes and silverware", "Fire extinguisher", "Microwave", "Wine glasses", "Cleaning products", "Extra pillows and blankets", "Free street parking", "Wifi", "Body soap", "Kitchen", "Oven", "Books and reading material", "Drying rack for clothing", "Dedicated workspace", "Toaster", "Luggage dropoff allowed", "Cooking basics", "Hot water", "Hot water kettle"]
Sample 2: ["Shampoo", "Iron", "Free parking on premises", "Carbon monoxide alarm", "Hangers", "Hair d

### Clean Price and Percentage Columns

In [593]:
def clean_price(price_str):
    """Clean price column by removing $ and commas and convert to numeric"""
    if pd.isna(price_str):
        return np.nan
    # Handle different price formats
    price_clean = str(price_str).replace('$', '').replace(',', '').strip()
    if price_clean == '' or price_clean.lower() == 'n/a':
        return np.nan
    return float(price_clean)

def clean_percentage(pct_str):
    """Clean percentage columns by removing % and convert to numeric"""
    if pd.isna(pct_str) or str(pct_str).lower() == 'n/a':
        return np.nan
    pct_clean = str(pct_str).replace('%', '').strip()
    if pct_clean == '':
        return np.nan
    return float(pct_clean)

# Apply price cleaning
if 'price' in listing_data.columns:
    listing_data['price'] = listing_data['price'].apply(clean_price)

# Apply percentage cleaning
percentage_cols = ['host_response_rate', 'host_acceptance_rate']
for col in percentage_cols:
    if col in listing_data.columns:
        listing_data[col] = listing_data[col].apply(clean_percentage)

### Handle Boolean Columns

In [594]:
boolean_columns = [
    'host_is_superhost', 'host_identity_verified'
]

for col in boolean_columns:
    if col in listing_data.columns:
        
        # Handle various boolean representations
        listing_data[col] = listing_data[col].map({
            't': True, 'f': False, 
            'true': True, 'false': False,
            'TRUE': True, 'FALSE': False,
            True: True, False: False,
            1: True, 0: False,
            '1': True, '0': False
        })

### More missingness

In [595]:
# Examine columns with missingness
moderate_missing = []

for col in missing_df.index:
    missing_percent = missing_df.loc[col, 'Missing Percentage']
    
    if missing_percent > 10 and missing_percent <= 50:
        moderate_missing.append(col)

print("Columns with 10–50% missing data:", moderate_missing)

Columns with 10–50% missing data: ['review_scores_cleanliness', 'review_scores_location', 'review_scores_accuracy', 'review_scores_value', 'review_scores_communication', 'review_scores_rating', 'review_scores_checkin', 'host_response_time', 'beds', 'bathrooms', 'price']


In [596]:
# Handle Missing Values in Important Columns
# Fill missing numeric values with median
numeric_fill_cols = ['bathrooms', 'bedrooms', 'beds']
for col in numeric_fill_cols:
    if col in listing_data.columns:
        missing_count = listing_data[col].isnull().sum()
        if missing_count > 0:
            median_val = listing_data[col].median()
            listing_data[col] = listing_data[col].fillna(median_val)

# Fill categorical missing values with appropriate defaults
categorical_fills = {
    'host_response_time': 'unknown',
}

for col, fill_value in categorical_fills.items():
    if col in listing_data.columns:
        missing_count = listing_data[col].isnull().sum()
        if missing_count > 0:
            listing_data[col] = listing_data[col].fillna(fill_value)

# Fill boolean columns with False for missing values
for col in boolean_columns:
    if col in listing_data.columns:
        missing_count = listing_data[col].isnull().sum()
        if missing_count > 0:
            listing_data[col] = listing_data[col].fillna(False)

### Data Processing 

In [597]:
date_columns = [
    'host_since'
]
for col in date_columns:
    if col in listing_data.columns:
        listing_data[col] = pd.to_datetime(listing_data[col], errors='coerce')

## Feature Engineering

In [598]:
if 'host_since' in listing_data.columns:

    # Converts date time
    listing_data['host_since'] = pd.to_datetime(listing_data['host_since'], errors='coerce')
    listing_data['last_scraped'] = pd.to_datetime(listing_data['last_scraped'], errors='coerce')

    # Computes and converts 
    listing_data['host_days_active'] = (listing_data['last_scraped'] - listing_data['host_since']).dt.days
    listing_data.loc[listing_data['host_days_active'] < 0, 'host_days_active'] = None
    listing_data['host_years_active'] = listing_data['host_days_active'] / 365.25

    yrs = listing_data['host_years_active'].dropna()

In [599]:
# Review recency
if 'last_review' in listing_data.columns and 'last_scraped' in listing_data.columns:

    # Convert to datetime
    listing_data['last_review'] = pd.to_datetime(listing_data['last_review'], errors='coerce')
    listing_data['last_scraped'] = pd.to_datetime(listing_data['last_scraped'], errors='coerce')

    # Days since last review
    listing_data['days_since_last_review'] = (listing_data['last_scraped'] - listing_data['last_review']).dt.days

    bad = listing_data['days_since_last_review'] < 0
    if bad.any():
        print("  Warning: found", int(bad.sum()), "listings with negative days. Setting to 0.")
        listing_data.loc[bad, 'days_since_last_review'] = 0

    ds = listing_data['days_since_last_review'].dropna()

In [600]:
# Price per person
if 'price' in listing_data.columns and 'accommodates' in listing_data.columns:

    # Checks
    listing_data['price'] = pd.to_numeric(listing_data['price'], errors='coerce')

    # Avoid divide-by-zero
    zero_acc = (listing_data['accommodates'] == 0).sum()
    if zero_acc > 0:
        print("  Warning: found", int(zero_acc), "listings with 0 accommodates. Setting to 1.")
        listing_data.loc[listing_data['accommodates'] == 0, 'accommodates'] = 1

    # Compute 
    listing_data['price_per_person'] = listing_data['price'] / listing_data['accommodates']
    ppp = listing_data['price_per_person'].dropna()

In [601]:
# People per bedroom ratio
if all(col in listing_data.columns for col in ['bedrooms', 'accommodates']):
    
    # Handle zero bedrooms
    zero_bedrooms = (listing_data['bedrooms'] == 0).sum()
    
    listing_data['people_per_bedroom'] = listing_data['accommodates'] / np.maximum(listing_data['bedrooms'], 1)
    
    valid_ratios = listing_data['people_per_bedroom'].dropna()

### Picture URL Processing

In [602]:
if 'picture_url' in listing_data.columns:
    
    # URL analysis
    total_rows = len(listing_data)
    total_urls = listing_data['picture_url'].count()
    missing_urls = listing_data['picture_url'].isnull().sum()
    
    def extract_comprehensive_url_features(url):
        """Extract comprehensive features from picture URL"""
        if pd.isna(url):
            return {
                'has_picture': False,
                'url_length': 0,
                'is_muscache': False,
                'image_id_length': 0,
                'is_original': False,
                'file_extension': 'none',
                'url_has_size_param': False,
                'url_path_segments': 0,
                'estimated_image_quality': 'unknown',
                'url_complexity_score': 0
            }
        
        url_str = str(url)
        features = {}
        
        # Features
        features['has_picture'] = True
        features['url_length'] = len(url_str)
        features['is_muscache'] = 'muscache.com' in url_str.lower()
        
        # Image ID extraction
        image_id_match = re.search(r'/([a-f0-9]{8,}|[0-9]{8,})[\/_]', url_str)
        if image_id_match:
            features['image_id_length'] = len(image_id_match.group(1))
        else:
            features['image_id_length'] = 0
        
        # Quality indicators
        features['is_original'] = '_original' in url_str.lower()
        
        # Extract file extension
        extension_match = re.search(r'\.([a-zA-Z]{3,4})(?:\?|$)', url_str)
        if extension_match:
            features['file_extension'] = extension_match.group(1).lower()
        else:
            features['file_extension'] = 'none'
        
        # URL analysis 
        features['url_has_size_param'] = any(param in url_str for param in ['im_w=', 'im_h=', 'w=', 'h='])
        features['url_path_segments'] = len([seg for seg in url_str.split('/') if seg])
        
        # Estimate quality 
        if '_original' in url_str:
            features['estimated_image_quality'] = 'original'
        elif any(size in url_str for size in ['_large', '_xl', '_xxl']):
            features['estimated_image_quality'] = 'large'
        elif any(size in url_str for size in ['_medium', '_med']):
            features['estimated_image_quality'] = 'medium'
        elif any(size in url_str for size in ['_small', '_thumb']):
            features['estimated_image_quality'] = 'small'
        else:
            features['estimated_image_quality'] = 'standard'
        
        # URL complexity score
        complexity_score = 0
        complexity_score += features['url_path_segments'] * 0.5
        complexity_score += features['image_id_length'] * 0.2
        if features['is_muscache']:
            complexity_score += 2
        if features['is_original']:
            complexity_score += 3
        features['url_complexity_score'] = complexity_score
        
        return features
    
    url_features = listing_data['picture_url'].apply(extract_comprehensive_url_features)
    url_features_df = pd.json_normalize(url_features)
    
    # Add URL features to main dataframe
    for col in url_features_df.columns:
        listing_data[col] = url_features_df[col]

    ext_dist = listing_data['file_extension'].value_counts()
    for ext, count in ext_dist.items():
        pct = (count / len(listing_data)) * 100
        
    
    quality_dist = listing_data['estimated_image_quality'].value_counts()
    for quality, count in quality_dist.items():
        pct = (count / len(listing_data)) * 100

### Amentities Processing

In [603]:
def parse_amenities(amenities_str):
    """Parse amenities string into list"""
    if pd.isna(amenities_str):
        return []
    
    amenities_str = str(amenities_str).strip()
    if not amenities_str or amenities_str.lower() in ['nan', 'none', '']:
        return []
    
    # Handle list format 
    if amenities_str.startswith('[') and amenities_str.endswith(']'):
        amenities_str = amenities_str[1:-1]  # Remove brackets
    
    # Split by comma 
    items = []
    for item in amenities_str.split(','):
        # Remove quotes and whitespace
        clean_item = item.strip().strip('"\'').strip()
        if clean_item:
            items.append(clean_item)
    
    return items

def has_amenity_flexible(amenities_list, amenity_terms):
    """Check for amenity with flexible matching"""
    if not amenities_list:
        return False
    
    amenities_lower = [item.lower() for item in amenities_list]
    amenities_text = ' '.join(amenities_lower)
    
    # Check if any term matches
    for term in amenity_terms:
        term_lower = term.lower()
        if any(term_lower in amenity_lower for amenity_lower in amenities_lower):
            return True
        if term_lower in amenities_text:
            return True
    
    return False

# Parse amenities
listing_data['amenities_list'] = listing_data['amenities'].apply(parse_amenities)
listing_data['amenities_count'] = listing_data['amenities_list'].apply(len)

In [604]:
# Check sample of parsed amenities
sample_amenities = listing_data['amenities_list'].iloc[0]
print(f"Sample parsed amenities (first 10): {sample_amenities[:10]}")

Sample parsed amenities (first 10): ['Shampoo', 'Iron', 'Clothing storage: closet and dresser', 'Carbon monoxide alarm', 'Room-darkening shades', 'City skyline view', 'Hangers', 'Self check-in', 'Conditioner', 'Hair dryer']


In [605]:
#Amenity Categories

# Basic Amenities
basic_amenities = {
    'wifi': ['wifi', 'wi-fi', 'wireless internet', 'internet'],
    'kitchen': ['kitchen', 'kitchenette', 'full kitchen'],
    'air_conditioning': ['air conditioning', 'ac', 'air con', 'cooling'],
    'heating': ['heating', 'heater', 'central heating', 'radiator'],
    'tv': ['tv', 'television', 'cable tv', 'smart tv', 'netflix'],
    'washer': ['washer', 'washing machine', 'laundry'],
    'dryer': ['dryer', 'tumble dryer', 'drying machine'],
    'iron': ['iron', 'ironing board'],
    'hair_dryer': ['hair dryer', 'hairdryer', 'blow dryer'],
    'essentials': ['essentials', 'basics', 'towels', 'bed sheets', 'soap', 'toilet paper']
}

# Safety Features
safety_amenities = {
    'smoke_alarm': ['smoke alarm', 'smoke detector', 'fire alarm'],
    'carbon_monoxide_alarm': ['carbon monoxide alarm', 'co detector', 'carbon monoxide detector'],
    'first_aid_kit': ['first aid kit', 'medical kit'],
    'fire_extinguisher': ['fire extinguisher'],
    'security_cameras': ['security cameras', 'surveillance', 'cctv'],
    'lockbox': ['lockbox', 'key safe', 'lock box'],
    'private_entrance': ['private entrance', 'separate entrance', 'own entrance']
}

# Kitchen & Dining
kitchen_dining = {
    'refrigerator': ['refrigerator', 'fridge', 'mini fridge', 'mini-fridge'],
    'microwave': ['microwave', 'micro wave'],
    'oven': ['oven', 'stove', 'cooktop', 'hob'],
    'dishwasher': ['dishwasher', 'dish washer'],
    'coffee_maker': ['coffee maker', 'coffee machine', 'espresso machine', 'nespresso'],
    'dining_table': ['dining table', 'dining area', 'eating area'],
    'cookware': ['cooking basics', 'pots and pans', 'cookware', 'dishes and silverware'],
    'blender': ['blender', 'food processor'],
    'toaster': ['toaster'],
    'kettle': ['kettle', 'electric kettle']
}

# Bathroom Amenities
bathroom_amenities = {
    'shampoo': ['shampoo', 'body soap', 'shower gel'],
    'conditioner': ['conditioner'],
    'body_soap': ['body soap', 'soap', 'shower gel'],
    'hot_water': ['hot water', 'hot shower'],
    'bathtub': ['bathtub', 'bath tub', 'bath'],
    'bidet': ['bidet'],
    'bathroom_essentials': ['bathroom essentials', 'bath towels', 'toilet paper']
}

# Bedroom & Living Space
bedroom_living = {
    'bed_linens': ['bed linens', 'bedding', 'sheets', 'pillows'],
    'extra_pillows': ['extra pillows', 'pillows and blankets'],
    'hangers': ['hangers', 'coat hangers', 'wardrobe'],
    'closet': ['closet', 'wardrobe', 'clothing storage'],
    'desk': ['desk', 'workspace', 'laptop friendly workspace'],
    'chair': ['chair', 'office chair', 'desk chair'],
    'sofa': ['sofa', 'couch', 'living room'],
    'blackout_curtains': ['blackout curtains', 'room darkening shades']
}

# Internet & Office
internet_office = {
    'dedicated_workspace': ['dedicated workspace', 'office space', 'work area'],
    'laptop_friendly': ['laptop friendly', 'laptop workspace'],
    'ethernet': ['ethernet connection', 'wired internet'],
    'printer': ['printer'],
    'monitor': ['monitor', 'external monitor']
}

# Entertainment
entertainment = {
    'sound_system': ['sound system', 'speakers', 'stereo'],
    'game_console': ['game console', 'playstation', 'xbox', 'nintendo'],
    'books': ['books', 'reading material'],
    'board_games': ['board games', 'card games', 'games'],
    'music': ['music', 'spotify', 'streaming']
}

# Outdoor & Recreation
outdoor_recreation = {
    'balcony': ['balcony', 'terrace', 'patio'],
    'garden': ['garden', 'yard', 'outdoor space'],
    'bbq_grill': ['bbq grill', 'barbecue', 'grill', 'outdoor grill'],
    'outdoor_furniture': ['outdoor furniture', 'patio furniture', 'garden furniture'],
    'beach_access': ['beach access', 'beachfront', 'waterfront'],
    'mountain_view': ['mountain view', 'mountains'],
    'city_view': ['city view', 'skyline view'],
    'garden_view': ['garden view', 'park view']
}

# Luxury & Premium
luxury_amenities = {
    'pool': ['pool', 'swimming pool', 'shared pool', 'private pool'],
    'hot_tub': ['hot tub', 'jacuzzi', 'spa'],
    'gym': ['gym', 'fitness centre', 'exercise equipment', 'weights'],
    'sauna': ['sauna', 'steam room'],
    'concierge': ['concierge', 'doorman', 'reception'],
    'room_service': ['room service', 'housekeeping'],
    'luxury_toiletries': ['luxury toiletries', 'premium amenities'],
    'wine_cooler': ['wine cooler', 'wine fridge', 'mini bar']
}

# Transportation & Location
transport_location = {
    'free_parking': ['free parking', 'parking included', 'garage'],
    'paid_parking': ['paid parking', 'parking available'],
    'ev_charger': ['ev charger', 'electric vehicle charging', 'tesla charger'],
    'public_transport': ['near public transport', 'metro', 'subway access'],
    'bicycle': ['bicycle', 'bike', 'cycling'],
    'airport_shuttle': ['airport shuttle', 'transfer service']
}

# Family & Accessibility
family_accessibility = {
    'family_friendly': ['family friendly', 'child friendly', 'kids welcome'],
    'crib': ['crib', 'baby cot', 'cot'],
    'high_chair': ['high chair', 'baby chair'],
    'baby_bath': ['baby bath', 'bathtub for babies'],
    'step_free_access': ['step free access', 'wheelchair accessible', 'accessible'],
    'wide_doorways': ['wide doorways', 'accessible doorways'],
    'accessible_bathroom': ['accessible bathroom', 'roll-in shower']
}

# Pets
pet_amenities = {
    'pets_allowed': ['pets allowed', 'pet friendly', 'dogs allowed', 'cats allowed'],
    'pet_bowls': ['pet bowls', 'dog bowls'],
    'pet_bed': ['pet bed', 'dog bed']
}

# Climate & Environment
climate_environment = {
    'fan': ['fan', 'ceiling fan', 'portable fan'],
    'fireplace': ['fireplace', 'wood burning fireplace'],
    'humidifier': ['humidifier'],
    'air_purifier': ['air purifier', 'hepa filter'],
    'mosquito_net': ['mosquito net', 'bug net']
}

# Combine all amenity categories
all_amenity_categories = {
    'Basic': basic_amenities,
    'Safety': safety_amenities,
    'Kitchen_Dining': kitchen_dining,
    'Bathroom': bathroom_amenities,
    'Bedroom_Living': bedroom_living,
    'Internet_Office': internet_office,
    'Entertainment': entertainment,
    'Outdoor_Recreation': outdoor_recreation,
    'Luxury': luxury_amenities,
    'Transport_Location': transport_location,
    'Family_Accessibility': family_accessibility,
    'Pet': pet_amenities,
    'Climate_Environment': climate_environment
}

In [606]:
# Create binary features for all amenities
amenity_feature_count = 0

for category_name, category_amenities in all_amenity_categories.items():
    category_count = 0
    for amenity, search_terms in category_amenities.items():
        col_name = f"has_{amenity}"
        listing_data[col_name] = listing_data['amenities_list'].apply(
            lambda x: has_amenity_flexible(x, search_terms)
        )
        amenity_feature_count += 1
        category_count += 1

print(f"Total amenity features created: {amenity_feature_count}")

Total amenity features created: 89


In [607]:
# Create category counts
for category_name, category_amenities in all_amenity_categories.items():
    category_cols = [f"has_{amenity}" for amenity in category_amenities.keys()]
    existing_cols = [col for col in category_cols if col in listing_data.columns]
    
    count_col = f"{category_name.lower()}_amenities_count"
    listing_data[count_col] = listing_data[existing_cols].sum(axis=1)

In [608]:
# Create overall amenity quality scores

# Basic necessity score 
basic_essentials = ['has_wifi', 'has_kitchen', 'has_tv', 'has_essentials', 'has_heating']
existing_basic = [col for col in basic_essentials if col in listing_data.columns]
listing_data['basic_amenities_score'] = listing_data[existing_basic].sum(axis=1)

# Luxury score 
luxury_features = ['has_pool', 'has_hot_tub', 'has_gym', 'has_concierge', 'has_room_service']
existing_luxury = [col for col in luxury_features if col in listing_data.columns]
listing_data['luxury_amenities_score'] = listing_data[existing_luxury].sum(axis=1)

# Convenience score 
convenience_features = ['has_washer', 'has_dryer', 'has_dishwasher', 'has_free_parking', 'has_elevator']
existing_convenience = [col for col in convenience_features if col in listing_data.columns]
listing_data['convenience_amenities_score'] = listing_data[existing_convenience].sum(axis=1)

print(f"Basic amenities score range: {listing_data['basic_amenities_score'].min()} - {listing_data['basic_amenities_score'].max()}")
print(f"Luxury amenities score range: {listing_data['luxury_amenities_score'].min()} - {listing_data['luxury_amenities_score'].max()}")
print(f"Convenience amenities score range: {listing_data['convenience_amenities_score'].min()} - {listing_data['convenience_amenities_score'].max()}")

Basic amenities score range: 0 - 5
Luxury amenities score range: 0 - 4
Convenience amenities score range: 0 - 4


### Text Feature Engineering

In [609]:
# Enhanced Name Features
def extract_name_features(name):
    """Extract comprehensive features from listing name"""
    if pd.isna(name):
        return {
            'name_length': 0, 'name_word_count': 0, 'name_luxury_score': 0,
            'name_location_score': 0, 'name_mentions_apartment': False,
            'name_mentions_house': False, 'name_mentions_studio': False,
            'name_mentions_loft': False, 'name_mentions_room': False,
            'name_comfort_score': 0, 'name_mentions_private': False,
            'name_mentions_entire': False, 'name_view_score': 0,
            'name_mentions_central': False, 'name_mentions_modern': False
        }
    
    name_lower = str(name).lower()
    features = {}
    
    # Basic text statistics
    features['name_length'] = len(name)
    features['name_word_count'] = len(name.split())
    
    # Luxury indicators 
    luxury_words = ['luxury', 'luxurious', 'premium', 'deluxe', 'executive', 
                   'penthouse', 'villa', 'mansion', 'suite', 'presidential']
    features['name_luxury_score'] = sum(1 for word in luxury_words if word in name_lower)
    
    # Location indicators
    location_words = ['central', 'centre', 'center', 'downtown', 'city centre', 
                     'city center', 'heart of', 'near', 'close to', 'walking distance',
                     'zone 1', 'zone 2', 'prime location']
    features['name_location_score'] = sum(1 for word in location_words if word in name_lower)
    
    # Property type indicators
    features['name_mentions_apartment'] = any(word in name_lower for word in ['apartment', 'flat', 'apt'])
    features['name_mentions_house'] = any(word in name_lower for word in ['house', 'home', 'cottage', 'townhouse'])
    features['name_mentions_studio'] = 'studio' in name_lower
    features['name_mentions_loft'] = 'loft' in name_lower
    features['name_mentions_room'] = 'room' in name_lower and 'bedroom' not in name_lower
    
    # Comfort and quality indicators 
    comfort_words = ['cosy', 'cozy', 'comfortable', 'spacious', 'bright', 'modern', 
                    'stylish', 'beautiful', 'charming', 'elegant', 'sophisticated']
    features['name_comfort_score'] = sum(1 for word in comfort_words if word in name_lower)
    
    # Privacy indicators
    features['name_mentions_private'] = 'private' in name_lower
    features['name_mentions_entire'] = any(word in name_lower for word in ['entire', 'whole', 'full'])
    
    # View indicators
    view_words = ['view', 'garden', 'balcony', 'terrace', 'sea view', 'ocean view', 
                 'mountain view', 'city view', 'river view', 'park view', 'skyline']
    features['name_view_score'] = sum(1 for word in view_words if word in name_lower)
    
    # Specific location features
    features['name_mentions_central'] = any(word in name_lower for word in ['central', 'centre', 'center'])
    features['name_mentions_modern'] = any(word in name_lower for word in ['modern', 'contemporary', 'new', 'renovated'])
    
    return features

name_features = listing_data['name'].apply(extract_name_features)
name_features_df = pd.json_normalize(name_features)
name_features_df.index = listing_data.index

# Add name features to main dataframe
for col in name_features_df.columns:
    listing_data[col] = name_features_df[col]

print(f"Name features created: {len(name_features_df.columns)}")

Name features created: 15


In [610]:
# Enhanced Description Processing
def calculate_readability_score(text):
    """Calculate readability score using Flesch formula"""
    if pd.isna(text) or len(str(text).strip()) == 0:
        return 0
    
    text_str = str(text)
    sentences = len([s for s in text_str.split('.') if s.strip()])
    words = len(text_str.split())
    
    if sentences == 0 or words == 0:
        return 0
    
    # Count syllables 
    vowels = 'aeiouAEIOU'
    syllables = sum(1 for char in text_str if char in vowels)
    
    if syllables == 0:
        syllables = words  # Fallback
    
    # Reading Ease formula
    avg_sentence_length = words / sentences
    avg_syllables_per_word = syllables / words
    
    readability = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
    return max(0, min(100, readability))

def calculate_sentiment_score(text):
    """Calculate sentiment"""
    if pd.isna(text):
        return 0
    
    text_lower = str(text).lower()
    
    # Positive words 
    positive_words = [
        'amazing', 'beautiful', 'perfect', 'excellent', 'wonderful', 'fantastic', 
        'great', 'awesome', 'lovely', 'stunning', 'spectacular', 'incredible',
        'comfortable', 'cosy', 'cozy', 'charming', 'peaceful', 'relaxing', 
        'enjoyable', 'delightful', 'convenient', 'spacious', 'bright', 'clean', 
        'modern', 'stylish', 'elegant', 'sophisticated', 'luxury', 'premium',
        'superb', 'outstanding', 'exceptional', 'brilliant', 'magnificent',
        'gorgeous', 'fabulous', 'splendid', 'marvellous', 'marvelous'
    ]
    
    # Negative words
    negative_words = [
        'terrible', 'awful', 'bad', 'horrible', 'disappointing', 'dirty',
        'noisy', 'uncomfortable', 'small', 'cramped', 'old', 'outdated',
        'inconvenient', 'difficult', 'problems', 'issues', 'broken',
        'poor', 'worst', 'unpleasant', 'disgusting', 'nasty', 'dreadful'
    ]
    
    positive_count = sum(1 for word in positive_words if word in text_lower)
    negative_count = sum(1 for word in negative_words if word in text_lower)
    
    # Calculate sentiment score
    total_words = len(text_lower.split())
    if total_words == 0:
        return 0
    
    sentiment_score = (positive_count - negative_count) / max(total_words / 20, 1)
    return max(-1, min(1, sentiment_score))

def extract_description_features(description):
    """Extract features from description"""
    if pd.isna(description):
        return {
            'desc_length': 0, 'desc_word_count': 0, 'desc_sentence_count': 0,
            'avg_word_length': 0, 'desc_readability': 0, 'desc_sentiment_score': 0,
            'desc_luxury_mentions': 0, 'desc_location_mentions': 0, 'desc_transport_mentions': 0,
            'desc_experience_mentions': 0, 'desc_facility_mentions': 0, 'desc_business_mentions': 0,
            'desc_safety_mentions': 0, 'desc_cleanliness_mentions': 0, 'desc_comfort_mentions': 0,
            'desc_view_mentions': 0, 'desc_activity_mentions': 0, 'desc_food_mentions': 0,
            'desc_family_mentions': 0, 'desc_romantic_mentions': 0, 'desc_exclamation_count': 0,
            'desc_question_count': 0, 'desc_caps_ratio': 0, 'desc_number_count': 0
        }
    
    desc_str = str(description)
    desc_lower = desc_str.lower()
    words = desc_str.split()
    
    features = {}
    
    # Basic text statistics
    features['desc_length'] = len(desc_str)
    features['desc_word_count'] = len(words)
    features['desc_sentence_count'] = len([s for s in desc_str.split('.') if s.strip()])
    features['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0
    
    # Readability and sentiment
    features['desc_readability'] = calculate_readability_score(desc_str)
    features['desc_sentiment_score'] = calculate_sentiment_score(desc_str)
    
    # Theme-based mentions
    luxury_words = ['luxury', 'luxurious', 'premium', 'upscale', 'high-end', 'exclusive', 'elegant']
    features['desc_luxury_mentions'] = sum(1 for word in luxury_words if word in desc_lower)
    
    location_words = ['location', 'neighbourhood', 'neighborhood', 'area', 'district', 'zone',
                     'close', 'near', 'walking', 'minutes', 'central', 'convenient']
    features['desc_location_mentions'] = sum(1 for word in location_words if word in desc_lower)
    
    transport_words = ['metro', 'tube', 'underground', 'subway', 'bus', 'train', 'station', 
                      'transport', 'uber', 'taxi', 'airport', 'railway']
    features['desc_transport_mentions'] = sum(1 for word in transport_words if word in desc_lower)
    
    experience_words = ['experience', 'enjoy', 'relax', 'explore', 'discover', 'adventure', 
                       'stay', 'visit', 'holiday', 'vacation', 'getaway']
    features['desc_experience_mentions'] = sum(1 for word in experience_words if word in desc_lower)
    
    facility_words = ['kitchen', 'bathroom', 'bedroom', 'living', 'dining', 'balcony', 
                     'garden', 'parking', 'wifi', 'pool', 'gym']
    features['desc_facility_mentions'] = sum(1 for word in facility_words if word in desc_lower)
    
    business_words = ['business', 'work', 'workspace', 'office', 'meetings', 'conference', 
                     'professional', 'corporate']
    features['desc_business_mentions'] = sum(1 for word in business_words if word in desc_lower)
    
    safety_words = ['safe', 'secure', 'security', 'safety', 'protected', 'gated', 'keyless']
    features['desc_safety_mentions'] = sum(1 for word in safety_words if word in desc_lower)
    
    cleanliness_words = ['clean', 'fresh', 'spotless', 'sanitised', 'sanitized', 'hygienic', 'tidy']
    features['desc_cleanliness_mentions'] = sum(1 for word in cleanliness_words if word in desc_lower)
    
    comfort_words = ['comfortable', 'cosy', 'cozy', 'relaxing', 'peaceful', 'quiet', 'serene']
    features['desc_comfort_mentions'] = sum(1 for word in comfort_words if word in desc_lower)
    
    view_words = ['view', 'views', 'overlook', 'facing', 'panoramic', 'scenic']
    features['desc_view_mentions'] = sum(1 for word in view_words if word in desc_lower)
    
    activity_words = ['restaurant', 'shopping', 'museum', 'theatre', 'theater', 'park', 'beach', 
                     'nightlife', 'entertainment', 'attractions']
    features['desc_activity_mentions'] = sum(1 for word in activity_words if word in desc_lower)
    
    food_words = ['restaurant', 'food', 'dining', 'cafe', 'coffee', 'breakfast', 'kitchen']
    features['desc_food_mentions'] = sum(1 for word in food_words if word in desc_lower)
    
    family_words = ['family', 'children', 'kids', 'child-friendly', 'family-friendly']
    features['desc_family_mentions'] = sum(1 for word in family_words if word in desc_lower)
    
    romantic_words = ['romantic', 'couple', 'honeymoon', 'intimate', 'private']
    features['desc_romantic_mentions'] = sum(1 for word in romantic_words if word in desc_lower)
    
    # Punctuation and formatting features
    features['desc_exclamation_count'] = desc_str.count('!')
    features['desc_question_count'] = desc_str.count('?')
    
    # Capital letters ratio
    caps_count = sum(1 for char in desc_str if char.isupper())
    features['desc_caps_ratio'] = caps_count / len(desc_str) if len(desc_str) > 0 else 0
    
    # Number count
    features['desc_number_count'] = sum(1 for word in words if any(char.isdigit() for char in word))
    
    return features

if 'description' in listing_data.columns:
    desc_features = listing_data['description'].apply(extract_description_features)
    desc_features_df = pd.json_normalize(desc_features)
    desc_features_df.index = listing_data.index
    
    # Add description features to main dataframe
    for col in desc_features_df.columns:
        listing_data[col] = desc_features_df[col]
    
    print(f"Description features created: {len(desc_features_df.columns)}")

Description features created: 24


In [611]:
# Create Overall Text Quality Score
def calculate_overall_text_quality(row):
    """Calculate comprehensive text quality score"""
    score = 0
    
    # Name contribution (25% weight)
    name_score = (
        row.get('name_luxury_score', 0) * 3 +
        row.get('name_location_score', 0) * 2 +
        row.get('name_comfort_score', 0) * 2 +
        row.get('name_view_score', 0) * 1.5 +
        (2 if row.get('name_mentions_private', False) else 0) +
        (1.5 if row.get('name_mentions_entire', False) else 0)
    )
    score += name_score * 0.25
    
    # Description contribution (50% weight)
    desc_score = (
        row.get('desc_luxury_mentions', 0) * 3 +
        row.get('desc_experience_mentions', 0) * 2 +
        row.get('desc_cleanliness_mentions', 0) * 2.5 +
        row.get('desc_safety_mentions', 0) * 2 +
        row.get('desc_comfort_mentions', 0) * 2 +
        (row.get('desc_sentiment_score', 0) + 1) * 5 +  # Normalise sentiment (-1 to 1) to (0 to 2)
        row.get('desc_facility_mentions', 0) * 1.5 +
        row.get('desc_location_mentions', 0) * 1.5
    )
    score += desc_score * 0.5
    
    # Amenities contribution (25% weight)
    amenities_score = (
        row.get('luxury_amenities_score', 0) * 4 +
        row.get('convenience_amenities_score', 0) * 2.5 +
        row.get('basic_amenities_score', 0) * 2 +
        row.get('safety_amenities_count', 0) * 2
    )
    score += amenities_score * 0.25
    
    return score

listing_data['overall_text_quality'] = listing_data.apply(calculate_overall_text_quality, axis=1)

# Create percentile rankings
listing_data['text_quality_percentile'] = listing_data['overall_text_quality'].rank(pct=True) * 100

# Create categorical quality levels
def categorise_quality(percentile):
    if percentile >= 95:
        return 'Exceptional'
    elif percentile >= 85:
        return 'Premium'
    elif percentile >= 70:
        return 'High'
    elif percentile >= 50:
        return 'Good'
    elif percentile >= 30:
        return 'Average'
    else:
        return 'Basic'

listing_data['text_quality_category'] = listing_data['text_quality_percentile'].apply(categorise_quality)

### Handling Outlliers

In [612]:
def handle_outliers_iqr(df, column, method='clip'):
    """Handle outliers using IQR method"""
    if column not in df.columns:
        print(f"Column {column} not found")
        return df
    
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers_count = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
    
    if method == 'clip':
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
        print(f"{column}: Clipped {outliers_count} outliers (bounds: {lower_bound:.2f} - {upper_bound:.2f})")
    
    return df

# Use percentile method for price 
if 'price' in listing_data.columns:
    print("Handling price outliers...")
    price_before = listing_data['price'].describe()
    
    lower_bound = listing_data['price'].quantile(0.005)  # 0.5th percentile
    upper_bound = listing_data['price'].quantile(0.995)  # 99.5th percentile
    
    outliers_count = ((listing_data['price'] < lower_bound) | (listing_data['price'] > upper_bound)).sum()
    listing_data['price'] = listing_data['price'].clip(lower=lower_bound, upper=upper_bound)
    
    print(f"Price: Clipped {outliers_count} outliers (bounds: £{lower_bound:.2f} - £{upper_bound:.2f})")
    print("Price distribution after outlier handling:")
    print(listing_data['price'].describe())

# Handle other numeric outliers
numeric_outlier_cols = ['accommodates', 'bedrooms', 'beds', 'bathrooms', 'number_of_reviews']
for col in numeric_outlier_cols:
    if col in listing_data.columns:
        listing_data = handle_outliers_iqr(listing_data, col, method='clip')

Handling price outliers...
Price: Clipped 51 outliers (bounds: £20.00 - £1163.89)
Price distribution after outlier handling:
count    6295.000000
mean      106.473627
std       122.277979
min        20.000000
25%        49.000000
50%        81.000000
75%       120.000000
max      1163.890000
Name: price, dtype: float64
accommodates: Clipped 296 outliers (bounds: -2.50 - 9.50)
bedrooms: Clipped 579 outliers (bounds: -0.50 - 3.50)
beds: Clipped 179 outliers (bounds: -2.00 - 6.00)
bathrooms: Clipped 61 outliers (bounds: -0.50 - 3.50)
number_of_reviews: Clipped 792 outliers (bounds: -47.00 - 81.00)


### Encode Categorical Variables

In [613]:
# Ordinal encoding for host response time
if 'host_response_time' in listing_data.columns:
    response_time_order = ['within an hour', 'within a few hours', 'within a day', 
                          'a few days or more', 'unknown']
    response_time_map = {v: i for i, v in enumerate(response_time_order)}
    listing_data['host_response_time_encoded'] = listing_data['host_response_time'].map(response_time_map)
    
    for k, v in response_time_map.items():
        count = (listing_data['host_response_time'] == k).sum()
        print(f"  {k}: {v} ({count} listings)")

  within an hour: 0 (4911 listings)
  within a few hours: 1 (700 listings)
  within a day: 2 (386 listings)
  a few days or more: 3 (226 listings)
  unknown: 4 (928 listings)


In [614]:
# Check cardinality of categorical variables before encoding
categorical_for_encoding = ['property_type', 'room_type', 'neighbourhood_cleansed', 
                           'neighbourhood_group_cleansed', 'text_quality_category']

for col in categorical_for_encoding:
    if col in listing_data.columns:
        unique_count = listing_data[col].nunique()
        print(f"{col}: {unique_count} unique values")
        
        # Show top categories
        top_cats = listing_data[col].value_counts().head()
        print(f"  Top categories: {top_cats.to_dict()}")

property_type: 56 unique values
  Top categories: {'Entire rental unit': 1983, 'Entire home': 1660, 'Private room in home': 1560, 'Entire condo': 548, 'Private room in rental unit': 330}
room_type: 4 unique values
  Top categories: {'Entire home/apt': 4668, 'Private room': 2449, 'Shared room': 31, 'Hotel room': 3}
neighbourhood_group_cleansed: 10 unique values
  Top categories: {'Manchester': 3211, 'Salford': 1482, 'Trafford': 659, 'Stockport': 382, 'Oldham': 291}
text_quality_category: 6 unique values
  Top categories: {'Basic': 2145, 'Average': 1430, 'Good': 1417, 'High': 1090, 'Premium': 713}


In [615]:
# One-hot encoding but limit to top categories
def prepare_categorical_for_encoding(df, column, max_categories=20):
    """Prepare categorical column for encoding by limiting to top categories"""
    if column not in df.columns:
        return df
    
    value_counts = df[column].value_counts()
    
    if len(value_counts) > max_categories:
        top_categories = value_counts.head(max_categories).index.tolist()
        df[f"{column}_grouped"] = df[column].apply(
            lambda x: x if x in top_categories else 'Other'
        )
        print(f"{column}: Grouped to top {max_categories} categories + 'Other'")
        return f"{column}_grouped"
    else:
        return column

# Prepare high-cardinality categorical variables
prepared_categoricals = []
for col in categorical_for_encoding:
    if col in listing_data.columns:
        prepared_col = prepare_categorical_for_encoding(listing_data, col, max_categories=15)
        prepared_categoricals.append(prepared_col)


property_type: Grouped to top 15 categories + 'Other'


In [616]:
# One-hot encode categorical variables
categorical_columns = ['room_type', 'property_type_grouped', 'estimated_image_quality', 'text_quality_category']
listing_data = pd.get_dummies(listing_data, columns=categorical_columns, prefix=categorical_columns)

### Final Preperations

In [617]:
# Remove columns that won't be useful for modelling
final_drop_cols = [
    'last_scraped', 'host_since', 'calendar_last_scraped', 'first_review', 'last_review', 
    'description', 'name', 'amenities', 'host_verifications', 'neighborhood_overview',  
    'amenities_list', 'host_response_time', 'host_id', 'host_name', 'host_listinga_count',   
    'source', 'id', 
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 
    'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 
    'maximum_nights_avg_ntm', 'has_availability', 'availability_30', 'availability_60', 
    'availability_90', 'availability_365', 'number_of_reviews_ltm', 'number_of_reviews_l300d', 
    'availability_eoy', 'number_of_reviews_ly', 'host_listings_count', 'neighbourhood_cleansed', 
    'bathrooms_text', 'number_of_reviews_l30d', 'estimated_occupancy_l365d', 'estimated_revenue_l365d', 
    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
    'property_type', 'neighbourhood_group_cleansed', 'picture_url'
]

existing_final_drops = [col for col in final_drop_cols if col in listing_data.columns]
print(f"Dropping {len(existing_final_drops)} columns: {existing_final_drops}")
listing_data = listing_data.drop(columns=existing_final_drops)

Dropping 10 columns: ['last_scraped', 'host_since', 'description', 'name', 'amenities', 'amenities_list', 'host_response_time', 'property_type', 'neighbourhood_group_cleansed', 'picture_url']


In [618]:
print("Final dataset shape:", listing_data.shape)

Final dataset shape: (7151, 207)


In [619]:
listing_data.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,review_scores_cleanliness,review_scores_location,host_is_superhost,host_total_listings_count,host_identity_verified,review_scores_accuracy,review_scores_checkin,review_scores_communication,review_scores_value,latitude,longitude,host_days_active,host_years_active,price_per_person,people_per_bedroom,has_picture,url_length,is_muscache,image_id_length,is_original,file_extension,url_has_size_param,url_path_segments,url_complexity_score,amenities_count,has_wifi,has_kitchen,has_air_conditioning,has_heating,has_tv,has_washer,has_dryer,has_iron,has_hair_dryer,has_essentials,has_smoke_alarm,has_carbon_monoxide_alarm,has_first_aid_kit,has_fire_extinguisher,has_security_cameras,has_lockbox,has_private_entrance,has_refrigerator,has_microwave,has_oven,has_dishwasher,has_coffee_maker,has_dining_table,has_cookware,has_blender,has_toaster,has_kettle,has_shampoo,has_conditioner,has_body_soap,has_hot_water,has_bathtub,has_bidet,has_bathroom_essentials,has_bed_linens,has_extra_pillows,has_hangers,has_closet,has_desk,has_chair,has_sofa,has_blackout_curtains,has_dedicated_workspace,has_laptop_friendly,has_ethernet,has_printer,has_monitor,has_sound_system,has_game_console,has_books,has_board_games,has_music,has_balcony,has_garden,has_bbq_grill,has_outdoor_furniture,has_beach_access,has_mountain_view,has_city_view,has_garden_view,has_pool,has_hot_tub,has_gym,has_sauna,has_concierge,has_room_service,has_luxury_toiletries,has_wine_cooler,has_free_parking,has_paid_parking,has_ev_charger,has_public_transport,has_bicycle,has_airport_shuttle,has_family_friendly,has_crib,has_high_chair,has_baby_bath,has_step_free_access,has_wide_doorways,has_accessible_bathroom,has_pets_allowed,has_pet_bowls,has_pet_bed,has_fan,has_fireplace,has_humidifier,has_air_purifier,has_mosquito_net,basic_amenities_count,safety_amenities_count,kitchen_dining_amenities_count,bathroom_amenities_count,bedroom_living_amenities_count,internet_office_amenities_count,entertainment_amenities_count,outdoor_recreation_amenities_count,luxury_amenities_count,transport_location_amenities_count,family_accessibility_amenities_count,pet_amenities_count,climate_environment_amenities_count,basic_amenities_score,luxury_amenities_score,convenience_amenities_score,name_length,name_word_count,name_luxury_score,name_location_score,name_mentions_apartment,name_mentions_house,name_mentions_studio,name_mentions_loft,name_mentions_room,name_comfort_score,name_mentions_private,name_mentions_entire,name_view_score,name_mentions_central,name_mentions_modern,desc_length,desc_word_count,desc_sentence_count,avg_word_length,desc_readability,desc_sentiment_score,desc_luxury_mentions,desc_location_mentions,desc_transport_mentions,desc_experience_mentions,desc_facility_mentions,desc_business_mentions,desc_safety_mentions,desc_cleanliness_mentions,desc_comfort_mentions,desc_view_mentions,desc_activity_mentions,desc_food_mentions,desc_family_mentions,desc_romantic_mentions,desc_exclamation_count,desc_question_count,desc_caps_ratio,desc_number_count,overall_text_quality,text_quality_percentile,host_response_time_encoded,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,property_type_grouped_Entire condo,property_type_grouped_Entire cottage,property_type_grouped_Entire guest suite,property_type_grouped_Entire guesthouse,property_type_grouped_Entire home,property_type_grouped_Entire rental unit,property_type_grouped_Entire serviced apartment,property_type_grouped_Entire townhouse,property_type_grouped_Other,property_type_grouped_Private room,property_type_grouped_Private room in bed and breakfast,property_type_grouped_Private room in condo,property_type_grouped_Private room in home,property_type_grouped_Private room in rental unit,property_type_grouped_Private room in townhouse,property_type_grouped_Room in hotel,estimated_image_quality_original,estimated_image_quality_standard,text_quality_category_Average,text_quality_category_Basic,text_quality_category_Exceptional,text_quality_category_Good,text_quality_category_High,text_quality_category_Premium
0,3.0,1.5,2.0,2.0,45.0,81,4.92,4.94,4.68,True,3.0,True,4.93,4.97,4.94,4.89,53.50153,-2.26249,5011.0,13.71937,15.0,1.5,True,63,True,8,True,jpg,False,5,9.1,47,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,False,False,True,True,False,True,True,True,True,True,True,False,False,False,True,True,True,True,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,9,6,7,4,5,1,1,3,1,1,0,0,0,5,1,1,34,6,0,0,False,False,False,False,True,0,False,False,0,False,True,248,43,3,4.767442,50.630853,0.0,0,0,0,0,3,0,1,0,1,2,0,1,0,1,0,0,0.016129,3,13.875,41.441756,1,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False
1,2.0,1.0,1.0,1.0,75.0,10,5.0,4.9,4.8,False,1.0,False,5.0,5.0,5.0,5.0,53.56271,-2.21824,4851.0,13.281314,37.5,2.0,True,63,True,8,True,jpg,False,5,9.1,22,True,True,False,True,True,True,True,True,True,True,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,9,3,0,2,1,0,0,0,1,1,0,1,0,5,1,3,31,5,1,0,False,True,False,False,True,0,False,False,0,False,False,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,10.125,12.585652,4,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False
2,1.0,1.0,1.0,1.0,38.0,65,4.66,4.27,4.81,True,6.0,True,4.75,4.78,4.86,4.65,53.48411,-2.22919,4813.0,13.177276,38.0,1.0,True,62,True,8,True,jpg,False,5,9.1,8,True,True,False,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,6,0,0,0,0,0,0,0,1,0,0,0,0,5,1,1,35,7,0,0,False,False,False,False,True,0,False,False,0,False,False,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,6.625,1.440358,1,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False
3,1.0,1.0,1.0,1.0,42.0,81,4.86,4.94,4.87,False,12.0,True,4.88,4.93,4.94,4.82,53.480172,-2.232849,4792.0,13.119781,42.0,1.0,True,73,True,0,False,jpg,False,4,4.0,29,True,False,True,True,False,False,True,True,True,True,True,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,7,4,0,5,4,0,1,0,1,1,0,0,0,3,1,1,35,4,0,1,False,False,False,False,False,1,False,False,0,True,False,191,33,2,4.818182,46.523864,1.0,1,0,2,1,2,0,0,0,0,0,0,0,0,1,0,0,0.272251,1,15.125,53.328206,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False
4,2.0,2.0,1.0,1.0,35.0,48,4.78,4.13,4.52,False,3.0,True,4.56,4.92,4.87,4.73,53.45466,-2.23542,4768.0,13.054073,17.5,2.0,True,63,True,8,True,jpg,False,5,9.1,28,True,True,False,True,False,True,True,True,True,True,True,True,False,False,False,False,False,True,True,True,False,False,False,True,True,True,True,False,False,False,True,True,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,8,2,7,2,3,0,1,0,0,0,0,0,0,4,0,2,35,6,0,0,False,False,False,False,True,0,False,False,0,False,False,244,42,6,4.833333,56.715714,0.952381,0,0,1,0,3,0,0,0,0,0,0,1,0,0,0,0,0.02459,0,11.380952,20.948119,1,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,False,False


In [625]:
listing_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6295 entries, 0 to 7150
Columns: 207 entries, accommodates to text_quality_category_Premium
dtypes: bool(132), float64(26), int64(48), object(1)
memory usage: 4.4+ MB


In [620]:
# Check for any remaining issues
remaining_missing = listing_data.isnull().sum()
cols_with_missing = remaining_missing[remaining_missing > 0]

print(cols_with_missing)

price                           856
review_scores_rating           1474
review_scores_cleanliness      1475
review_scores_location         1474
host_total_listings_count         2
review_scores_accuracy         1474
review_scores_checkin          1474
review_scores_communication    1474
review_scores_value            1474
host_days_active                  2
host_years_active                 2
price_per_person                856
dtype: int64


In [623]:
# Handle any remaining missing values
final_missing = listing_data.isnull().sum()
cols_with_final_missing = final_missing[final_missing > 0]

if len(cols_with_final_missing) > 0:
    
    # Drop rows with missing target variable (price)
    if 'price' in cols_with_final_missing.index:
        before_count = len(listing_data)
        listing_data = listing_data.dropna(subset=['price'])
        after_count = len(listing_data)
    
    # Fill remaining missing values
    for col in cols_with_final_missing.index:
        if col in listing_data.columns and col != 'price':
            if listing_data[col].dtype in ['float64', 'int64']:
                fill_value = listing_data[col].median()
            else:
                fill_value = listing_data[col].mode().iloc[0] if len(listing_data[col].mode()) > 0 else 'unknown'
            
            listing_data[col] = listing_data[col].fillna(fill_value)


In [624]:
output_filename = 'airbnb_processed_data.csv'
listing_data.to_csv(output_filename, index=False)