### Example:
- data cleaning
- feature creation
- feature engineering
- Sample data visualization

In [46]:
import pandas as pd
import numpy as np

# Load the raw holiday calendar
df = pd.read_csv(filepath_or_buffer='holidays.csv')

def process_month(month_str):
    """Return the number (1-12) of the first month named in ``month_str``.

    The value may name several months or a range (for example
    ``'March/April'`` or ``'Nov-Dec'``); only the first recognisable month is
    used. Matching is case-insensitive and accepts full English month names
    as well as their three-letter abbreviations (plus ``'Sept'``).

    Args:
        month_str: Raw month value; it is converted with ``str()`` first, so
            missing values are accepted.

    Returns:
        The month number as an ``int``, or ``np.nan`` when no month name can
        be recognised (including empty strings and missing values).
    """
    month_names = (
        'January', 'February', 'March', 'April',
        'May', 'June', 'July', 'August',
        'September', 'October', 'November', 'December',
    )
    month_map = {}
    for number, name in enumerate(month_names, start=1):
        month_map[name.lower()] = number
        month_map[name[:3].lower()] = number
    month_map['sept'] = 9

    # Treat range/list separators like whitespace so every month name becomes
    # its own token; an empty or separator-only value then simply yields no
    # tokens instead of raising IndexError.
    text = str(month_str)
    for separator in ('/', '-', '\u2013', '\u2014', ','):
        text = text.replace(separator, ' ')

    for token in text.split():
        number = month_map.get(token.strip('.;:()').lower())
        if number is not None:
            return number
    return np.nan

def process_day(day_str):
    """Return the first day number found in ``day_str``.

    Handles single days, ranges (``'13-15'``), lists (``'5/6'``), values that
    pandas loaded as floats (``13.0``) and ordinals (``'1st'``).

    Args:
        day_str: Raw day value from the holiday table.

    Returns:
        The first day as an ``int``, or ``np.nan`` when the value is missing,
        is ``'Varies'``, mentions ``'Full Moon'``, is empty, or does not start
        with a number.
    """
    if pd.isna(day_str) or day_str == 'Varies' or 'Full Moon' in str(day_str):
        return np.nan

    # Keep only the first entry of a range or list.
    tokens = str(day_str).split('-')[0].split('/')[0].split()
    if not tokens:
        # Empty or separator-leading input: there is no first day to read.
        return np.nan

    token = tokens[0]
    try:
        return int(token)
    except ValueError:
        pass

    # Fall back to the leading digits so '13.0' and '1st' still yield a day.
    digits = ''
    for char in token:
        if char not in '0123456789':
            break
        digits += char
    return int(digits) if digits else np.nan

def process_holiday_type(type_str):
    """Map a free-text holiday type onto one of five coarse categories.

    The text is lower-cased and checked against keyword groups in priority
    order; the first group with a matching keyword decides the category.
    Anything that matches no group is labelled ``'Other'``.
    """
    lowered = str(type_str).lower()

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        (('thai national',), 'Thai National'),
        (('thai festival', 'thai buddhist'), 'Thai Festival'),
        (('international', 'regional'), 'International'),
        (('general',), 'General Season'),
    )
    for keywords, category in rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Other'

def process_season(season_str):
    """Map a free-text season description onto a coarse season label.

    The text is lower-cased and checked against keyword groups in priority
    order (high, then shoulder, then low). Text matching no group is
    labelled ``'Unknown'``.
    """
    lowered = str(season_str).lower()

    # Order matters: a value mentioning both "peak" and "low season" is high.
    rules = (
        (('high season', 'peak'), 'High Season'),
        (('shoulder season',), 'Shoulder Season'),
        (('low season',), 'Low Season'),
    )
    for keywords, label in rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return 'Unknown'

# Assemble the cleaned holiday table: the holiday name is carried over as-is,
# every other column is derived from the raw data by one of the helpers above.
processed_df = pd.DataFrame({
    'Holiday/Festival': df['Holiday/Festival'],
    'Processed_Month': df['Month'].apply(process_month),
    'Processed_Day': df['Day'].apply(process_day),
    'Holiday_Type': df['Type'].apply(process_holiday_type),
    'Season_Flag': df['Season'].apply(process_season),
})

# Persist the result for the merge step
processed_df.to_csv(path_or_buf='processed_holidays.csv', index=False)

In [48]:
import pandas as pd
import numpy as np

# Load the raw tourist-arrival table
df = pd.read_csv(filepath_or_buffer='thailand_tourism.csv')

# Function to process the data
def process_tourism_data(df):
    """Reshape the wide tourist-arrival table and total arrivals per month.

    Args:
        df: Wide table with an identifier column named ``'Months'`` and one
            column per period whose header looks like ``'Jan 2023'``
            (``'%b %Y'``). Cells may be numbers or strings with thousands
            separators (``'1,234'``).

    Returns:
        A ``(melted_df, monthly_totals)`` tuple. ``melted_df`` has one row per
        (``Months`` value, period) with ``Date``, ``Tourists`` (float) and
        ``Month`` columns. ``monthly_totals`` has ``Month`` and ``Tourists``
        columns, where ``Tourists`` is the sum over every row sharing that
        calendar month (so periods from different years are added together).

    Raises:
        ValueError: If a cell cannot be read as a number or a column header
            does not match ``'%b %Y'``.
    """
    # Melt the dataframe to convert period columns into rows
    melted_df = pd.melt(df,
                        id_vars=['Months'],
                        var_name='Date',
                        value_name='Tourists')

    # Strip thousands separators from string cells only. Columns that pandas
    # already parsed as numbers end up in the same melted column and must be
    # left alone: the ``.str`` accessor would turn them into NaN.
    cleaned = melted_df['Tourists'].map(
        lambda value: value.replace(',', '').strip()
        if isinstance(value, str) else value
    )
    melted_df['Tourists'] = pd.to_numeric(cleaned).astype(float)

    # Parse the period labels, tolerating stray whitespace in the headers
    period_labels = melted_df['Date'].astype(str).str.strip()
    melted_df['Date'] = pd.to_datetime(period_labels, format='%b %Y')

    # Extract numeric month
    melted_df['Month'] = melted_df['Date'].dt.month

    # Total the arrivals for each calendar month
    monthly_totals = melted_df.groupby('Month')['Tourists'].sum().reset_index()

    return melted_df, monthly_totals

# Reshape the tourism table and compute per-month totals
melted_df, monthly_totals = process_tourism_data(df=df)

# Persist the monthly totals for the merge step
monthly_totals.to_csv(path_or_buf='processed_tourism.csv', index=False)

In [58]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load the raw Airbnb listings
df = pd.read_csv(filepath_or_buffer='listings.csv')

def process_airbnb_data(df):
    """Clean Airbnb listings and build the modelling columns.

    Listings without a ``last_review`` or without a usable ``price`` are
    dropped. Prices stored as text may carry a currency symbol and thousands
    separators (``'$1,200.00'``); these are stripped before conversion.

    Args:
        df: Listings table with ``last_review``, ``price`` and ``room_type``
            columns. It is not modified.

    Returns:
        A ``(final_df, processed_df)`` tuple. ``final_df`` holds
        ``last_review_month``, ``price`` and one ``room_type_*`` dummy column
        per room type. ``processed_df`` is the cleaned copy of the input with
        ``last_review`` parsed to datetime and ``last_review_month`` added.
    """
    # Work on a copy and drop listings that lack a review date or a price
    processed_df = df.copy().dropna(subset=['last_review', 'price'])

    # Convert last_review to datetime and extract month
    processed_df['last_review'] = pd.to_datetime(processed_df['last_review'])
    processed_df['last_review_month'] = processed_df['last_review'].dt.month

    # Text prices need their currency symbol and separators removed first,
    # otherwise the coercion below would turn every one of them into NaN.
    price = processed_df['price']
    if not pd.api.types.is_numeric_dtype(price):
        price = price.astype(str).str.replace(r'[$€£¥฿,\s]', '', regex=True)
    processed_df['price'] = pd.to_numeric(price, errors='coerce')

    # Coercion can introduce new NaN values (e.g. 'n/a'); drop those rows so
    # the "no missing price" guarantee also holds after conversion.
    processed_df = processed_df.dropna(subset=['price'])

    # Create dummy variables for room_type
    room_type_dummies = pd.get_dummies(processed_df['room_type'], prefix='room_type')

    # Combine relevant columns
    final_df = pd.concat([
        processed_df[['last_review_month', 'price']],
        room_type_dummies
    ], axis=1)

    return final_df, processed_df

# Clean the listings; keep both the modelling frame and the full cleaned copy
processed_df, full_processed_df = process_airbnb_data(df=df)

# Count listings per room type from the dummy columns, most common first
room_type_cols = processed_df.columns[
    processed_df.columns.str.startswith('room_type_')
].tolist()
room_type_distribution = processed_df[room_type_cols].sum()
room_type_distribution = room_type_distribution.sort_values(ascending=False)

# Persist the modelling frame for the merge step
processed_df.to_csv(path_or_buf='processed_listings.csv', index=False)


In [59]:
import pandas as pd
import numpy as np

# Load the three intermediate files written by the earlier cells
listings_df = pd.read_csv(filepath_or_buffer='processed_listings.csv')
tourism_df = pd.read_csv(filepath_or_buffer='processed_tourism.csv')
holidays_df = pd.read_csv(filepath_or_buffer='processed_holidays.csv')

def merge_datasets(listings_df, tourism_df, holidays_df):
    """Attach monthly tourist totals and holiday info to each listing.

    Both joins are left joins keyed on the listing's ``last_review_month``.

    NOTE(review): when ``holidays_df`` has several rows for the same month,
    the left join repeats every matching listing once per holiday, so the
    result can have more rows than ``listings_df``. Confirm that downstream
    analysis expects one row per (listing, holiday) pair.

    Args:
        listings_df: Listings with ``last_review_month``, ``price`` and
            ``room_type_*`` columns.
        tourism_df: Monthly totals with ``Month`` and ``Tourists`` columns.
        holidays_df: Holidays with ``Holiday/Festival``, ``Processed_Month``
            and ``Season_Flag`` columns (``Processed_Day`` and
            ``Holiday_Type`` are dropped if present).

    Returns:
        DataFrame with columns ``last_review_month``, ``price``, ``Tourists``,
        ``Holiday/Festival``, ``Season_Flag`` followed by every
        ``room_type_*`` column. Missing holiday names become
        ``'No Holiday'``, missing season flags ``'Regular Season'`` and
        missing tourist counts the mean of the available counts.
    """
    # First merge: Listings with Tourism
    merged_df = pd.merge(
        listings_df,
        tourism_df,
        left_on='last_review_month',
        right_on='Month',
        how='left'
    )

    # Drop duplicate Month column
    merged_df = merged_df.drop('Month', axis=1)

    # pandas matches NaN join keys to each other, so holidays whose month
    # could not be parsed would be attached to any listing with a missing
    # month. Exclude them from the join.
    dated_holidays = holidays_df.dropna(subset=['Processed_Month'])

    # Second merge: Add Holiday data
    final_df = pd.merge(
        merged_df,
        dated_holidays,
        left_on='last_review_month',
        right_on='Processed_Month',
        how='left'
    )

    # Drop duplicate month column and unnecessary columns
    columns_to_drop = ['Processed_Month', 'Processed_Day', 'Holiday_Type']
    final_df = final_df.drop(columns_to_drop, axis=1, errors='ignore')

    # Handle missing values
    final_df['Holiday/Festival'] = final_df['Holiday/Festival'].fillna('No Holiday')
    final_df['Season_Flag'] = final_df['Season_Flag'].fillna('Regular Season')
    final_df['Tourists'] = final_df['Tourists'].fillna(final_df['Tourists'].mean())

    # Reorder columns for clarity
    column_order = [
        'last_review_month',
        'price',
        'Tourists',
        'Holiday/Festival',
        'Season_Flag'
    ]

    # Add room type columns to the order
    room_type_cols = [col for col in final_df.columns if col.startswith('room_type_')]
    column_order.extend(room_type_cols)

    # Select and order columns
    final_df = final_df[column_order]

    return final_df

# Combine listings, tourism totals and holidays into one table
final_df = merge_datasets(
    listings_df=listings_df,
    tourism_df=tourism_df,
    holidays_df=holidays_df,
)

# Persist the merged table
final_df.to_csv(path_or_buf='merged_tourism_data.csv', index=False)