### Example:
- data cleaning
- feature creation
- feature engineering
- sample data visualization

In [22]:
import pandas as pd
import numpy as np

In [23]:
# Load the raw holidays dataset (holidays.csv); the path is relative to the
# current working directory. The steps below read its 'Month', 'Day', 'Type'
# and 'Season' columns.
df = pd.read_csv('holidays.csv')

# Preprocess the 'Month' and 'Day' columns
def process_month_day(row):
    """Return the part of the row's 'Month' value before the first '/'.

    For example 'Mar/Apr' becomes 'Mar'; a value without '/' is returned
    unchanged.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Month' key.

    Returns:
        The text before the first '/', or the original value when it is not
        a string (for example NaN coming from an empty CSV cell).
    """
    month = row['Month']
    # An empty cell arrives as float NaN, and `'/' in month` would raise
    # TypeError on it, so non-string values are passed through untouched.
    if not isinstance(month, str):
        return month
    # partition() yields the whole string when no '/' is present.
    return month.partition('/')[0]

# Run process_month_day once per row (axis=1 passes each row as a Series)
# and keep the results in a new 'Processed_Month' column.
df['Processed_Month'] = df.apply(process_month_day, axis=1)

def process_day(row):
    """Normalise the row's 'Day' value to a single day.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Day' key.

    Returns:
        * NaN when the text contains 'Varies' (no fixed day),
        * the part before the first '-' for values such as '13-15',
        * the value unchanged otherwise, including non-string values such
          as NaN from an empty CSV cell.
    """
    day = row['Day']
    # Substring tests raise TypeError on NaN or numeric cells, so anything
    # that is not text is returned as-is (a missing value stays missing).
    if not isinstance(day, str):
        return day
    if 'Varies' in day:
        return np.nan
    # partition() yields the whole string when no '-' is present.
    return day.partition('-')[0]

# Run process_day once per row (axis=1 passes each row as a Series) and keep
# the results in a new 'Processed_Day' column.
df['Processed_Day'] = df.apply(process_day, axis=1)

# Add holiday type flags
def holiday_type(row):
    """Map the row's free-text 'Type' value to a short holiday category.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Type' key.

    Returns:
        'Thai National', 'Thai Festival', 'International' or 'Regional' for
        the first matching phrase (checked in that order), otherwise
        'Other'. A non-string value, such as NaN from an empty cell, is
        also reported as 'Other'.
    """
    label = row['Type']
    # `in` on a float NaN raises TypeError; a missing type has no category.
    if not isinstance(label, str):
        return 'Other'
    # Order matters: when several phrases occur, the earliest entry wins.
    categories = (
        ('Thai National Holiday', 'Thai National'),
        ('Thai Festival', 'Thai Festival'),
        ('International Holiday', 'International'),
        ('Regional Holiday', 'Regional'),
    )
    for phrase, category in categories:
        if phrase in label:
            return category
    return 'Other'

# Run holiday_type once per row (axis=1 passes each row as a Series) and keep
# the resulting category in a new 'Holiday_Type' column.
df['Holiday_Type'] = df.apply(holiday_type, axis=1)

# Add seasonality flags
def season_flag(row):
    """Map the row's free-text 'Season' value to a season label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Season' key.

    Returns:
        'High Season', 'Shoulder Season' or 'Low Season' for the first
        phrase found (checked in that order). Text containing none of them
        yields 'High Season'. A non-string value, such as NaN from an empty
        cell, yields NaN so that missing data stays missing.
    """
    season = row['Season']
    # `in` on a float NaN raises TypeError, so handle missing cells first.
    if not isinstance(season, str):
        return np.nan
    for label in ('High Season', 'Shoulder Season', 'Low Season'):
        if label in season:
            return label
    # NOTE(review): unrecognised text is labelled 'High Season', the same as
    # an explicit match. Confirm this default is intended rather than a
    # neutral value such as 'Unknown'.
    return 'High Season'

# Run season_flag once per row (axis=1 passes each row as a Series) and keep
# the resulting label in a new 'Season_Flag' column.
df['Season_Flag'] = df.apply(season_flag, axis=1)

# Save the updated dataset with new features; index=False keeps the
# DataFrame's row index out of the CSV.
df.to_csv('processed_holidays.csv', index=False)


In [41]:
# Load the dataset. The steps below treat the first column ('Months') as a
# label column and every other column header as a month in '%b %Y' form
# (e.g. 'Jan 2023'). Note that this rebinds the name `df`.
file_path = 'thailand-tourism-data.csv'
df = pd.read_csv(file_path)

# Clean column names by stripping extra spaces
df.columns = df.columns.str.strip()

# Remove thousands separators and convert the counts to integers.
# astype(str) comes first because read_csv already parses a column without
# any commas as numeric, and the .str accessor raises AttributeError on
# numeric columns. regex=False makes the comma a literal match.
for column in df.columns[1:]:  # Skip the first column, which is 'Months'
    df[column] = (
        df[column].astype(str).str.replace(',', '', regex=False).astype(int)
    )

# Drop the 'Months' label column and total each remaining column over all
# rows (axis=0), giving one total per month header.
# NOTE(review): presumably each row is a region - confirm against the CSV.
df_sum = df.drop('Months', axis=1).sum(axis=0)

# Turn the totals Series into a two-column DataFrame; reset_index moves the
# month headers out of the index into a regular column.
df_cleaned = pd.DataFrame(df_sum).reset_index()
df_cleaned.columns = ['Month', 'Total_Tourists']

# Sort chronologically: parse the headers as dates first, because sorting
# the raw text would order months alphabetically.
df_cleaned['Month'] = pd.to_datetime(df_cleaned['Month'], format='%b %Y')
df_cleaned = df_cleaned.sort_values('Month')

# Convert back to the original month format
df_cleaned['Month'] = df_cleaned['Month'].dt.strftime('%b %Y')

# Save the cleaned data to a new file
df_cleaned.to_csv('processed_thailand_tourism_monthly.csv', index=False)


In [56]:
# Step 1: Clean the Airbnb data
airbnb_df = pd.read_csv('listings.csv')

# Clean 'price' by removing the currency symbol and commas, then convert to
# float. The pattern is a raw string: '\$' in a normal string literal is an
# invalid escape sequence, which newer Python versions warn about.
airbnb_df['price'] = (
    airbnb_df['price'].replace({r'\$': '', ',': ''}, regex=True).astype(float)
)

# Drop rows where 'price' is missing, as it's important for the analysis
airbnb_df = airbnb_df.dropna(subset=['price'])

# Fill missing 'reviews_per_month' with 0 (assuming missing means no reviews)
airbnb_df['reviews_per_month'] = airbnb_df['reviews_per_month'].fillna(0)

# Fill missing 'minimum_nights' with the median
airbnb_df['minimum_nights'] = airbnb_df['minimum_nights'].fillna(
    airbnb_df['minimum_nights'].median()
)

# Fill missing 'number_of_reviews' with 0 (assuming missing means no reviews)
airbnb_df['number_of_reviews'] = airbnb_df['number_of_reviews'].fillna(0)

# Convert 'last_review' to datetime; unparseable values become NaT
airbnb_df['last_review'] = pd.to_datetime(airbnb_df['last_review'], errors='coerce')

# Extract month from 'last_review' and add it as a new column 'last_review_month'
airbnb_df['last_review_month'] = airbnb_df['last_review'].dt.month

# Rows without a usable review date get the placeholder month 0, which lets
# the column be stored as int.
airbnb_df['last_review_month'] = airbnb_df['last_review_month'].fillna(0).astype(int)

# Step 2: Feature Engineering

# Price per night (price / minimum_nights) for understanding price scaling
# NOTE(review): a 'minimum_nights' of 0 yields inf here - confirm the data
# never contains 0.
airbnb_df['price_per_night'] = airbnb_df['price'] / airbnb_df['minimum_nights']

# One-hot encode 'room_type' (drop_first=True omits the first category)
airbnb_df = pd.get_dummies(airbnb_df, columns=['room_type'], drop_first=True)

# Remove all neighbourhood-related columns. The substring match also covers
# 'neighbourhood_group', so no separate drop is needed, and a file without
# that column no longer raises KeyError.
airbnb_df = airbnb_df.drop(
    columns=[col for col in airbnb_df.columns if 'neighbourhood' in col]
)

# Remove any irrelevant columns for modeling
airbnb_df = airbnb_df.drop(columns=['host_name', 'id'])

# Save the cleaned and processed dataset
airbnb_df.to_csv('processed_listings.csv', index=False)

In [60]:
# Show the column index of each dataframe. sep="\n" puts the heading and the
# columns on separate lines, one print call per dataframe.
# NOTE(review): tourism_df and holiday_df are not assigned in the cells shown
# above - confirm they are defined before this cell runs.
print("Airbnb DataFrame columns:", airbnb_df.columns, sep="\n")

print("\nTourism DataFrame columns:", tourism_df.columns, sep="\n")

print("\nHoliday DataFrame columns:", holiday_df.columns, sep="\n")


KeyError: "['listing_id', 'location'] not in index"