# Airbnb NYC 2019 Data Exploration

This notebook performs the following analyses:
1. Import libraries and load data
2. Data Overview
   - Data size
   - Missing Values
   - Basic statistics
3. Visualization Results

## 1. Import libraries and load data

In [None]:
import sys
import os
import numpy as np


sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))
from visualization import *
from data_processing import *

# Setup display
%matplotlib inline

In [None]:
# Read the raw CSV with the project loader. The first row is kept as the
# header (column names); everything after it is treated as the data rows.
data_numpy = read_csv('../data/raw/AB_NYC_2019.csv')
column_names, data = data_numpy[0], data_numpy[1:]

print("Data loaded successfully")

## 2. Data Overview

### 2.1. Data size

In [None]:
# Report the table dimensions (the header row is not part of `data`).
n_rows = data.shape[0]
n_cols = data.shape[1]

print(f"Number of rows: {n_rows:,}")
print(f"Number of columns: {n_cols}")
print(f"\nTotal data cells: {n_rows * n_cols:,}")

In [None]:
# List every column name together with its 1-based position.
print("\nColumn information:")
print(f"Total columns: {len(column_names)}")
print("\nColumn list:")
for position, col_name in enumerate(column_names, start=1):
    print(f"  {position}. {col_name}")

In [None]:
# Preview the leading rows; slicing yields fewer rows when the table has
# less than five of them.
print("\nFirst 5 rows of data:")
for row_number, row in enumerate(data[:5], start=1):
    print(f"Row {row_number}:", row)

### 2.2. Missing Values

In [None]:
# Text summary: one entry per column that has at least one empty-string cell.
print("Columns with missing values")
total_rows = len(data)
for col_pos in range(len(column_names)):
    missing_count = np.sum(data[:, col_pos] == '')
    if missing_count <= 0:
        # Fully populated columns are not reported
        continue

    missing_percent = missing_count / total_rows * 100
    print(f"\n{column_names[col_pos]}:")
    print(f"  - Missing count     : {missing_count:,}")
    print(f"  - Missing percentage: {missing_percent:.3f}%")

# Chart 1: Missing values for host_name and name
plot_missing_values(
    data,
    column_names,
    ['host_name', 'name'],
    title='Missing Values: host_name and name',
)

# Chart 2: Missing values for reviews_per_month and last_review
plot_missing_values(
    data,
    column_names,
    ['reviews_per_month', 'last_review'],
    title='Missing Values: reviews_per_month and last_review',
)

### 2.3. Basic statistics

In [None]:
# Names of the numeric columns to summarise; each one is looked up in
# column_names below and skipped if the dataset does not contain it.
numeric_columns = ['price', 'minimum_nights', 'number_of_reviews', 
                   'reviews_per_month', 'calculated_host_listings_count', 'availability_365',
                   'latitude', 'longitude']

print("Basic statistics for numeric variables:")
for col in numeric_columns:
    if col not in column_names:
        continue

    col_idx = np.where(column_names == col)[0][0]
    col_data = data[:, col_idx]

    # Convert to float, leaving out empty cells and cells that do not parse.
    numeric_data = []
    for val in col_data:
        if val == '':
            continue
        try:
            numeric_data.append(float(val))
        except (ValueError, TypeError):
            # Only conversion failures are ignored; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            continue

    if not numeric_data:
        # Nothing parseable in this column, so there is nothing to report
        continue

    numeric_data = np.array(numeric_data)
    q25, q50, q75 = np.percentile(numeric_data, [25, 50, 75])

    print(f"\n{col}:")
    print(f"  Count        : {len(numeric_data)}")
    print(f"  Mean         : {np.mean(numeric_data):.3f}")
    # np.std defaults to the population standard deviation (ddof=0)
    print(f"  Std          : {np.std(numeric_data):.3f}")
    print(f"  Min          : {np.min(numeric_data):.3f}")
    print(f"  25%          : {q25:.3f}")
    print(f"  50% - Median : {q50:.3f}")
    print(f"  75%          : {q75:.3f}")
    print(f"  Max          : {np.max(numeric_data):.3f}")

In [None]:
# For each categorical column: number of distinct values and the five most
# frequent ones with their counts.
categorical_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']
print(f"List of columns to process: {categorical_columns}")
print("Statistics for categorical variables:")

for col in categorical_columns:
    col_idx = np.flatnonzero(column_names == col)[0]
    col_data = data[:, col_idx]
    unique_vals, counts = np.unique(col_data, return_counts=True)

    print(f"\n{col.upper()}:")
    print(f"  - Number of unique values: {len(unique_vals)}")
    print("  - Top 5 most common values:")

    # argsort is ascending, so reverse it and keep the first five positions
    top_positions = np.argsort(counts)[::-1][:5]
    for label, frequency in zip(unique_vals[top_positions], counts[top_positions]):
        print(f"      {label}: {frequency}")

## 3. Visualization Results

### 3.2. Price Distribution

In [None]:
# Locate the price column in the raw table
price_idx = np.where(column_names == 'price')[0][0]
price_data = data[:, price_idx]

# Convert to float. Empty or unparseable cells become NaN rather than being
# dropped, so `prices` keeps one entry per row of `data` and can be paired
# position-by-position with other columns in later cells.
prices = []
for val in price_data:
    try:
        prices.append(float(val) if val != '' else np.nan)
    except (ValueError, TypeError):
        # Only conversion failures map to NaN; other exceptions propagate
        prices.append(np.nan)

prices = np.array(prices)

# Plot price distribution
plot_distribution(prices, 'price', bins=100, title='Airbnb Listing Price Distribution')

In [None]:
# Zoom in on listings priced at or below $500 (NaN prices excluded) so the
# bulk of the distribution is easier to see.
at_most_500 = np.logical_and(prices <= 500, np.logical_not(np.isnan(prices)))
prices_filtered = prices[at_most_500]
plot_distribution(
    prices_filtered,
    'price',
    bins=50,
    title='Price Distribution (<= $500)',
)

### 3.3. Distribution by Neighbourhood Group

In [None]:
# Extract the neighbourhood_group column and chart how the listings are
# spread across its values.
ng_idx = np.flatnonzero(column_names == 'neighbourhood_group')[0]
neighbourhood_groups = data[:, ng_idx]

plot_categorical_distribution(
    neighbourhood_groups,
    'neighbourhood_group',
    top_n=5,
)

### 3.4. Average price by neighbourhood group

In [None]:
# Chart price per neighbourhood_group; `prices` is aligned row-for-row with
# `neighbourhood_groups` (both are derived from the same rows of `data`).
plot_price_by_category(
    neighbourhood_groups,
    prices,
    'neighbourhood_group',
    top_n=5,
)

### 3.4.1. Price density and distribution by neighbourhood group (Violin Plot)

In [None]:
# Violin plot of price per neighbourhood_group, restricted to rows whose
# price is known (not NaN) and at most 500.
# NOTE: the `_600` suffix in the variable name below does not match the
# cut-off actually applied, which is 500.
violin_mask = (prices <= 500) & (~np.isnan(prices))
prices_filtered_600 = prices[violin_mask]
neighbourhood_groups_filtered = neighbourhood_groups[violin_mask]

plot_price_density_by_neighbourhood(
    neighbourhood_groups_filtered,
    prices_filtered_600,
)

### 3.5. Distribution by Room Type

In [None]:
# Extract the room_type column and chart how the listings are spread across
# its values.
rt_idx = np.flatnonzero(column_names == 'room_type')[0]
room_types = data[:, rt_idx]

plot_categorical_distribution(
    room_types,
    'room_type',
    top_n=3,
)

### 3.6. Price distribution by room type

In [None]:
# Compare price distributions across room types; `room_types` and `prices`
# are passed as parallel arrays taken from the same rows of `data`.
plot_price_distribution_by_room_type(
    room_types,
    prices,
)

### 3.7. Geographic distribution of listings

In [None]:
# Locate the coordinate columns
lat_idx = np.where(column_names == 'latitude')[0][0]
lon_idx = np.where(column_names == 'longitude')[0][0]

# Convert coordinates row by row. If either value of a row fails to parse,
# BOTH coordinates of that row become NaN, so the two arrays stay aligned
# with each other and with the rows of `data`.
latitudes = []
longitudes = []

for i in range(len(data)):
    try:
        lat = float(data[i, lat_idx])
        lon = float(data[i, lon_idx])
    except (ValueError, TypeError):
        # Only conversion failures are mapped to NaN; other exceptions
        # (e.g. KeyboardInterrupt) propagate instead of being swallowed.
        lat = lon = np.nan
    latitudes.append(lat)
    longitudes.append(lon)

latitudes = np.array(latitudes)
longitudes = np.array(longitudes)

# Blank out prices above 500 (set to NaN) before using them as color values
prices_filtered_geo = np.where(prices > 500, np.nan, prices)

# Plot geographic distribution
plot_geographical_distribution(longitudes, latitudes, 
                               color_data=prices_filtered_geo,
                               title='Geographic Distribution of Airbnb Listings in NYC (color = price)')

### 3.8. Correlation matrix between numeric variables

In [None]:
# Columns requested for the correlation matrix
numeric_col_names = ['price', 'minimum_nights', 'number_of_reviews', 
                     'reviews_per_month', 'calculated_host_listings_count', 
                     'availability_365', 'latitude', 'longitude']

numeric_data_list = []
# Names of the columns actually found in the dataset, in matrix order.
# Passing this list (rather than numeric_col_names) to the heatmap keeps the
# labels in step with the matrix columns when a requested column is absent.
used_col_names = []

for col in numeric_col_names:
    if col not in column_names:
        continue

    col_idx = np.where(column_names == col)[0][0]
    col_data = data[:, col_idx]

    # Convert to numeric. Empty or unparseable cells are filled with 0 so
    # every column keeps exactly one value per row, which np.column_stack
    # requires. NOTE(review): the 0 fill takes part in the correlation
    # values; confirm this is the intended treatment of missing data.
    numeric_vals = []
    for val in col_data:
        try:
            numeric_vals.append(float(val) if val != '' else 0)
        except (ValueError, TypeError):
            # Only conversion failures are replaced; other exceptions propagate
            numeric_vals.append(0)

    numeric_data_list.append(np.array(numeric_vals))
    used_col_names.append(col)

# Stack into 2D matrix: one row per listing, one column per found name
numeric_matrix = np.column_stack(numeric_data_list)

# Plot correlation heatmap
plot_correlation_heatmap(numeric_matrix, used_col_names)

### 3.9. Top hosts with most listings

In [None]:
# Extract the host identifier and host name columns as parallel arrays
host_id_idx = np.flatnonzero(column_names == 'host_id')[0]
host_name_idx = np.flatnonzero(column_names == 'host_name')[0]
host_ids, host_names = data[:, host_id_idx], data[:, host_name_idx]

# Chart the 15 hosts requested via top_n
plot_top_hosts(
    host_ids,
    host_names,
    top_n=15,
)

### 3.10. Top 10 most and least expensive neighbourhoods

In [None]:
# Extract the neighbourhood column (finer-grained than neighbourhood_group)
neighbourhood_idx = np.flatnonzero(column_names == 'neighbourhood')[0]
neighbourhoods = data[:, neighbourhood_idx]

# Ten neighbourhoods at the expensive end (most_expensive=True)
plot_top_expensive_neighbourhoods(
    neighbourhoods,
    prices,
    top_n=10,
    most_expensive=True,
)

In [None]:
# Same chart as the previous cell, but for the cheap end
# (most_expensive=False); reuses `neighbourhoods` and `prices`.
plot_top_expensive_neighbourhoods(
    neighbourhoods,
    prices,
    top_n=10,
    most_expensive=False,
)

### 3.11. Top 10 most common words in listing names

In [None]:
# Extract the listing-name column. Note that `listing_names` is not passed
# to the plotting call below, which receives the full table and header.
name_idx = np.flatnonzero(column_names == 'name')[0]
listing_names = data[:, name_idx]

# Chart the 10 most common words requested via top_n
plot_top_words_name(
    data,
    column_names,
    top_n=10,
)