# Clean the Airbnb Dataset

## Downloading the Dataset

You will need to download some prerequisite packages in order to run all the code below. Let's install them!

In [2]:
%%capture
!pip install numpy pandas streamlit gdown currencyconverter

Collecting gdown
  Downloading gdown-4.6.4-py3-none-any.whl (14 kB)
Collecting currencyconverter
  Downloading CurrencyConverter-0.17.5-py3-none-any.whl (563 kB)
     -------------------------------------- 563.6/563.6 kB 2.1 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting PySocks!=1.5.7,>=1.5.6
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: PySocks, filelock, currencyconverter, gdown
Successfully installed PySocks-1.7.1 currencyconverter-0.17.5 filelock-3.9.0 gdown-4.6.4


In [3]:
import os
import shutil
import gdown

import numpy as np
from numpy import genfromtxt

# For readability purposes, we will disable scientific notation for numbers
np.set_printoptions(suppress=True)

# Download file from Google Drive
# This file is based on data from: http://insideairbnb.com/get-the-data/
file_id_1 = "13fyESiH1ZEnMV6eabAyhe20t4W6peEWK"
downloaded_file_1 = "WK1_Airbnb_Amsterdam_listings_proj.csv"

# Download the file from Google Drive
gdown.download(id=file_id_1, output=downloaded_file_1)

Downloading...
From: https://drive.google.com/uc?id=13fyESiH1ZEnMV6eabAyhe20t4W6peEWK
To: C:\Users\panos\Desktop\Personal\Extra Knowledge\Python For Data Science Course\Projects\WK1_Airbnb_Amsterdam_listings_proj.csv
100%|████████████████████████████████████████████████████████████████████████████████| 246k/246k [00:00<00:00, 600kB/s]


'WK1_Airbnb_Amsterdam_listings_proj.csv'

## Preprocessing the Dataset

In [5]:
#my_data[1:10] As we see,we find out that our delimiter is '|'
my_data = genfromtxt(downloaded_file_1,delimiter='|',dtype='unicode')

#Check our first four columns
my_data[:5,:4]

array([['', '0', '1', '2'],
       ['id', '23726706', '35815036', '31553121'],
       ['price', '$88.00', '$105.00', '$152.00'],
       ['latitude', '52.34916', '52.42419', '52.43237'],
       ['longitude', '4.97879', '4.95689', '4.91821']], dtype='<U18')

In [6]:
# Remove the first column and row, because we don't need them
matrix = np.delete(my_data,0,axis=0)
matrix = np.delete(matrix,0,axis=1)

# Print out the first four columns
matrix[:5,:4]

array([['23726706', '35815036', '31553121', '34745823'],
       ['$88.00', '$105.00', '$152.00', '$87.00'],
       ['52.34916', '52.42419', '52.43237', '52.2962'],
       ['4.97879', '4.95689', '4.91821', '5.01231']], dtype='<U18')

In [7]:
# Shift the matrix by 90 degrees
matrix = matrix.transpose()

# Print out the first five rows
# Entries: airbnb_id, price_usd, latitude, longitude
matrix[:5,:4]

array([['23726706', '$88.00', '52.34916', '4.97879'],
       ['35815036', '$105.00', '52.42419', '4.95689'],
       ['31553121', '$152.00', '52.43237', '4.91821'],
       ['34745823', '$87.00', '52.2962', '5.01231'],
       ['44586947', '$160.00', '52.31475', '5.0303']], dtype='<U18')

String characters like commas and dollar signs are yet again present in the dataset.So we remove them

In [10]:
# Remove the dollar sign
matrix = np.char.replace(matrix,'$','')

# Remove the comma
matrix = np.char.replace(matrix,',','')

array([], dtype='<U18')

In [11]:
# Check if the dollar sign is in our dataset
matrix[np.char.find(matrix,'$')>-1]

array([], dtype='<U18')

In [12]:
# Check if the comma sign is in our dataset
matrix[np.char.find(matrix,',')>-1]

array([], dtype='<U18')

Enabling numerical operations (calculations) requires you to change the `dtype` from string/Unicode characters to float of 32-bit precision

In [13]:
# Change Unicode to float32
matrix = matrix.astype('float32')

# Print out the first five rows (and inspect the dtype for correctness)
# Entries: airbnb_id, price_usd, latitude, longitude
matrix[:5,:4]

array([[23726706.     ,       88.     ,       52.34916,        4.97879],
       [35815036.     ,      105.     ,       52.42419,        4.95689],
       [31553120.     ,      152.     ,       52.43237,        4.91821],
       [34745824.     ,       87.     ,       52.2962 ,        5.01231],
       [44586948.     ,      160.     ,       52.31475,        5.0303 ]],
      dtype=float32)

## Pick the right price

Our next objective is to change the currency from US dollars to another currency. This can be any currency you like, except for the US dollar. Let's first import the library that helps us to make these conversions. Then let's have another look at the first 5 rows of our matrix.

In [14]:
from currency_converter import CurrencyConverter

cc = CurrencyConverter()

# Entries: airbnb_id, price_usd, latitude, longitude
matrix[:5,:]

array([[23726706.     ,       88.     ,       52.34916,        4.97879],
       [35815036.     ,      105.     ,       52.42419,        4.95689],
       [31553120.     ,      152.     ,       52.43237,        4.91821],
       [34745824.     ,       87.     ,       52.2962 ,        5.01231],
       [44586948.     ,      160.     ,       52.31475,        5.0303 ]],
      dtype=float32)

In [None]:
# Change Unicode to float32
matrix = matrix.astype('float32')

# Print out the first five rows (and inspect the dtype for correctness)
# Entries: airbnb_id, price_usd, latitude, longitude
matrix[:5,:4]

In [18]:
#The tool we are using has a total of 42 currencies.
#Check them below and chose one
cc.currencies

{'AUD',
 'BGN',
 'BRL',
 'CAD',
 'CHF',
 'CNY',
 'CYP',
 'CZK',
 'DKK',
 'EEK',
 'EUR',
 'GBP',
 'HKD',
 'HRK',
 'HUF',
 'IDR',
 'ILS',
 'INR',
 'ISK',
 'JPY',
 'KRW',
 'LTL',
 'LVL',
 'MTL',
 'MXN',
 'MYR',
 'NOK',
 'NZD',
 'PHP',
 'PLN',
 'ROL',
 'RON',
 'RUB',
 'SEK',
 'SGD',
 'SIT',
 'SKK',
 'THB',
 'TRL',
 'TRY',
 'USD',
 'ZAR'}

In [16]:
#The currency conversion calculations we will be performing should be applied to the second column.
eur_rate = cc.convert(1,"USD","EUR")
matrix[:,1]*eur_rate

array([ 81.37599,  97.09636, 140.55853, ..., 166.4509 , 160.90253,
        60.10727], dtype=float32)

In [19]:
# Multiply the dollar column by the inflation percentage (1.00 + inflation)
#Inflation rate for Euro is 8.5 (Feb 2023)
inflation = 0.085
matrix[:, 1] = matrix[:,1] * (1 + inflation)
matrix[:, 1]

# Round down the new currency column to 2 decimals
matrix[:,1] = np.around(matrix[:,1],2)

array([ 95.48   , 113.925  , 164.92001, ..., 195.3    , 188.79001,
        70.525  ], dtype=float32)

# Let's prepare our dataset for Streamlit App

## Choose our desired location we want to stay in Amsterdam

##### >Find coordinates in Google

In [21]:
# Desired location
latitude = 52.356498574
longitude = 4.819330056

## Listing All Listings

<center>
  <img src=https://images0.persgroep.net/rcs/vnd5KBhggcKV72YJjpLWH_-xljU/diocontent/131036963/_crop/34/170/1378/778/_fitwidth/763?appId=93a17a8fd81db0de025c8abd1cca1279&quality=0.8&desiredformat=webp width="500" align="center" />
</center>


Imagine Airbnb Amsterdam decided to deviate from Airbnb Global and provide a feature on their website that showed the best listings for you based on the locations you were planning to visit. Wouldn't it make sense to choose a place to stay in a location closest to where you're likely to go most often?

In [22]:
# Change altitude and longitude coordinates to meters

def from_location_to_airbnb_listing_in_meters(lat1: float, lon1: float, lat2: list, lon2: list):   
    R = 6371000  # Radius of Earth in meters
    phi_1 = np.radians(lat1) 
    phi_2 = np.radians(lat2) 

    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1) 

    a = (
        np.sin(delta_phi / 2.0) ** 2
        + np.cos(phi_1) * np.cos(phi_2) * np.sin(delta_lambda / 2.0) ** 2 
    )

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) 

    meters = R * c  # Output distance in meters

    return np.around(meters, 0)

In [23]:
# Run the converted NumPy method and check if it works
convertToMeters = np.vectorize(from_location_to_airbnb_listing_in_meters)
convertToMeters(latitude,longitude,matrix[:,2],matrix[:,3])

array([10861., 11991., 10779., ...,  9585.,  5108.,  9574.])

In [24]:
meters = from_location_to_airbnb_listing_in_meters( latitude, longitude, matrix[:, 2], matrix[:, 3])

# Add an axis to make concatenation possible
meters = meters.reshape(-1, 1)

# Append the distance in meters to the matrix
matrix = np.concatenate((matrix, meters), axis=1)

In [25]:
# Append a color to the matrix in order to color distance from our place
colors = np.zeros(meters.shape)
matrix = np.concatenate((matrix, colors), axis=1)

# Append our entry to the matrix
fav_entry = np.array([1, 0, 52.356498574, 4.819330056, 0, 1]).reshape(1, -1) # Change coordinates to your favorite location
matrix = np.concatenate((fav_entry, matrix), axis=0)

# Entries: airbnb_id, price, latitude, longitude,
# meters from favorite point, color
matrix[:5, :]

array([[       1.        ,        0.        ,       52.35649857,
               4.81933006,        0.        ,        1.        ],
       [23726706.        ,       95.48000336,       52.34915924,
               4.97878981,    10861.        ,        0.        ],
       [35815036.        ,      113.91999817,       52.42419052,
               4.95689011,    11991.        ,        0.        ],
       [31553120.        ,      164.91999817,       52.43236923,
               4.91821003,    10779.        ,        0.        ],
       [34745824.        ,       94.40000153,       52.2961998 ,
               5.01231003,    14729.        ,        0.        ]])

In [None]:
# Export the data to use in the primer for next week
np.savetxt("WK1_Airbnb_Amsterdam_listings_proj_solution.csv", matrix, delimiter=",")

### Download the Dataset to Our Local Machine!

Google Colab comes with its own Python packages, allowing us to quickly download generated files like so:

In [None]:
from google.colab import files

# Download the file locally
files.download('WK1_Airbnb_Amsterdam_listings_proj_solution.csv')