In [7]:
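# Purpose: combine the booking and Airbnb listing data into shared CSV files (combined rows, monthly mean prices, and per-region mean prices).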
# Install the pandas library using the pip command
%pip install pandas

# Import the pandas library with the alias 'pd'
import pandas as pd

# Booking exports to combine; each file is read into its own DataFrame
booking_paths = [
    'data/booking/bookingData-2024-06.csv',
    'data/booking/bookingData-2024-09.csv',
    'data/booking/bookingData-2024-12.csv',
    'data/booking/bookingData-2024-12-2.csv',
    'data/booking/bookingData-2024-1.csv',
    'data/booking/bookingDataServer.csv',
]

# Concatenate the individual DataFrames into a single DataFrame
booking = pd.concat([pd.read_csv(path) for path in booking_paths])

# Clean up the 'price' column: strip the euro sign and spaces, use '.' as the
# decimal separator, then convert to float. regex=False keeps every pattern a
# literal string regardless of the installed pandas version.
booking['price'] = (
    booking['price']
    .str.replace('€', '', regex=False)
    .str.replace(' ', '', regex=False)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Remove duplicate rows BEFORE deriving 'booking_data'. Previously the
# de-duplication ran after the column selection, so it never reached the
# DataFrame that is written to disk.
booking = booking.drop_duplicates()

# Select the columns used downstream and rename them
booking_data = booking[['month', 'year', 'lat', 'lng', 'price', 'room type']].rename(
    columns={'lat': 'latitude', 'lng': 'longitude', 'room type': 'room_type'}
)

# Save the cleaned and processed data to a new CSV file, without the index
booking_data.to_csv('data/booking/bookingData.csv', index=False)

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Load the Airbnb export into a DataFrame
airbnb = pd.read_csv('airbnb_data.csv')

# Parse both stay-boundary columns as datetimes
for date_column in ("check_in", "check_out"):
    airbnb[date_column] = pd.to_datetime(airbnb[date_column])

# Derive the month and year of each row from its check-in date
check_in_parts = airbnb['check_in'].dt
airbnb['month'] = check_in_parts.month
airbnb['year'] = check_in_parts.year

# Keep only the columns needed for the analysis
analysis_columns = ['month', 'year', 'latitude', 'longitude', 'price', 'room_type']
airbnb_data = airbnb[analysis_columns]


In [9]:
# Stack the rows of 'booking_data' and 'airbnb_data' into one DataFrame
source_frames = [booking_data, airbnb_data]
data = pd.concat(source_frames)

# Write the combined rows to 'data.csv', leaving the index out
data.to_csv('data.csv', index=False)


In [10]:
# Reload the combined rows from the CSV file
data = pd.read_csv('data.csv')

# Mean 'price' for each 'month'
monthly_mean_price = data.groupby('month')['price'].mean()

# Round to two decimal places and turn 'month' back into a regular column
data_graph = monthly_mean_price.round(2).reset_index()

# Write the per-month averages to 'data_graph.csv', leaving the index out
data_graph.to_csv('data_graph.csv', index=False)


In [11]:
# Install the geopy library using the pip command
%pip install geopy

# Import the Nominatim class from the geopy.geocoders module
from geopy.geocoders import Nominatim

# Geocoder shared by all get_district calls; created lazily on first use so
# that calling the function many times does not build a new client each time.
_default_geolocator = None


def get_district(latitude, longitude, geolocator=None):
    """Return the district name for a coordinate pair via reverse geocoding.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the point to look up.
    geolocator : optional
        Object exposing ``reverse((lat, lng), exactly_one=True)``. When
        omitted, a shared ``Nominatim`` client is created on first use and
        reused on later calls.

    Returns
    -------
    str or None
        The address's ``'city_district'`` if present, otherwise its
        ``'suburb'``; ``None`` when the address has neither key; the string
        ``"District not found"`` when the geocoder returns no location.
    """
    global _default_geolocator

    if geolocator is None:
        if _default_geolocator is None:
            # Custom user agent and a very long timeout, as in the original
            # per-call setup (geopy documents `timeout` in seconds).
            _default_geolocator = Nominatim(user_agent="map_app_airbnb", timeout=720000)
        geolocator = _default_geolocator

    # Get location information using reverse geocoding
    location = geolocator.reverse((latitude, longitude), exactly_one=True)

    if not location:
        # Return a message indicating that the district was not found
        return "District not found"

    # A payload without an 'address' entry is treated like an empty address
    # instead of raising KeyError.
    address = location.raw.get('address', {})

    # Prefer the city district; fall back to the suburb when it is missing
    district = address.get('city_district')
    if district is None:
        district = address.get('suburb')

    return district

# Example usage
# Coordinates of the sample point to look up. These are the values that were
# previously hard-coded in the call below; the variables used to hold a
# different, unused pair.
latitude = 40.3789571457818
longitude = -3.6703623401582193

# Call the function with the example coordinates and print the result
print(get_district(latitude, longitude))

Note: you may need to restart the kernel to use updated packages.
Puente de Vallecas


In [12]:
# Group the data by 'latitude', 'longitude', and 'month', and calculate the mean of the 'price' column for each group
data_map = data.groupby(['latitude', 'longitude', 'month'])['price'].mean().reset_index()

# The same coordinate pair appears once per month it has data for, so look up
# each distinct pair only once instead of issuing one reverse-geocoding
# request per row.
unique_coordinates = data_map[['latitude', 'longitude']].drop_duplicates()
region_by_coordinate = {
    (lat, lng): get_district(lat, lng)
    for lat, lng in zip(unique_coordinates['latitude'], unique_coordinates['longitude'])
}

# Add the 'region' column by mapping every row's coordinates to its looked-up district
data_map['region'] = [
    region_by_coordinate[(lat, lng)]
    for lat, lng in zip(data_map['latitude'], data_map['longitude'])
]

# Save the aggregated and region-mapped data to a new CSV file named 'data_map.csv', without including the index
data_map.to_csv('data_map.csv', index=False)


In [13]:
# Mean 'price' for every ('region', 'month') pair in 'data_map'
region_month_mean_price = data_map.groupby(['region', 'month'])['price'].mean()

# Flatten the group keys back into columns and write the result without the index
region_month_mean_price.reset_index().to_csv('data_map_agg.csv', index=False)