# Import Airbnb data

## Import necessary packages, modules and credentials

In [11]:
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
import requests
from sqlalchemy import create_engine

from credentials import host, port, user, password, dbname_raw, table_name, Google_API_KEY

## Ping Database to test connection

In [12]:
def ping_db(host, port, dbname, user, password):
    """
    Test connection to the PostgreSQL database.

    Opens a connection with psycopg2, runs ``SELECT 1`` and closes the
    cursor and the connection again, whether or not the query succeeded.

    Args:
        host: Database server host name or address.
        port: Database server port.
        dbname: Name of the database to connect to.
        user: Login user name.
        password: Login password.

    Returns:
        True if the connection could be opened and the test query ran,
        False otherwise. Errors are printed, never raised.
    """
    try:
        # Create a connection
        conn = psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host,
            port=port
        )
        try:
            # Create a cursor object and execute a simple query
            cur = conn.cursor()
            try:
                cur.execute("SELECT 1")
            finally:
                cur.close()
        finally:
            # Always release the connection, even if the query failed
            conn.close()
    except Exception as e:
        print(f"Error connecting to the database: {e}")
        return False

    print("Successfully connected to the database.")
    return True

# Check connectivity to the raw database using the values imported from credentials.
ping_db(host, port, dbname_raw, user, password)

Successfully connected to the database.


## Load the data from the csv file into a dataframe

In [13]:
# Path to the Airbnb listings CSV. It is relative, so it is resolved against the
# working directory the notebook kernel was started in.
csv_file = '../cbs-dmv-final-assignment/data/AirBnbListings.csv'

# Load the CSV file into a DataFrame. Later cells read its 'latitude' and
# 'longitude' columns and write the frame to PostgreSQL.
df = pd.read_csv(csv_file)

## Define the relevant functions for the neighborhood calculations based on the Google API

In [14]:
# Legacy call counter kept so the name stays defined; get_neighborhood does not update it.
run_count = 0

def get_neighborhood(latitude, longitude, timeout=10):
    """
    Reverse-geocode a coordinate with the Google Geocoding API and return an area name.

    Only the first result of the response is inspected. Its address components
    are searched by type in priority order: 'neighborhood' first, then
    'sublocality', then 'locality'. The 'long_name' of the first hit is returned.

    Args:
        latitude: Latitude of the point.
        longitude: Longitude of the point.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The matched component's long name, or 'Unknown_Neighborhood' when the
        request fails, the API returns no results, or no component of the
        wanted types is present. Errors are printed, never raised.
    """
    unknown = 'Unknown_Neighborhood'
    try:
        # Let requests build and encode the query string instead of an f-string URL.
        response = requests.get(
            "https://maps.googleapis.com/maps/api/geocode/json",
            params={'latlng': f"{latitude},{longitude}", 'key': Google_API_KEY},
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Error fetching data for {latitude}, {longitude}: {response.status_code}")
            return unknown

        payload = response.json()
        results = payload.get('results')
        if not results:
            # The API reports quota/key problems in the JSON 'status' field,
            # so surface anything that is not a plain "nothing found".
            status = payload.get('status')
            if status not in (None, 'OK', 'ZERO_RESULTS'):
                print(f"Error fetching data for {latitude}, {longitude}: {status}")
            return unknown

        address_components = results[0]['address_components']
        # Honour the priority order: only fall back to a coarser type when the
        # finer one is absent from every component.
        for wanted_type in ('neighborhood', 'sublocality', 'locality'):
            for component in address_components:
                if wanted_type in component['types']:
                    return component['long_name']
    except Exception as e:
        # Exception text from the HTTP layer can contain the request URL,
        # including the key; keep it out of the notebook output.
        message = str(e)
        if Google_API_KEY:
            message = message.replace(str(Google_API_KEY), '<redacted>')
        print(f"Error fetching data for {latitude}, {longitude}: {message}")
    return unknown

# Function to categorize neighborhoods
def categorize_neighborhood(neighborhood):
    """
    Map a neighborhood name onto its canonical district label.

    Known spellings of the central Copenhagen districts and Frederiksberg are
    collapsed into one label each. The bare city name, missing values and the
    'Not Found' marker become 'Unknown_Neighborhood'. Any other value is
    returned unchanged.
    """
    unknown_label = 'Unknown_Neighborhood'

    # (accepted spellings, canonical label), checked top to bottom
    alias_table = (
        (['København C', 'Indre By', 'København K'], 'København K'),
        (['København V', 'Vesterbro'], 'København V'),
        (['København N', 'Nørrebro'], 'København N'),
        (['København S', 'København s', 'Amager', 'Amagerbro', 'Amager Øst', 'Amager Vest'], 'København S'),
        (['København Ø', 'Østerbro'], 'København Ø'),
        (['Frederiksberg C', 'Frederiksberg'], 'Frederiksberg'),
        (['København', 'Copenhagen'], unknown_label),
    )
    for spellings, canonical in alias_table:
        if neighborhood in spellings:
            return canonical

    # Missing values and the explicit "Not Found" marker are unknown as well
    if pd.isna(neighborhood) or neighborhood == 'Not Found':
        return unknown_label

    return neighborhood

## Apply the functions onto the dataframe and check the output

In [15]:
# Please only run this cell if really necessary. Otherwise it will use up all the Google API credits.
# One Geocoding request is issued for every row of df.
print("Fetching neighborhood data...")

# Keep the raw API answers in their own variable so the categorisation below can
# be re-run (e.g. after editing categorize_neighborhood) without paying for the
# lookups again. Iterating over the two columns avoids building a Series per row
# and also works when df has no rows.
api_neighborhood_raw = [
    get_neighborhood(lat, lon)
    for lat, lon in zip(df['latitude'], df['longitude'])
]
df['API_Neighborhood'] = [categorize_neighborhood(name) for name in api_neighborhood_raw]
df['API_Neighborhood'].value_counts()

Fetching neighborhood data...


API_Neighborhood
København N             1958
København S             1955
København V             1691
København K             1615
København Ø             1462
Frederiksberg           1253
København NV             493
Valby                    473
København SV             420
Vanløse                  259
Brønshøj-Husum           221
Unknown_Neighborhood     135
Hellerup                 120
Kastrup                  103
Hvidovre                  83
Søborg                    79
Rødovre                   67
Bispebjerg                38
Dyssegård                 30
Herlev                    24
Gentofte                  16
Name: count, dtype: int64

## Write the dataframe to the PostgreSQL database

In [16]:
engine = None
try:
    # Percent-encode the credentials: characters such as '@', ':' or '/' in a
    # password would otherwise be parsed as part of the URL structure.
    db_url = (
        f'postgresql://{quote_plus(str(user))}:{quote_plus(str(password))}'
        f'@{host}:{port}/{dbname_raw}'
    )

    # Create a SQLAlchemy engine
    engine = create_engine(db_url)

    # Write the DataFrame to the database table; an existing table of the same
    # name is replaced.
    df.to_sql(table_name, engine, if_exists='replace', index=False)

    # Confirm the data has been written to the raw database
    print("Data has been written to the raw database.")
except Exception as e:
    print(f"Error writing data to the database: {e}")
finally:
    # Release the pooled connections held by the engine
    if engine is not None:
        engine.dispose()

Data has been written to the raw database.
