### First step: data exploration
As the initial step of data exploration, only the first two rows of the file (the header and the first record) are loaded to get an overview of the data's structure and format. This approach deliberately uses a stream-based reader to avoid loading the entire dataset into memory, which keeps processing time and memory usage low.

In [5]:
import csv
from itertools import islice

# Raw string: the backslashes of the Windows path must not be interpreted as
# escape sequences.
csv_path = r'C:\Users\yamid\Documents\Persönliche Übungen\archive\AB_NYC_2019.csv'

# newline='' lets the csv module do its own line-ending handling, so quoted
# fields that contain line breaks are parsed as a single record. The explicit
# encoding removes the dependency on the platform's default text encoding.
with open(csv_path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)  # yields each CSV record as a list of strings

    # Pull only the header and the first record from the stream; the rest of
    # the file is never read.
    data = list(islice(reader, 2))


for row in data:
    print(row)

['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']
['2539', 'Clean & quiet apt home by the park', '2787', 'John', 'Brooklyn', 'Kensington', '40.64749', '-73.97237', 'Private room', '149', '1', '9', '2018-10-19', '0.21', '6', '365']


Next, I want to examine the data, so I am using the pandas library. Pandas is generally slower than NumPy; however, it is particularly helpful for datasets that mix strings and numbers and are strongly heterogeneous in structure.

In [20]:
import pandas as pd
import numpy as np
import importlib
import Airbnb_tools

# Absolute location of the listings CSV. The backslashes are doubled because
# this is a regular (non-raw) string literal.
file_path = 'C:\\Users\\yamid\\Documents\\Persönliche Übungen\\archive\\AB_NYC_2019.csv'
# Re-execute the Airbnb_tools module so that edits made to it since the first
# import are picked up without restarting the interpreter.
importlib.reload(Airbnb_tools)

def main():
    """Load the listings CSV and print the price and district summaries."""
    listings = pd.read_csv(file_path)  # pandas infers a dtype for each column

    # Report the price statistics first, then the district frequencies.
    prices(listings['price'])
    city_district(listings['neighbourhood_group'])



def prices(prices_series):
    """Print the minimum, maximum, mean and median of the given prices.

    Args:
        prices_series: pandas Series holding the price column.

    Returns:
        None. The four statistics are written to stdout, followed by a
        spacer line.
    """
    values = prices_series.to_numpy()  # pandas Series -> NumPy array

    # Local names deliberately avoid `min` and `max` so the Python builtins
    # are not shadowed inside this function.
    lowest = np.min(values)
    highest = np.max(values)
    average = np.mean(values)
    # The median is computed by the project's own helper, not by NumPy.
    middle = Airbnb_tools.median_R(values)

    print(f"Min: {lowest}$")
    print(f"Max: {highest}$")
    print(f"Mean: {average:.2f}$")
    print(f"Median: {middle}$")
    print(' ')


def city_district(district_series):
    """Print the most and the least common district with their counts.

    Args:
        district_series: the district column handed over by the caller.

    Returns:
        None. Two report lines and a spacer line are written to stdout.
    """
    # frequency_R yields four values; presumably (most frequent value, its
    # count, least frequent value, its count) -- confirm against Airbnb_tools.
    summary = Airbnb_tools.frequency_R(district_series)
    top_name, top_count, bottom_name, bottom_count = summary

    print(f"Most common district: {top_name}, {top_count}")
    print(f"Least common district: {bottom_name}, {bottom_count}")
    print(' ')


# Run the analysis only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Min: 0$
Max: 10000$
Mean: 152.72$
Median: 106.0$
 
Most common district: Manhattan, 21661
Least common district: Staten Island, 373
 
