In [1]:
import pandas as pd
# Load the cleaned listings table; every later cell transforms this frame.
cleaned_listings_csv = '/Users/niharikarawat/Documents/VSCODE/airbnb_price_estimator/D-cleaned_airbnb.csv'
df = pd.read_csv(cleaned_listings_csv)

In [2]:
# Column dtypes and non-null counts first, then summary statistics of the
# numeric columns (printed as plain text, so wide tables wrap).
df.info()
numeric_summary = df.describe()
print(numeric_summary)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38100 entries, 0 to 38099
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   host_id            38100 non-null  int64  
 1   neighbourhood      38100 non-null  object 
 2   latitude           38100 non-null  float64
 3   longitude          38100 non-null  float64
 4   room_type          38100 non-null  object 
 5   price              38100 non-null  float64
 6   minimum_nights     38100 non-null  int64  
 7   number_of_reviews  38100 non-null  int64  
 8   availability_365   38100 non-null  int64  
 9   city               38100 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 2.9+ MB
            host_id      latitude     longitude         price  minimum_nights  \
count  3.810000e+04  38100.000000  38100.000000  38100.000000    38100.000000   
mean   2.034773e+08     34.652103   -119.076606    379.565223       12.897822   
std    2.0

In [3]:
# Distinct raw city labels, inspected before they are renamed.
df.city.unique()

array(['san_diego-ca', 'oakland-ca', 'pacific_grove-ca',
       'san_francisco-ca', 'santa_cruz_county-ca',
       'santa_clara_county-ca', 'los_angeles-ca', 'san_mateo_county-ca'],
      dtype=object)

In [4]:
# Map the raw slug-style city labels to human-readable names.
# Labels that are not listed here are left unchanged by `replace`.
city_display_names = {
    'san_diego-ca': 'san diego',
    'oakland-ca': 'oakland',
    'pacific_grove-ca': 'pacific grove',
    'san_francisco-ca': 'san francisco',
    'santa_cruz_county-ca': 'santa cruz county',
    'santa_clara_county-ca': 'santa clara county',
    'los_angeles-ca': 'los angeles',
    'san_mateo_county-ca': 'san mateo county',
}
df['city'] = df['city'].replace(city_display_names)

In [5]:
# Host-level features, broadcast back onto every listing of that host:
#   listings_per_host    - number of rows sharing the same host_id
#   avg_reviews_per_host - mean number_of_reviews across the host's rows
host_groups = df.groupby('host_id')
host_listing_counts = host_groups['host_id'].transform('count')
host_mean_reviews = host_groups['number_of_reviews'].transform('mean')

df['listings_per_host'] = host_listing_counts
df['avg_reviews_per_host'] = host_mean_reviews

# The raw identifier is dropped once the aggregates have been derived from it.
df = df.drop(columns='host_id')

In [6]:
# Trim leading/trailing whitespace so that otherwise-identical neighbourhood
# names fall into the same group. The vectorised `.str` accessor replaces a
# per-element Python lambda and leaves missing values as NaN instead of raising.
df['neighbourhood'] = df['neighbourhood'].str.strip()

# Number of listings per neighbourhood, rarest first.
neighbourhood_stats = df.groupby('neighbourhood')['neighbourhood'].agg('count').sort_values(ascending=True)
neighbourhood_stats

neighbourhood
Sausal Creek               1
Hasley Canyon              1
Harrington                 1
Tuxedo                     1
Santa Fe Springs           1
                        ... 
Pacific Beach            851
San Jose                 899
Venice                   942
Unincorporated Areas    1071
Mission Bay             1519
Name: neighbourhood, Length: 516, dtype: int64

In [7]:
# How many neighbourhoods have fewer than 20 listings?
neighbourhood_stats[neighbourhood_stats < 20].shape[0]

253

In [8]:
# Keep only the neighbourhoods with fewer than 20 listings; the result is still
# indexed by neighbourhood name, with the listing count as the value.
has_fewer_than_20 = neighbourhood_stats < 20
neighbourhood_stats_less_than_20 = neighbourhood_stats[has_fewer_than_20]
neighbourhood_stats_less_than_20

neighbourhood
Sausal Creek                  1
Hasley Canyon                 1
Harrington                    1
Tuxedo                        1
Santa Fe Springs              1
                             ..
Boyle Heights                18
Southeast Antelope Valley    18
Scripps Ranch                18
Encanto                      19
San Dimas                    19
Name: neighbourhood, Length: 253, dtype: int64

In [9]:
# Number of distinct neighbourhood labels before rare ones are merged.
# dropna=False counts a missing value as its own label, matching len(unique()).
df['neighbourhood'].nunique(dropna=False)

516

In [10]:
# Collapse every neighbourhood with fewer than 20 listings into a single
# 'other' bucket.
# `x in some_series` tests membership in the Series *index* (here: the
# neighbourhood names), not in its values. Spell that out with `.index` and do
# the replacement in one vectorised step instead of a per-row lambda.
rare_neighbourhood_names = neighbourhood_stats_less_than_20.index
is_rare_neighbourhood = df['neighbourhood'].isin(rare_neighbourhood_names)
df['neighbourhood'] = df['neighbourhood'].mask(is_rare_neighbourhood, 'other')

# Distinct labels remaining after bucketing.
len(df['neighbourhood'].unique())

264

In [11]:
# Persist the (city, neighbourhood) pair of every listing, one row per listing,
# without the index column.
city_neighbourhood_pairs = df[['city', 'neighbourhood']]
city_neighbourhood_pairs.to_csv(
    '/Users/niharikarawat/Documents/VSCODE/airbnb_price_estimator/F-city_neighbourhood_map.csv',
    index=False,
)

In [12]:
# Frequency encoding: attach to each listing the number of listings that share
# its neighbourhood label.
listings_per_neighbourhood = df['neighbourhood'].value_counts()
neighbourhood_freq = listings_per_neighbourhood.to_dict()
df['neighbourhood_freq'] = df['neighbourhood'].map(listings_per_neighbourhood)

In [13]:
# One-hot encode the neighbourhood label; every category keeps its own
# indicator column (no reference level is dropped).
neighbourhood_columns = ['neighbourhood']
df = pd.get_dummies(data=df, columns=neighbourhood_columns, drop_first=False)

In [14]:
from math import radians, sin, cos, sqrt, atan2

def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points
    on the Earth (specified in decimal degrees).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometers, using a spherical Earth of radius 6371 km.
        NaN inputs propagate to a NaN result.
    """
    R = 6371.0  # Radius of the Earth in kilometers

    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Difference in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # For (nearly) antipodal points rounding can push `a` a few ulps above 1,
    # which would make sqrt(1 - a) raise ValueError. Clamp only the upper bound
    # with a comparison so that a NaN `a` is left untouched and still propagates.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

# Reference points as (latitude, longitude) in decimal degrees. The key becomes
# part of the generated feature column name, so renaming a key renames a column.
landmarks = {
    'Silicon_Valley(PaloAlto)': (37.4419, -122.1430),
    'Stanford_University': (37.4275, -122.1697),
    'LAX(Los Angeles International)': (33.9416, -118.4085),
    'SFO(San Francisco International)': (37.6213, -122.3790),
    'SAN(San Diego International)': (32.7338, -117.1933),
    'Golden_Gate_Bridge': (37.8199, -122.4783),
    'Santa_Monica_Pier': (34.0100, -118.4962),
    'Venice_Beach': (33.9850, -118.4695),
    'Disneyland': (33.8121, -117.9190),
    'Hollywood_WalkOfFame': (34.1016, -118.3269),
    'Universal_Studios': (34.1381, -118.3534),
    'Griffith_Observatory': (34.1184, -118.3004),
    'Big_Sur': (36.3615, -121.8563),
    'Yosemite_Valley': (37.7456, -119.5936),
    'Lake_Tahoe': (39.0968, -120.0324)
}

# Add one 'dist_to_<landmark>(in kms)' column per landmark.
# The coordinate pairs are pulled out of the frame once and iterated directly:
# df.apply(..., axis=1) would build a full row Series for every listing and
# every landmark, which is far slower and yields the same values.
listing_coordinates = list(zip(df['latitude'], df['longitude']))

for name, (lat, lon) in landmarks.items():
    col_name = f'dist_to_{name}(in kms)'
    # A list is assigned positionally, one distance per row in row order.
    df[col_name] = [
        haversine(listing_lat, listing_lon, lat, lon)
        for listing_lat, listing_lon in listing_coordinates
    ]

In [15]:
# One-hot encode room_type; every category keeps its own indicator column.
room_type_columns = ['room_type']
df = pd.get_dummies(data=df, columns=room_type_columns, drop_first=False)

In [16]:
def group_min_nights(x):
    """
    Bucket a minimum-nights value into a stay-length label.

    Upper bounds are inclusive: values up to 3 map to 'upto_3_days', up to 10
    to 'upto_10_days', up to 90 to 'upto_3_months'; anything else (including
    values that compare False against every bound) maps to 'long_term_rental'.
    """
    inclusive_upper_bounds = (
        (3, 'upto_3_days'),
        (10, 'upto_10_days'),
        (90, 'upto_3_months'),
    )
    # Bounds are checked in ascending order, so the first match is the
    # tightest bucket.
    for upper_bound, label in inclusive_upper_bounds:
        if x <= upper_bound:
            return label
    return 'long_term_rental'

# Replace the raw minimum_nights count with one-hot stay-length buckets:
# label each row, expand the labels into indicator columns, then drop the
# original numeric column.
stay_length_labels = df['minimum_nights'].apply(group_min_nights)
df['min_nights_group'] = stay_length_labels
df = pd.get_dummies(data=df, columns=['min_nights_group'], drop_first=False)
df = df.drop(columns=['minimum_nights'])

In [17]:
# One-hot encode city into 'city_<name>' indicator columns; every category
# keeps its own column.
city_columns = ['city']
df = pd.get_dummies(data=df, columns=city_columns, prefix='city', drop_first=False)


In [18]:
# Keep only listings whose availability_365 is non-zero.
# Note: this runs after the host/neighbourhood aggregates were computed, so
# those features still reflect the rows removed here.
has_zero_availability = df['availability_365'] == 0
df = df[~has_zero_availability]


In [19]:
# Write the fully engineered feature table, without the index column.
featured_csv_path = '/Users/niharikarawat/Documents/VSCODE/airbnb_price_estimator/G-featured_airbnb.csv'
df.to_csv(featured_csv_path, index=False)