## 1. Importing Libraries

In [128]:
import os

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

## 2. Read the data

In [129]:
# Data directories, relative to the notebook's working directory.
# They are assembled from path components rather than written with a literal
# backslash, because "\" is only a path separator on Windows; on Linux/macOS
# the old value would be treated as one oddly named file.
RAW_DATA_DIR = os.path.join(os.pardir, 'data', 'raw')
PRO_DATA_DIR = os.path.join(os.pardir, 'data', 'processed')

In [130]:
def get_data(file_name):
    """Read a raw CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``RAW_DATA_DIR``, without the ``.csv``
        extension.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<RAW_DATA_DIR>/<file_name>.csv``.
    """
    csv_path = os.path.join(RAW_DATA_DIR, '{}.csv'.format(file_name))
    raw_frame = pd.read_csv(csv_path)
    return raw_frame

In [131]:
airbnb = get_data('AB_NYC_2019')
airbnb

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [132]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

## 3. Check Data Types

In [133]:
airbnb.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

## 4. Check for duplicates

In [134]:
airbnb.duplicated().sum()

np.int64(0)

## 5. Detailed Analysis

### `name`

In [135]:
airbnb.name

0                       Clean & quiet apt home by the park
1                                    Skylit Midtown Castle
2                      THE VILLAGE OF HARLEM....NEW YORK !
3                          Cozy Entire Floor of Brownstone
4         Entire Apt: Spacious Studio/Loft by central park
                               ...                        
48890      Charming one bedroom - newly renovated rowhouse
48891        Affordable room in Bushwick/East Williamsburg
48892              Sunny Studio at Historical Neighborhood
48893                 43rd St. Time Square-cozy single bed
48894    Trendy duplex in the very heart of Hell's Kitchen
Name: name, Length: 48895, dtype: object

In [136]:
# List every distinct run of characters found in the listing names that is
# not an ASCII letter, digit or space (one entry per run, NaN for missing names).
airbnb['name'].str.findall(r'[^A-Za-z0-9 ]+').explode().unique()

array(['&', nan, '....', '!', ':', '/', "'", '-', ',', '+', '#', '.', ';',
       '$', '*', '...', '!!!', '**', '!!', '(', ')', '—', '.,', '’',
       '.....', '.(', '~/', '•', '>', '@', '/++', ')!', '-⭐️', '--', '..',
       ':,', '⚡', '☆', '!-', '=', ')*', '"', '✿✿✿', '{', '}', '-à-', '_',
       '²', '++', ':\n', '–', '\n', '|', 'ä', '.........', 'é', ':)',
       '❤️', '******', '*****', '!*', '->', '<', './', '♥️', '?', ';)',
       '☆☆', '~', '%', '!!!!', '♥', '^', '.!', '***', '❤', '//', '★',
       '*,', '™', '.)', '!(', '“', '(&', '!)', '\n\n', '.-', '.✨', ').',
       '¡¡', '[', ']', '★★★★★-', '|❤', '-(', '..!', '::', '••', '!!••',
       'º', '!,', '!--', '☀️', 'â', '!~', '!❤️❤️❤️❤️❤️', '...!!',
       '(安靜公寓)', '!☆', '”\n', ':-)', '\n+', '....)', '皇后区“曼哈顿”-森林小丘别墅!',
       '别墅二楼主卧房', '别墅的地下室', '($', 'à', '-$-', '三楼阁楼', '别墅二楼卧房',
       'ﾆｭｰﾖｰｸの中心', '曼哈顿上西区林肯中心附近优质公寓短租!', '=)', '法拉盛中心温馨两房两厅公寓｡近一切｡',
       '":', 'Ü', 'ó', '!\n', '/♥', '❤️️', '*$$', '汤母小屋', '纽约之家(',
       '“

In [137]:
# Normalise listing names to plain lowercase ASCII words:
#   1. missing names become empty strings,
#   2. NFKD decomposition followed by an ASCII encode/decode round trip
#      discards every non-ASCII code point,
#   3. remaining punctuation is replaced by spaces and repeated spaces are
#      collapsed,
#   4. the result is trimmed and lower-cased.
(
    airbnb['name']
    .fillna('')
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii')
    .str.replace(r'[^0-9A-Za-z ]', ' ', regex=True)
    .str.replace(r' +', ' ', regex=True)
    .str.strip()
    .str.lower()
)

0                         clean quiet apt home by the park
1                                    skylit midtown castle
2                           the village of harlem new york
3                          cozy entire floor of brownstone
4          entire apt spacious studio loft by central park
                               ...                        
48890        charming one bedroom newly renovated rowhouse
48891        affordable room in bushwick east williamsburg
48892              sunny studio at historical neighborhood
48893                  43rd st time square cozy single bed
48894    trendy duplex in the very heart of hell s kitchen
Name: name, Length: 48895, dtype: object

### `host_id`

In [138]:
airbnb.host_id

0            2787
1            2845
2            4632
3            4869
4            7192
           ...   
48890     8232441
48891     6570630
48892    23492952
48893    30985759
48894    68119814
Name: host_id, Length: 48895, dtype: int64

In [139]:
airbnb.host_id.duplicated().sum()

np.int64(11438)

In [140]:
airbnb.groupby('host_id').transform('size')

0        6
1        2
2        1
3        1
4        1
        ..
48890    2
48891    2
48892    1
48893    6
48894    1
Length: 48895, dtype: int64

### `neighbourhood_group`

In [141]:
airbnb.neighbourhood_group

0         Brooklyn
1        Manhattan
2        Manhattan
3         Brooklyn
4        Manhattan
           ...    
48890     Brooklyn
48891     Brooklyn
48892    Manhattan
48893    Manhattan
48894    Manhattan
Name: neighbourhood_group, Length: 48895, dtype: object

In [142]:
airbnb.neighbourhood_group.unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [143]:
airbnb.neighbourhood_group.value_counts()

neighbourhood_group
Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64

In [144]:
(
    airbnb
    .neighbourhood_group
    .astype('category')
)

0         Brooklyn
1        Manhattan
2        Manhattan
3         Brooklyn
4        Manhattan
           ...    
48890     Brooklyn
48891     Brooklyn
48892    Manhattan
48893    Manhattan
48894    Manhattan
Name: neighbourhood_group, Length: 48895, dtype: category
Categories (5, object): ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island']

### `neighbourhood`

In [145]:
airbnb.neighbourhood

0                Kensington
1                   Midtown
2                    Harlem
3              Clinton Hill
4               East Harlem
                ...        
48890    Bedford-Stuyvesant
48891              Bushwick
48892                Harlem
48893        Hell's Kitchen
48894        Hell's Kitchen
Name: neighbourhood, Length: 48895, dtype: object

In [146]:
airbnb.neighbourhood.unique()

array(['Kensington', 'Midtown', 'Harlem', 'Clinton Hill', 'East Harlem',
       'Murray Hill', 'Bedford-Stuyvesant', "Hell's Kitchen",
       'Upper West Side', 'Chinatown', 'South Slope', 'West Village',
       'Williamsburg', 'Fort Greene', 'Chelsea', 'Crown Heights',
       'Park Slope', 'Windsor Terrace', 'Inwood', 'East Village',
       'Greenpoint', 'Bushwick', 'Flatbush', 'Lower East Side',
       'Prospect-Lefferts Gardens', 'Long Island City', 'Kips Bay',
       'SoHo', 'Upper East Side', 'Prospect Heights',
       'Washington Heights', 'Woodside', 'Brooklyn Heights',
       'Carroll Gardens', 'Gowanus', 'Flatlands', 'Cobble Hill',
       'Flushing', 'Boerum Hill', 'Sunnyside', 'DUMBO', 'St. George',
       'Highbridge', 'Financial District', 'Ridgewood',
       'Morningside Heights', 'Jamaica', 'Middle Village', 'NoHo',
       'Ditmars Steinway', 'Flatiron District', 'Roosevelt Island',
       'Greenwich Village', 'Little Italy', 'East Flatbush',
       'Tompkinsville', 'Asto

In [147]:
airbnb.neighbourhood.isna().sum()

np.int64(0)

In [148]:
(
    airbnb
    .neighbourhood
    .str.strip()
    .astype('category')
)

0                Kensington
1                   Midtown
2                    Harlem
3              Clinton Hill
4               East Harlem
                ...        
48890    Bedford-Stuyvesant
48891              Bushwick
48892                Harlem
48893        Hell's Kitchen
48894        Hell's Kitchen
Name: neighbourhood, Length: 48895, dtype: category
Categories (221, object): ['Allerton', 'Arden Heights', 'Arrochar', 'Arverne', ..., 'Woodhaven', 'Woodlawn', 'Woodrow', 'Woodside']

### `latitude` & `longitude`

In [149]:
airbnb.loc[:,['latitude', 'longitude']]

Unnamed: 0,latitude,longitude
0,40.64749,-73.97237
1,40.75362,-73.98377
2,40.80902,-73.94190
3,40.68514,-73.95976
4,40.79851,-73.94399
...,...,...
48890,40.67853,-73.94995
48891,40.70184,-73.93317
48892,40.81475,-73.94867
48893,40.75751,-73.99112


In [150]:
airbnb.loc[:,['latitude', 'longitude']].max()

latitude     40.91306
longitude   -73.71299
dtype: float64

In [151]:
airbnb.loc[:,['latitude', 'longitude']].min()

latitude     40.49979
longitude   -74.24442
dtype: float64

### `room_type`

In [152]:
airbnb.room_type

0           Private room
1        Entire home/apt
2           Private room
3        Entire home/apt
4        Entire home/apt
              ...       
48890       Private room
48891       Private room
48892    Entire home/apt
48893        Shared room
48894       Private room
Name: room_type, Length: 48895, dtype: object

In [153]:
airbnb.room_type.unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [154]:
(
    airbnb
    .room_type
    .astype('category')
)

0           Private room
1        Entire home/apt
2           Private room
3        Entire home/apt
4        Entire home/apt
              ...       
48890       Private room
48891       Private room
48892    Entire home/apt
48893        Shared room
48894       Private room
Name: room_type, Length: 48895, dtype: category
Categories (3, object): ['Entire home/apt', 'Private room', 'Shared room']

### `price`

In [155]:
airbnb.price

0        149
1        225
2        150
3         89
4         80
        ... 
48890     70
48891     40
48892    115
48893     55
48894     90
Name: price, Length: 48895, dtype: int64

In [156]:
airbnb.price.le(0).sum()

np.int64(11)

In [157]:
(
    airbnb
    .loc[lambda df_: df_.price.le(0)]
)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
23161,18750597,"Huge Brooklyn Brownstone Living, Close to it all.",8993084,Kimberly,Brooklyn,Bedford-Stuyvesant,40.69023,-73.95428,Private room,0,4,1,2018-01-06,0.05,4,28
25433,20333471,★Hostel Style Room | Ideal Traveling Buddies★,131697576,Anisha,Bronx,East Morrisania,40.83296,-73.88668,Private room,0,2,55,2019-06-24,2.56,4,127
25634,20523843,"MARTIAL LOFT 3: REDEMPTION (upstairs, 2nd room)",15787004,Martial Loft,Brooklyn,Bushwick,40.69467,-73.92433,Private room,0,2,16,2019-05-18,0.71,5,0
25753,20608117,"Sunny, Quiet Room in Greenpoint",1641537,Lauren,Brooklyn,Greenpoint,40.72462,-73.94072,Private room,0,2,12,2017-10-27,0.53,2,0
25778,20624541,Modern apartment in the heart of Williamsburg,10132166,Aymeric,Brooklyn,Williamsburg,40.70838,-73.94645,Entire home/apt,0,5,3,2018-01-02,0.15,1,73
25794,20639628,Spacious comfortable master bedroom with nice ...,86327101,Adeyemi,Brooklyn,Bedford-Stuyvesant,40.68173,-73.91342,Private room,0,1,93,2019-06-15,4.28,6,176
25795,20639792,Contemporary bedroom in brownstone with nice view,86327101,Adeyemi,Brooklyn,Bedford-Stuyvesant,40.68279,-73.9117,Private room,0,1,95,2019-06-21,4.37,6,232
25796,20639914,Cozy yet spacious private brownstone bedroom,86327101,Adeyemi,Brooklyn,Bedford-Stuyvesant,40.68258,-73.91284,Private room,0,1,95,2019-06-23,4.35,6,222
26259,20933849,the best you can find,13709292,Qiuchi,Manhattan,Murray Hill,40.75091,-73.97597,Entire home/apt,0,3,0,,,1,0
26841,21291569,Coliving in Brooklyn! Modern design / Shared room,101970559,Sergii,Brooklyn,Bushwick,40.69211,-73.9067,Shared room,0,30,2,2019-06-22,0.11,6,333


In [158]:
# Show the full record of the (first) listing with the highest price.
airbnb.loc[airbnb['price'].idxmax()]

id                                                            7003697
name                              Furnished room in Astoria apartment
host_id                                                      20582832
host_name                                                    Kathrine
neighbourhood_group                                            Queens
neighbourhood                                                 Astoria
latitude                                                      40.7681
longitude                                                   -73.91651
room_type                                                Private room
price                                                           10000
minimum_nights                                                    100
number_of_reviews                                                   2
last_review                                                2016-02-13
reviews_per_month                                                0.04
calculated_host_list

In [159]:
airbnb.price.max()

np.int64(10000)

In [160]:
(
    airbnb
    .loc[lambda df_: df_.price.ge(10000)]
)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
9151,7003697,Furnished room in Astoria apartment,20582832,Kathrine,Queens,Astoria,40.7681,-73.91651,Private room,10000,100,2,2016-02-13,0.04,1,0
17692,13894339,Luxury 1 bedroom apt. -stunning Manhattan views,5143901,Erin,Brooklyn,Greenpoint,40.7326,-73.95739,Entire home/apt,10000,5,5,2017-07-27,0.16,1,0
29238,22436899,1-BR Lincoln Center,72390391,Jelena,Manhattan,Upper West Side,40.77213,-73.98665,Entire home/apt,10000,30,0,,,1,83


In [161]:
# indexes to drop
airbnb.loc[lambda df_: df_.price.le(0)].index

Index([23161, 25433, 25634, 25753, 25778, 25794, 25795, 25796, 26259, 26841,
       26866],
      dtype='int64')

### `minimum_nights`

In [162]:
airbnb.minimum_nights

0         1
1         1
2         3
3         1
4        10
         ..
48890     2
48891     4
48892    10
48893     1
48894     7
Name: minimum_nights, Length: 48895, dtype: int64

In [163]:
airbnb.minimum_nights.le(0).sum()

np.int64(0)

In [164]:
airbnb.minimum_nights.unique()

array([   1,    3,   10,   45,    2,    5,    4,   90,    7,   14,   60,
         29,   30,  180,    9,   31,    6,   15,    8,   26,   28,  200,
         50,   17,   21,   11,   25,   13,   35,   27,   18,   20,   40,
         44,   65,   55,  120,  365,  122,   19,  240,   88,  115,  150,
        370,   16,   80,  181,  265,  300,   59,  185,  360,   56,   12,
         70,   39,   24,   32, 1000,  110,  270,   22,   75,  250,   62,
         23, 1250,  364,   74,  198,  100,  500,   43,   91,  480,   53,
         99,  160,   47,  999,  186,  366,   68,   93,   87,  183,  299,
        175,   98,  133,  354,   42,   33,   37,  225,  400,  105,  184,
        153,  134,  222,   58,  210,  275,  182,  114,   85,   36])

In [165]:
(
    airbnb
    .loc[lambda df_: df_.minimum_nights.gt(370)]
)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2854,1615764,,6676776,Peter,Manhattan,Battery Park City,40.71239,-74.0162,Entire home/apt,400,1000,0,,,1,362
5767,4204302,Prime W. Village location 1 bdrm,17550546,Genevieve,Manhattan,Greenwich Village,40.73293,-73.99782,Entire home/apt,180,1250,2,2014-11-09,0.03,1,365
7355,5431845,Beautiful Fully Furnished 1 bed/bth,3680008,Aliya,Queens,Long Island City,40.75104,-73.93863,Entire home/apt,134,500,30,2018-06-24,0.57,1,90
8014,6169897,Wonderful Large 1 bedroom,10720264,John,Manhattan,Harlem,40.82135,-73.95521,Entire home/apt,75,500,0,,,1,362
10829,8341919,Brand New Luxury Apt Lease Takeover,43945071,Shining,Queens,Long Island City,40.74654,-73.95778,Entire home/apt,199,480,0,,,1,365
11193,8668115,Zen Room in Crown Heights Brooklyn,8996336,Laura,Brooklyn,Crown Heights,40.67255,-73.94914,Private room,50,500,10,2016-09-22,0.22,1,365
13404,10053943,Historic Designer 2 Bed. Apartment,2697686,Glenn H.,Manhattan,Harlem,40.82915,-73.94034,Entire home/apt,99,999,2,2018-01-04,0.07,1,42
14285,11096888,Peaceful apartment close to F/G,2228137,Amanda,Brooklyn,Kensington,40.64779,-73.97956,Private room,45,500,0,,,1,358
26341,20990053,Beautiful place in Brooklyn! #2,151084261,Angie,Brooklyn,Williamsburg,40.71772,-73.95059,Private room,79,999,24,2018-06-28,1.12,6,249
34487,27362309,Not available,14621589,Sol,Brooklyn,Bedford-Stuyvesant,40.69974,-73.94658,Private room,50,400,0,,,1,90


### `number_of_reviews`

In [166]:
airbnb.number_of_reviews

0          9
1         45
2          0
3        270
4          9
        ... 
48890      0
48891      0
48892      0
48893      0
48894      0
Name: number_of_reviews, Length: 48895, dtype: int64

In [167]:
airbnb.number_of_reviews.value_counts()

number_of_reviews
0      10052
1       5244
2       3465
3       2520
4       1994
       ...  
372        1
299        1
436        1
310        1
341        1
Name: count, Length: 394, dtype: int64

### `last_review`

In [168]:
airbnb.last_review

0        2018-10-19
1        2019-05-21
2               NaN
3        2019-07-05
4        2018-11-19
            ...    
48890           NaN
48891           NaN
48892           NaN
48893           NaN
48894           NaN
Name: last_review, Length: 48895, dtype: object

In [169]:
pd.to_datetime(airbnb.last_review)

0       2018-10-19
1       2019-05-21
2              NaT
3       2019-07-05
4       2018-11-19
           ...    
48890          NaT
48891          NaT
48892          NaT
48893          NaT
48894          NaT
Name: last_review, Length: 48895, dtype: datetime64[ns]

### `reviews_per_month`

In [170]:
airbnb.reviews_per_month

0        0.21
1        0.38
2         NaN
3        4.64
4        0.10
         ... 
48890     NaN
48891     NaN
48892     NaN
48893     NaN
48894     NaN
Name: reviews_per_month, Length: 48895, dtype: float64

In [171]:
airbnb.reviews_per_month.min()

np.float64(0.01)

In [172]:
airbnb.reviews_per_month.max()

np.float64(58.5)

### `calculated_host_listings_count`

In [173]:
airbnb.calculated_host_listings_count

0        6
1        2
2        1
3        1
4        1
        ..
48890    2
48891    2
48892    1
48893    6
48894    1
Name: calculated_host_listings_count, Length: 48895, dtype: int64

In [174]:
airbnb.calculated_host_listings_count.max()

np.int64(327)

In [175]:
airbnb.calculated_host_listings_count.min()

np.int64(1)

In [176]:
# (
#     airbnb
#     .assign(
#         host_type=lambda df_: np.select(
#             [
#                 df_.calculated_host_listings_count == 1,
#                 df_.calculated_host_listings_count <= 3,
#                 df_.calculated_host_listings_count <= 10,
#             ],
#             [
#                 'new_host',
#                 'casual_host',
#                 'experienced_host'
#             ],
#             default='professional_host'
#         )
#     )
# )

### `availability_365`

In [177]:
airbnb.availability_365

0        365
1        355
2        365
3        194
4          0
        ... 
48890      9
48891     36
48892     27
48893      2
48894     23
Name: availability_365, Length: 48895, dtype: int64

In [178]:
airbnb.availability_365.value_counts()

availability_365
0      17533
365     1295
364      491
1        408
89       361
       ...  
195       26
183       24
196       24
181       23
202       20
Name: count, Length: 366, dtype: int64

In [179]:
airbnb.availability_365.unique()

array([365, 355, 194,   0, 129, 220, 188,   6,  39, 314, 333,  46, 321,
        12,  21, 249, 347, 364, 304, 233,  85,  75, 311,  67, 255, 284,
       359, 269, 340,  22,  96, 345, 273, 309,  95, 215, 265, 192, 251,
       302, 140, 234, 257,  30, 301, 294, 320, 154, 263, 180, 231, 297,
       292, 191,  72, 362, 336, 116,  88, 224, 322, 324, 132, 295, 238,
       209, 328,  38,   7, 272,  26, 288, 317, 207, 185, 158,   9, 198,
       219, 342, 312, 243, 152, 137, 222, 346, 208, 279, 250, 164, 298,
       260, 107, 199, 299,  20, 318, 216, 245, 189, 307, 310, 213, 278,
        16, 178, 275, 163,  34, 280,   1, 170, 214, 248, 262, 339,  10,
       290, 230,  53, 126,   3,  37, 353, 177, 246, 225,  18, 343, 326,
       162, 240, 363, 247, 323, 125,  91, 286,  60,  58, 351, 201, 232,
       258, 341, 244, 329, 253, 348,   2,  56,  68, 360,  76,  15, 226,
       349,  11, 316, 281, 287,  14,  86, 261, 331,  51, 254, 103,  42,
       325,  35, 203,   5, 276, 102,  71,  78,   8, 182,  79,  4

## 6. Cleaning Operations

In [180]:
airbnb.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [181]:
# Row labels to exclude from the cleaned data, concatenated in this order:
#   1. listings whose name marks them as something other than a stay
#      (tours, cruises, event listings, storage/parking only),
#   2. listings with a non-positive price,
#   3. listings requiring a minimum stay of more than 35 nights.
index_to_drop = (
    airbnb
    .loc[
        airbnb['name'].str.contains(
            r'\b(?:yacht\s+tour|yacht\s+cruise|boat\s+tour|events?\s+listing|storage\s+only|parking\s+only)\b',
            case=False, na=False
        )
    ]
    .index
    .append([
        airbnb.loc[airbnb['price'].le(0)].index,
        airbnb.loc[airbnb['minimum_nights'].gt(35)].index,
    ])
)
index_to_drop

Index([21176, 47130, 48282, 48310, 23161, 25433, 25634, 25753, 25778, 25794,
       ...
       47957, 47973, 48043, 48145, 48205, 48312, 48325, 48368, 48446, 48784],
      dtype='int64', length=536)

In [182]:
def _strip_object_columns(frame):
    """Return a copy of ``frame`` with whitespace stripped from every object column."""
    text_cols = frame.select_dtypes(include='O').columns
    return frame.assign(**{col: frame[col].str.strip() for col in text_cols})


def clean_data(df, rows_to_drop=None):
    """Apply the cleaning steps to a raw listings frame.

    Steps, in order:
      1. drop the rows labelled in ``rows_to_drop``,
      2. drop the identifier/free-text columns
         (``id``, ``host_id``, ``name``, ``host_name``),
      3. strip surrounding whitespace from the remaining object columns,
      4. convert ``neighbourhood_group``, ``room_type`` and ``neighbourhood``
         to ``category``,
      5. parse ``last_review`` (``YYYY-MM-DD``) into datetimes.

    Every step works on the output of the previous one. The earlier version
    built the categorical columns from the original ``df``, which silently
    replaced the stripped values with the unstripped ones and registered
    categories that only occurred in rows already removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings data; it is not modified.
    rows_to_drop : index-like, optional
        Row labels to remove. When omitted, the notebook-level
        ``index_to_drop`` is used, which keeps ``clean_data(df)`` working
        exactly as it is called elsewhere in the notebook.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If a label in ``rows_to_drop`` or one of the expected columns is
        missing from ``df``.
    """
    if rows_to_drop is None:
        rows_to_drop = index_to_drop
    categorical_cols = ['neighbourhood_group', 'room_type', 'neighbourhood']
    return (
        df
        .drop(index=rows_to_drop)
        .drop(columns=['id', 'host_id', 'name', 'host_name'])
        .pipe(_strip_object_columns)
        # Cast after stripping so the categories are the cleaned labels.
        .astype({col: 'category' for col in categorical_cols})
        .assign(
            last_review=lambda df_: pd.to_datetime(df_.last_review, format='%Y-%m-%d')
        )
    )
clean_data(airbnb)

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,NaT,,1,365
3,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
48890,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,NaT,,2,9
48891,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,NaT,,2,36
48892,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,NaT,,1,27
48893,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,NaT,,6,2


In [183]:
airbnb_cleaned = clean_data(airbnb)
airbnb_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48359 entries, 0 to 48894
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   neighbourhood_group             48359 non-null  category      
 1   neighbourhood                   48359 non-null  category      
 2   latitude                        48359 non-null  float64       
 3   longitude                       48359 non-null  float64       
 4   room_type                       48359 non-null  category      
 5   price                           48359 non-null  int64         
 6   minimum_nights                  48359 non-null  int64         
 7   number_of_reviews               48359 non-null  int64         
 8   last_review                     38548 non-null  datetime64[ns]
 9   reviews_per_month               38548 non-null  float64       
 10  calculated_host_listings_count  48359 non-null  int64         
 11  availab

## 7. Split the Data

In [184]:
# Separate the target (price) from the predictor columns.
y = airbnb_cleaned['price'].copy()
X = airbnb_cleaned.drop(columns=['price'])

In [185]:
# Two-stage split: first hold out 20% of all rows as the test set, then take
# 20% of the remaining rows as the validation set. Both splits use the same
# fixed seed so the subsets are reproducible.
X_, X_test, y_, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)

# One line per subset: feature-matrix shape followed by target shape.
print(
    f'{X_train.shape} {y_train.shape}',
    f'{X_val.shape} {y_val.shape}',
    f'{X_test.shape} {y_test.shape}',
    sep='\n',
)

(30949, 11) (30949,)
(7738, 11) (7738,)
(9672, 11) (9672,)


## 8. Export the Subsets

In [186]:
def export_data(X, y, name):
    """Write a feature/target pair to ``<PRO_DATA_DIR>/<name>.csv``.

    The target is joined to the features on the row index and the combined
    frame is saved without the index column. The file is then read back so
    the caller can eyeball what actually landed on disk.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target values, sharing the index of ``X``.
    name : str
        Output file name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the file that was written.
    """
    # A fresh checkout may not have the processed-data folder yet; create it
    # instead of failing on the write.
    os.makedirs(PRO_DATA_DIR, exist_ok=True)

    file_name = f'{name}.csv'
    file_path = os.path.join(PRO_DATA_DIR, file_name)

    X.join(y).to_csv(file_path, index=False)

    return pd.read_csv(file_path).head()

In [187]:
export_data(X_train, y_train, "train")

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,Manhattan,Washington Heights,40.84996,-73.93937,Private room,5,1,2016-05-28,0.03,1,0,50
1,Manhattan,Chinatown,40.7161,-73.99828,Entire home/apt,2,43,2019-01-03,1.32,1,0,140
2,Queens,Elmhurst,40.73088,-73.87488,Entire home/apt,2,47,2019-06-27,4.18,1,315,125
3,Manhattan,Harlem,40.82396,-73.94388,Entire home/apt,5,81,2019-06-09,1.48,3,303,450
4,Queens,Queens Village,40.72085,-73.7496,Entire home/apt,2,33,2019-06-30,3.6,1,282,68


In [188]:
export_data(X_val, y_val, "val")

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,Manhattan,Chelsea,40.7417,-73.9978,Entire home/apt,3,31,2016-12-05,0.63,1,0,135
1,Queens,Astoria,40.76728,-73.90781,Entire home/apt,1,5,2019-07-07,5.0,1,65,95
2,Manhattan,Harlem,40.82349,-73.94053,Private room,2,10,2019-06-28,0.32,1,8,80
3,Brooklyn,Clinton Hill,40.69011,-73.96656,Private room,2,0,,,1,0,100
4,Manhattan,Upper West Side,40.80137,-73.9615,Private room,3,1,2016-07-06,0.03,1,0,80


In [189]:
export_data(X_test, y_test, "test")

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,Manhattan,Stuyvesant Town,40.73193,-73.97971,Private room,1,0,,,2,0,115
1,Manhattan,Upper East Side,40.78117,-73.95688,Private room,1,3,2019-03-15,0.28,1,90,105
2,Manhattan,Little Italy,40.71889,-73.99686,Entire home/apt,2,2,2019-03-10,0.19,2,3,400
3,Brooklyn,Bedford-Stuyvesant,40.68037,-73.93821,Private room,3,4,2018-07-29,0.28,1,0,44
4,Queens,Long Island City,40.75571,-73.9338,Private room,1,54,2019-06-22,2.54,9,265,55


In [190]:
export_data(X, y, 'cleaned_full')

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,1,9,2018-10-19,0.21,6,365,149
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,1,45,2019-05-21,0.38,2,355,225
2,Manhattan,Harlem,40.80902,-73.9419,Private room,3,0,,,1,365,150
3,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,1,270,2019-07-05,4.64,1,194,89
4,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,10,9,2018-11-19,0.1,1,0,80
