# EDA

## Import data and set up Weights and Biases to track code

In [1]:
import wandb
import pandas as pd
import os

# Name this notebook for W&B before the run starts
# (presumably what lets save_code=True attach the notebook source — see W&B docs).
os.environ["WANDB_NOTEBOOK_NAME"] = "EDA.ipynb"

# Start a tracked run in the "eda" group of the nyc_airbnb project.
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)

# Resolve the latest version of the sample artifact to a local file and load it.
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)

[34m[1mwandb[0m: Currently logged in as: [33mseraphdev[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8


## Attempt to use `pandas_profiling`

In [3]:
# import pandas_profiling

# profile = pandas_profiling.ProfileReport(df)
# profile.to_widgets()

## Apparently `pandas_profiling` has been deprecated (the project was renamed to `ydata-profiling`), so let's do this the old-fashioned way

In [4]:
# (rows, columns) of the raw sample.
df.shape

(20000, 16)

In [5]:
# Summary statistics for the numeric columns of the raw sample.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,15877.0,20000.0,20000.0
mean,18923800.0,67460340.0,40.728455,-73.952125,153.26905,6.9921,23.2741,1.377446,6.95545,112.9012
std,11012230.0,78579360.0,0.054755,0.046559,243.325609,21.645449,44.927793,1.683006,32.433831,131.762226
min,2539.0,2571.0,40.50873,-74.23914,0.0,1.0,0.0,0.01,1.0,0.0
25%,9393540.0,7853718.0,40.68942,-73.98303,69.0,1.0,1.0,0.19,1.0,0.0
50%,19521170.0,31114310.0,40.72273,-73.95564,105.0,2.0,5.0,0.72,1.0,44.0
75%,29129360.0,106842600.0,40.76299,-73.93638,175.0,5.0,23.0,2.01,2.0,229.0
max,36485610.0,274273300.0,40.91306,-73.71795,10000.0,1250.0,607.0,27.95,327.0,365.0


## There seem to be quite a few outliers, so let's take it one column at a time

In [6]:
def find_min_max(s: pd.Series, whisker: float = 1.5) -> tuple:
    """Return the lower and upper IQR outlier fences of a numeric Series.

    Despite the name, these are not the observed minimum and maximum: the
    result is ``(Q1 - whisker * IQR, Q3 + whisker * IQR)``, i.e. the bounds
    outside of which a value is conventionally treated as an outlier.

    Parameters
    ----------
    s : pd.Series
        Numeric data to analyse.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range. The default keeps the
        original behaviour; a larger value (e.g. 3.0) gives wider fences.

    Returns
    -------
    tuple
        ``(bottom, top)`` fences. The lower fence may fall below any value
        that is physically possible for the column (e.g. a negative price).
    """
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    bottom = q1 - whisker * iqr
    top = q3 + whisker * iqr
    return bottom, top

# Report the IQR fences for the two columns that showed extreme values.
for column in ["price", "minimum_nights"]:
    lower_fence, upper_fence = find_min_max(df[column])
    print(f"{column}:\n\tmin: {lower_fence:.2f}\n\tmax: {upper_fence:.2f}")

price:
	min: -90.00
	max: 334.00
minimum_nights:
	min: -5.00
	max: 11.00


### It seems most of the outliers are on the high end of the data (the lower IQR fences are negative, which is impossible for these columns), so let's use common sense for the lower bounds and round the upper price bound to the nearest \$50 mark

#### This leaves us with a range of \$10 - \$350 for price (Airbnb has a minimum price of \$10, as found by a Google search) and a range of 1 - 11 minimum nights, which we will round up to 14 (2 weeks)

In [7]:
# Accepted ranges (both ends inclusive, as Series.between is by default).
MIN_PRICE, MAX_PRICE = 10, 350
MIN_NIGHTS, MAX_NIGHTS = 1, 14

price_filter = df["price"].between(MIN_PRICE, MAX_PRICE)
min_nights_filter = df["minimum_nights"].between(MIN_NIGHTS, MAX_NIGHTS)

# Keep only rows that satisfy both conditions; copy so later column
# assignments on new_df do not touch (or warn about) the raw frame.
rows_in_range = price_filter & min_nights_filter
new_df = df.loc[rows_in_range].copy()
new_df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,16747.0,16747.0,16747.0,16747.0,16747.0,16747.0,16747.0,13842.0,16747.0,16747.0
mean,18733160.0,66001280.0,40.727028,-73.949359,121.082642,2.860572,25.856392,1.469779,3.105153,97.520571
std,10903900.0,77701220.0,0.056214,0.04792,70.664779,2.291717,47.30526,1.73422,20.105021,124.838908
min,2539.0,2571.0,40.50873,-74.23914,10.0,1.0,0.0,0.01,1.0,0.0
25%,9287530.0,7748101.0,40.687595,-73.98123,67.0,1.0,1.0,0.21,1.0,0.0
50%,19317830.0,30523480.0,40.72007,-73.95365,100.0,2.0,7.0,0.83,1.0,25.0
75%,28616560.0,101980400.0,40.76298,-73.932665,159.0,3.0,27.0,2.18,2.0,179.0
max,36485610.0,274273300.0,40.91306,-73.71795,350.0,14.0,607.0,27.95,327.0,365.0


In [8]:
# Column dtypes and non-null counts of the raw (unfiltered) sample.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20000 non-null  int64  
 1   name                            19993 non-null  object 
 2   host_id                         20000 non-null  int64  
 3   host_name                       19992 non-null  object 
 4   neighbourhood_group             20000 non-null  object 
 5   neighbourhood                   20000 non-null  object 
 6   latitude                        20000 non-null  float64
 7   longitude                       20000 non-null  float64
 8   room_type                       20000 non-null  object 
 9   price                           20000 non-null  int64  
 10  minimum_nights                  20000 non-null  int64  
 11  number_of_reviews               20000 non-null  int64  
 12  last_review                     

### Additionally, `last_review` appears to be a date but is read as a string. Let's convert it to a datetime type

In [9]:
# Parse the date strings into datetimes, then confirm the new dtype.
last_review_dates = pd.to_datetime(new_df["last_review"])
new_df["last_review"] = last_review_dates
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16747 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              16747 non-null  int64         
 1   name                            16740 non-null  object        
 2   host_id                         16747 non-null  int64         
 3   host_name                       16740 non-null  object        
 4   neighbourhood_group             16747 non-null  object        
 5   neighbourhood                   16747 non-null  object        
 6   latitude                        16747 non-null  float64       
 7   longitude                       16747 non-null  float64       
 8   room_type                       16747 non-null  object        
 9   price                           16747 non-null  int64         
 10  minimum_nights                  16747 non-null  int64         
 11  nu

## That looks much better. Now let's close the run and move on to the next step

In [10]:
# Close the W&B run started by wandb.init() at the top of the notebook.
run.finish()