In [14]:
import pandas as pd
import numpy as np

In [7]:
def _income_to_int(raw):
    """Remove every '$' from a raw income cell and parse what is left as an int."""
    return int(raw.replace('$', ''))


# Load the income table; the 'income' column is converted to integers while parsing.
df = pd.read_csv(
    'income.csv',
    engine='c',
    sep=',',
    header=0,
    converters={'income': _income_to_int},
)

### Remove outliers using percentiles

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,income
count,7.0
mean,1433929.0
std,3777283.0
min,4000.0
25%,5500.0
50%,7000.0
75%,7750.0
max,10000000.0


In [11]:
# 75th percentile of income. interpolation='lower' returns the lower of the two
# observations that surround the percentile position; the default ('linear')
# interpolates between them and 'higher' returns the upper one.
df['income'].quantile(q=0.75, interpolation='lower')

7500

In [12]:
# Income value at the 99th percentile (default linear interpolation).
percentile_99 = df['income'].quantile(q=0.99)
percentile_99

9400479.999999994

In [13]:
# Keep only the rows whose income is strictly below the 99th-percentile value.
df_no_outlier = df.loc[df['income'] < percentile_99]
df_no_outlier

Unnamed: 0,name,income
0,Rob,5000
1,Rafiq,6000
2,Nina,4000
3,Sofia,7500
4,Mohan,8000
5,Tao,7000


In [21]:
# Blank out the income at row label 3 to demonstrate missing-value handling.
# - The column is cast to float first: NaN cannot be stored in an integer
#   column, and relying on an implicit upcast during assignment is deprecated.
# - The write uses a single .loc call instead of the chained df['income'][3]
#   form, which may assign to a temporary copy and leave df unchanged.
# - np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['income'] = df['income'].astype('float64')
df.loc[3, 'income'] = np.nan

In [22]:
# Display the frame to confirm that the income at row label 3 is now missing.
df

Unnamed: 0,name,income
0,Rob,5000.0
1,Rafiq,6000.0
2,Nina,4000.0
3,Sofia,
4,Mohan,8000.0
5,Tao,7000.0
6,Elon Musk,10000000.0


In [24]:
# Mean income over the non-missing rows (the NaN entry is skipped).
df['income'].mean()

1671666.6666666667

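Filling the missing value with the mean does not work well here: the mean (about 1.67 million) is dominated by the single extreme income, so it is not representative of a typical value.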

In [25]:
# Fill the missing income with the column mean. The fill value is keyed by
# column so that only 'income' is touched; a bare scalar would be written into
# every column that has a missing entry (e.g. a missing 'name').
df_no_na = df.fillna({'income': df['income'].mean()})
df_no_na

Unnamed: 0,name,income
0,Rob,5000.0
1,Rafiq,6000.0
2,Nina,4000.0
3,Sofia,1671667.0
4,Mohan,8000.0
5,Tao,7000.0
6,Elon Musk,10000000.0


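Use the median instead of the mean: the median is robust to the outlier, so the filled value (6,500) is close to a typical income.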

In [27]:
# Fill the missing income with the column median. The fill value is keyed by
# column so that only 'income' is touched; a bare scalar would be written into
# every column that has a missing entry (e.g. a missing 'name').
df_no_na = df.fillna({'income': df['income'].median()})
df_no_na

Unnamed: 0,name,income
0,Rob,5000.0
1,Rafiq,6000.0
2,Nina,4000.0
3,Sofia,6500.0
4,Mohan,8000.0
5,Tao,7000.0
6,Elon Musk,10000000.0


# Airbnb Dataset

In [28]:
# Load the Airbnb listings file and preview its first five rows.
airbnb_df = pd.read_csv(
    'AB_NYC_2019.csv',
    engine='c',
    sep=',',
)
airbnb_df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [36]:
# Summary statistics for the numeric columns of the Airbnb listings.
airbnb_df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [None]:
# Replace missing values in the listed numeric columns with that column's
# median; columns not listed here are left untouched.
median_filled_columns = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
]
airbnb_df_nona = airbnb_df.fillna(
    {column: airbnb_df[column].median() for column in median_filled_columns}
)
airbnb_df_nona.head()

In [58]:
# 1st and 99.9th percentiles of price, unpacked into lower and upper thresholds.
price_cutoffs = airbnb_df_nona['price'].quantile(q=[0.01, 0.999])
min_threshold, max_threshold = price_cutoffs
min_threshold, max_threshold

(30.0, 3000.0)

In [59]:
# Keep listings whose price lies strictly between the two thresholds
# (rows equal to either threshold are excluded), then summarise the result.
airbnb_df_clean = airbnb_df_nona.query(
    'price > @min_threshold and price < @max_threshold'
)
airbnb_df_clean.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48183.0,48183.0,48183.0,48183.0,48183.0,48183.0,48183.0,48183.0,48183.0,48183.0
mean,18962720.0,67230350.0,40.729155,-73.952498,148.772036,7.011104,23.393915,1.239819,7.190046,112.74022
std,10994040.0,78515400.0,0.054231,0.045894,153.594795,20.526646,44.701116,1.521897,33.175856,131.523239
min,2539.0,2438.0,40.49979,-74.24442,31.0,1.0,0.0,0.01,1.0,0.0
25%,9411247.0,7719674.0,40.690355,-73.98319,70.0,1.0,1.0,0.28,1.0,0.0
50%,19603060.0,30384190.0,40.72328,-73.95593,110.0,3.0,5.0,0.72,1.0,45.0
75%,29104930.0,107245500.0,40.76318,-73.93696,179.0,5.0,24.0,1.58,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,2999.0,1250.0,629.0,58.5,327.0,365.0
