# Import libraries & dependencies


In [1]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

# Load and read Dataset

In [2]:
# Read AB_NYC_2019.csv into the working DataFrame `df`.
# The path is relative, so the file must sit in the current working directory.
df = pd.read_csv("AB_NYC_2019.csv")

In [3]:
# Preview the first rows to sanity-check the load and see the column layout.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


#  Analysis of the Dataset

In [4]:
# determine the shape of the data 

In [5]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

(48895, 16)

In [6]:
# Column labels available in the dataset.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [7]:
# Per-column non-null counts and dtypes in a single summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [8]:
# Identify the type of data stored in each column

In [9]:
# Storage dtype of each column.
# NOTE(review): last_review is loaded as `object` (date strings), not datetime —
# parse it with pd.to_datetime if date arithmetic is needed later.
df.dtypes


id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [10]:
# Number of distinct values per column (missing values are not counted by default).
df.nunique()

id                                48895
name                              47905
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

# Check for missing values

In [11]:
# Identify the empty (missing) cells

In [12]:
 df.isnull().sum()
print(df.isnull().sum())

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


# Removing duplicate and missing data

In [13]:
# Number of rows that exactly repeat an earlier row across all columns.
print(df.duplicated().sum())

0


In [14]:
# Drop every row that has at least one missing value, mutating `df` in place.
# NOTE(review): last_review / reviews_per_month are NaN for listings that have no
# reviews yet, so this also removes all of those listings — confirm that is intended
# (filling reviews_per_month with 0 would keep them).
df.dropna(inplace=True)

In [15]:
 df.isnull().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


# Descriptive analysis


In [16]:
# Summary statistics for the numeric columns. Left as the cell's last expression so it
# is computed once and rendered as a table (previously describe() ran twice and the
# first result was silently discarded).
df.describe()

                 id       host_id      latitude     longitude         price  \
count  3.882100e+04  3.882100e+04  38821.000000  38821.000000  38821.000000   
mean   1.810081e+07  6.424582e+07     40.728129    -73.951149    142.332526   
std    1.069372e+07  7.589752e+07      0.054991      0.046693    196.994756   
min    2.539000e+03  2.438000e+03     40.506410    -74.244420      0.000000   
25%    8.721444e+06  7.029525e+06     40.688640    -73.982460     69.000000   
50%    1.887286e+07  2.837092e+07     40.721710    -73.954810    101.000000   
75%    2.756746e+07  1.018905e+08     40.762990    -73.935020    170.000000   
max    3.645581e+07  2.738417e+08     40.913060    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    38821.000000       38821.000000       38821.000000   
mean         5.869220          29.290255           1.373229   
std         17.389026          48.182900           1.680328   
min          1.000000           1.00


# Ensuring consistent formatting and units across the dataset for accurate analysis


In [17]:
# Remove the listing `name` column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df.drop(columns=['name'], inplace=True, errors='ignore')

# Trim leading/trailing whitespace from host names.
df['host_name'] = df['host_name'].str.strip()

In [18]:

# Lower-case every categorical text column so equal values compare equal
# regardless of how they were capitalised in the source data.
for text_col in ('host_name', 'neighbourhood_group', 'neighbourhood', 'room_type'):
    df[text_col] = df[text_col].str.lower()

df.head()


Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,john,brooklyn,kensington,40.64749,-73.97237,private room,149,1,9,2018-10-19,0.21,6,365
1,2595,2845,jennifer,manhattan,midtown,40.75362,-73.98377,entire home/apt,225,1,45,2019-05-21,0.38,2,355
3,3831,4869,lisaroxanne,brooklyn,clinton hill,40.68514,-73.95976,entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,7192,laura,manhattan,east harlem,40.79851,-73.94399,entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,5099,7322,chris,manhattan,murray hill,40.74767,-73.975,entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [19]:
# Round reviews_per_month to the nearest whole number and store it as an integer.
df['reviews_per_month'] = df['reviews_per_month'].round().astype(int)

# The bare `df.dtypes` that used to sit here was never rendered (only a cell's last
# expression is displayed), so verify the cast explicitly instead.
assert pd.api.types.is_integer_dtype(df['reviews_per_month'])

df.head()

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,john,brooklyn,kensington,40.64749,-73.97237,private room,149,1,9,2018-10-19,0,6,365
1,2595,2845,jennifer,manhattan,midtown,40.75362,-73.98377,entire home/apt,225,1,45,2019-05-21,0,2,355
3,3831,4869,lisaroxanne,brooklyn,clinton hill,40.68514,-73.95976,entire home/apt,89,1,270,2019-07-05,5,1,194
4,5022,7192,laura,manhattan,east harlem,40.79851,-73.94399,entire home/apt,80,10,9,2018-11-19,0,1,0
5,5099,7322,chris,manhattan,murray hill,40.74767,-73.975,entire home/apt,200,3,74,2019-06-22,1,1,129


# Treating outliers

In [20]:
# Standardise price: distance of each listing's price from the mean, expressed in
# standard deviations (pandas' default sample standard deviation)
mean, std_dev = df['price'].mean(), df['price'].std()
df['z_score'] = df['price'].sub(mean).div(std_dev)

# A listing is an outlier when its price is more than 3 standard deviations
# away from the mean, in either direction
outliers = df[df['z_score'].abs() > 3]

# Show the flagged rows
print("Outliers", outliers, sep="\n")

Outliers
             id    host_id            host_name neighbourhood_group  \
85        19601      74303               maggie            brooklyn   
496      174966     836168                henry           manhattan   
662      250801    1314834                rhona           manhattan   
691      256328    1347034               janine           manhattan   
762      273190     605463         west village           manhattan   
...         ...        ...                  ...                 ...   
47041  35569459  163029687         anna + jason           manhattan   
47063  35576863   83819376          ryan & mary           manhattan   
47391  35713184   11503187                    a            brooklyn   
47400  35715171   71276635                  joe           manhattan   
47950  36000376  268449136  alexandra & william           manhattan   

                   neighbourhood  latitude  longitude        room_type  price  \
85              brooklyn heights  40.69723  -73.99268  en

In [21]:
# Keep only listings whose price z-score lies in [-3, 3] (bounds included),
# discard the helper z_score column, and renumber the rows from 0
data_filtered = (
    df[df['z_score'].between(-3, 3)]
    .drop(columns=['z_score'])
    .reset_index(drop=True)
)

# Show the cleaned frame
print("Filtered Data:", data_filtered, sep="\n")

Filtered Data:
             id    host_id    host_name neighbourhood_group    neighbourhood  \
0          2539       2787         john            brooklyn       kensington   
1          2595       2845     jennifer           manhattan          midtown   
2          3831       4869  lisaroxanne            brooklyn     clinton hill   
3          5022       7192        laura           manhattan      east harlem   
4          5099       7322        chris           manhattan      murray hill   
...         ...        ...          ...                 ...              ...   
38504  36425863   83554966        rusaa           manhattan  upper east side   
38505  36427429  257683179         h ai              queens         flushing   
38506  36438336  211644523          ben       staten island      great kills   
38507  36442252  273841667       blaine               bronx       mott haven   
38508  36455809   74162901    christine            brooklyn         bushwick   

       latitude  longitu

In [22]:
# Confirm the row count, columns and dtypes of the outlier-filtered frame.
data_filtered.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38509 entries, 0 to 38508
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              38509 non-null  int64  
 1   host_id                         38509 non-null  int64  
 2   host_name                       38509 non-null  object 
 3   neighbourhood_group             38509 non-null  object 
 4   neighbourhood                   38509 non-null  object 
 5   latitude                        38509 non-null  float64
 6   longitude                       38509 non-null  float64
 7   room_type                       38509 non-null  object 
 8   price                           38509 non-null  int64  
 9   minimum_nights                  38509 non-null  int64  
 10  number_of_reviews               38509 non-null  int64  
 11  last_review                     38509 non-null  object 
 12  reviews_per_month               

In [23]:
# Preview the first rows of the outlier-filtered frame.
data_filtered.head()

Unnamed: 0,id,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,john,brooklyn,kensington,40.64749,-73.97237,private room,149,1,9,2018-10-19,0,6,365
1,2595,2845,jennifer,manhattan,midtown,40.75362,-73.98377,entire home/apt,225,1,45,2019-05-21,0,2,355
2,3831,4869,lisaroxanne,brooklyn,clinton hill,40.68514,-73.95976,entire home/apt,89,1,270,2019-07-05,5,1,194
3,5022,7192,laura,manhattan,east harlem,40.79851,-73.94399,entire home/apt,80,10,9,2018-11-19,0,1,0
4,5099,7322,chris,manhattan,murray hill,40.74767,-73.975,entire home/apt,200,3,74,2019-06-22,1,1,129
