# New York City Airbnb Open Data

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Read the raw listings CSV into the working DataFrame
DATA_FILE = "AB_NYC_2019.csv"
df = pd.read_csv(DATA_FILE)

In [5]:
# Structural overview of the dataset: columns, non-null counts and dtypes
info_heading = "Basic Dataset info:"
print(info_heading)
df.info()

Basic Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review 

In [16]:
# Count the missing entries in every column
print("\nMissing values:")
missing_per_column = df.isna().sum()
missing_per_column


Missing values:


id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [18]:
# Count rows that are exact duplicates of an earlier row
duplicate_count = df.duplicated().sum()
print("\nNumber of duplicates in the dataset:", duplicate_count)


Number of duplicates in the dataset: 0


In [26]:
#Data integrity 

In [24]:
# Summary statistics of the critical numeric columns as a basic integrity check
critical_columns = ['price', 'minimum_nights', 'availability_365']
range_summary = df[critical_columns].describe()
print("\nData integrity check (Range of values in critical columns):")
print(range_summary)


Data integrity check (Range of values in critical columns):
              price  minimum_nights  availability_365
count  48895.000000    48895.000000      48895.000000
mean     152.720687        7.029962        112.781327
std      240.154170       20.510550        131.622289
min        0.000000        1.000000          0.000000
25%       69.000000        1.000000          0.000000
50%      106.000000        3.000000         45.000000
75%      175.000000        5.000000        227.000000
max    10000.000000     1250.000000        365.000000


In [25]:
# Check every critical column for values outside its valid range:
#   - price and minimum_nights must not be negative
#   - availability_365 must lie within 0..365 (this column is part of the
#     summary statistics cell but was previously not validated here)
# NOTE(review): the summary statistics report a minimum price of 0; `price < 0`
# does not flag such rows - confirm whether zero-priced listings are valid.
out_of_range = (
    (df['price'] < 0)
    | (df['minimum_nights'] < 0)
    | ~df['availability_365'].between(0, 365)
)
outlier_check = df[out_of_range]
print("\nOut-of-range values (if any):")
print(outlier_check)


Out-of-range values (if any):
Empty DataFrame
Columns: [id, name, host_id, host_name, neighbourhood_group, neighbourhood, latitude, longitude, room_type, price, minimum_nights, number_of_reviews, last_review, reviews_per_month, calculated_host_listings_count, availability_365]
Index: []


In [27]:
# Missing data handling

In [28]:
# Fill missing 'name' and 'host_name' with the same placeholder, 'Unknown'.
# The result is assigned back to the column rather than calling
# df[col].fillna(..., inplace=True): that form mutates an intermediate object,
# triggers a chained-assignment warning on pandas 2.x, and leaves df unchanged
# once Copy-on-Write is enabled.
df['name'] = df['name'].fillna('Unknown')
df['host_name'] = df['host_name'].fillna('Unknown')

In [30]:
# Fill missing 'reviews_per_month' with 0 (assuming a missing rate means no reviews).
# Assign the result back instead of using fillna(inplace=True) on the selected
# column, which is chained assignment and does not update df under Copy-on-Write.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [31]:
# Fill missing 'last_review' with a text placeholder.
# Assign the result back instead of using fillna(inplace=True) on the selected
# column, which is chained assignment and does not update df under Copy-on-Write.
# NOTE(review): the placeholder spelling is kept exactly as it was, in case other
# cells compare against it; it also keeps the column as text rather than dates.
df['last_review'] = df['last_review'].fillna('NO Reviws')

In [32]:
# Confirm that no missing values remain after the fills
remaining_missing = df.isna().sum()
print("\nMissing values after filling:")
print(remaining_missing)


Missing values after filling:
id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64


In [36]:
# Remove exact duplicate rows (a no-op when there are none), then confirm none are left
df.drop_duplicates(inplace=True)
remaining_duplicates = df.duplicated().sum()
print("\nNumber of duplicates after removal:", remaining_duplicates)


Number of duplicates after removal: 0


In [37]:
# Standardize the text columns 'name' and 'host_name' to title case
for text_column in ('name', 'host_name'):
    df[text_column] = df[text_column].str.title()

In [38]:
# Verify the standardization on the first few rows.
# The heading previously contained a stray 'n' after the newline escape
# (printing "nSample ...") and lacked a space before 'name'.
print("\nSample of standardized 'name' and 'host_name':")
print(df[['name','host_name']].head())


nSample of standardized'name' and 'host_name':
                                               name    host_name
0                Clean & Quiet Apt Home By The Park         John
1                             Skylit Midtown Castle     Jennifer
2               The Village Of Harlem....New York !    Elisabeth
3                   Cozy Entire Floor Of Brownstone  Lisaroxanne
4  Entire Apt: Spacious Studio/Loft By Central Park        Laura


In [40]:
#Outlier Detection using IQR method

In [42]:
#function to detect outliers
def detect_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is treated as an outlier when it lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) containing the
        outlier rows; empty when no value falls outside the fences.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Comparisons with NaN are False, so missing values are never reported.
    return df[(values < lower_bound) | (values > upper_bound)]

In [45]:
# Apply the IQR rule separately to the nightly price and the minimum-stay columns
price_outliers, min_nights_outliers = (
    detect_outliers(df, column_name)
    for column_name in ('price', 'minimum_nights')
)

In [48]:
# Output the results

In [49]:
# Preview the listing id and price of the first few price outliers
price_preview = price_outliers[['id', 'price']].head(5)
print("\nPrice Outliers (first 5 rows):")
print(price_preview)


Price Outliers (first 5 rows):
        id  price
61   15396    375
85   19601    800
103  23686    500
114  26933    350
121  27659    400


In [50]:
# Preview the listing id and minimum stay of the first few minimum-nights outliers
min_nights_preview = min_nights_outliers[['id', 'minimum_nights']].head(5)
print("\nMinimum Nights Outliers (first 5 rows):")
print(min_nights_preview)


Minimum Nights Outliers (first 5 rows):
       id  minimum_nights
6    5121              45
14   6090              90
29   9657              14
36  11452              60
45  12627              29


In [52]:
# Final structural overview of the cleaned dataset
final_heading = "\nFinal Dataset Info After Cleaning:"
print(final_heading)
df.info()


Final Dataset Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48895 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48895 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 