In [1]:
import pandas as pd

# Load the dataset.
# The path is written as a raw string so the Windows backslashes are taken
# literally. In a normal string, '\c' and '\A' are invalid escape sequences:
# they happen to be preserved today, but Python 3.12+ reports them with a
# SyntaxWarning and they are slated to become an error.
file_path = r'E:\cleaning data\AB_NYC_2019.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [2]:
# Summarise missing data: null count per column and its share of all rows (in %)
missing_data = data.isna().sum()
missing_data_percentage = missing_data.div(len(data)).mul(100)

# Put the two measures side by side, one row per column of `data`
missing_data_summary = pd.concat(
    [
        missing_data.rename('Missing Values'),
        missing_data_percentage.rename('Percentage'),
    ],
    axis=1,
)

# Columns with the most missing entries come first
missing_data_summary.sort_values('Missing Values', ascending=False)

Unnamed: 0,Missing Values,Percentage
last_review,10052,20.558339
reviews_per_month,10052,20.558339
host_name,21,0.042949
name,16,0.032723
id,0,0.0
host_id,0,0.0
neighbourhood_group,0,0.0
neighbourhood,0,0.0
latitude,0,0.0
longitude,0,0.0


In [3]:
# Replacement value for the missing entries of each affected column:
#   - 'reviews_per_month' -> 0
#   - 'last_review'       -> placeholder text 'No reviews'
#   - 'host_name', 'name' -> 'Unknown'
# NOTE: the 'No reviews' placeholder is plain text; the later
# pd.to_datetime(..., errors='coerce') call on 'last_review' turns it into NaT.
fill_values = {
    'reviews_per_month': 0,
    'last_review': 'No reviews',
    'host_name': 'Unknown',
    'name': 'Unknown',
}

# Assign the filled Series back to the frame instead of calling
# data[col].fillna(..., inplace=True): that chained in-place form raises a
# FutureWarning on pandas 2.x and stops updating `data` once Copy-on-Write
# is the default behaviour (pandas 3.0).
for column, placeholder in fill_values.items():
    data[column] = data[column].fillna(placeholder)

# Verify that there are no missing values left
missing_data_after = data.isnull().sum()
missing_data_after

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [4]:
# Flag every row that repeats an earlier row, then count the flagged rows
duplicate_mask = data.duplicated()
duplicates = duplicate_mask.sum()

duplicates

0

In [5]:
# Recount the fully duplicated rows (first occurrence is not counted)
# and report the total as a text message
duplicates = data.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


In [6]:
# Remove rows that exactly repeat an earlier row, comparing all columns and
# keeping the first occurrence; `data` itself is modified.
data.drop_duplicates(subset=None, keep='first', inplace=True)

In [7]:
# Convert 'last_review' to datetimes; errors='coerce' turns any value that
# cannot be parsed as a date into NaT instead of raising.
last_review_parsed = pd.to_datetime(data['last_review'], errors='coerce')
data['last_review'] = last_review_parsed

In [8]:
# Title-case the two neighbourhood label columns
for column in ('neighbourhood_group', 'neighbourhood'):
    data[column] = data[column].str.title()

# Room types: swap underscores for spaces first, then title-case
room_type_spaced = data['room_type'].str.replace('_', ' ')
data['room_type'] = room_type_spaced.str.title()

In [9]:
# Columns included in the summary-statistics table
numeric_columns = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Descriptive statistics for the selected columns
data.loc[:, numeric_columns].describe()

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0
mean,152.720687,7.029962,23.274466,1.09091,7.143982,112.781327
std,240.15417,20.51055,44.550582,1.597283,32.952519,131.622289
min,0.0,1.0,0.0,0.0,1.0,0.0
25%,69.0,1.0,1.0,0.04,1.0,0.0
50%,106.0,3.0,5.0,0.37,1.0,45.0
75%,175.0,5.0,24.0,1.58,2.0,227.0
max,10000.0,1250.0,629.0,58.5,327.0,365.0
