In [1]:
import pandas as pd

In [2]:
# Load the listings CSV; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")

In [3]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,19-10-2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,21-05-2019,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,05-07-2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,19-11-2018,0.1,1,0


## Converting identifier and coordinate columns to strings

In [24]:

# Cast the identifier and coordinate columns to strings so pandas treats them as labels.
# As a consequence they are left out of numeric summaries such as describe()
# and corr(numeric_only=True).
for column in ("id", "host_id", "latitude", "longitude"):
    df[column] = df[column].astype(str)


In [5]:

# Summary statistics for the numeric columns; the quartiles listed are the pandas defaults.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48906.0,48906.0,48906.0,38854.0,48906.0,48906.0
mean,152.711324,7.031612,23.300454,1.373151,7.142702,112.782031
std,240.128713,20.512489,44.607175,1.68027,32.948926,131.62037
min,0.0,1.0,0.0,0.01,1.0,0.0
25%,69.0,1.0,1.0,0.19,1.0,0.0
50%,106.0,3.0,5.0,0.72,1.0,45.0
75%,175.0,5.0,24.0,2.02,2.0,227.0
max,10000.0,1250.0,629.0,58.5,327.0,365.0


## Categorical Data

In [6]:
# Count the distinct values in every column; missing values are not counted (dropna=True is the default).
# NOTE(review): the recorded output shows 48,895 distinct ids, while describe() reports
# 48,906 non-null prices, so at least some ids repeat — check for duplicate rows.
df.nunique(dropna=True)

id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [7]:

# Show the column labels of df.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [None]:
# Number of rows for each distinct room_type, most frequent first.
df["room_type"].value_counts()


room_type
Entire home/apt    25414
Private room       22332
Shared room         1160
Name: count, dtype: int64

In [None]:
# Share of rows (a fraction between 0 and 1) for each distinct room_type.
df["room_type"].value_counts(normalize=True)


room_type
Entire home/apt    0.519650
Private room       0.456631
Shared room        0.023719
Name: proportion, dtype: float64

In [None]:
# Number of rows for each distinct neighbourhood_group, most frequent first.
df["neighbourhood_group"].value_counts()


neighbourhood_group
Manhattan        21669
Brooklyn         20107
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64

## Numerical Data

In [None]:
# Split the observed price range into 5 equal-width intervals and count the rows in each.
df["price"].value_counts(bins=5)


(-10.001, 2000.0]    48820
(2000.0, 4000.0]        54
(4000.0, 6000.0]        16
(6000.0, 8000.0]         9
(8000.0, 10000.0]        7
Name: count, dtype: int64

In [None]:
# Hand-picked, unequal-width bin edges: narrower intervals at low prices give more detail there.
# The -10 lower edge places price == 0 in an interval of its own.
# Helpful in small datasets
bins = [-10, 0, 50, 100, 200, 500, 800, 2000, 4000, 10000]
df["price"].value_counts(bins=bins)

(50.0, 100.0]        17373
(100.0, 200.0]       16588
(200.0, 500.0]        7340
(0.0, 50.0]           6550
(500.0, 800.0]         624
(800.0, 2000.0]        334
(2000.0, 4000.0]        54
(4000.0, 10000.0]       32
(-10.001, 0.0]          11
Name: count, dtype: int64

## Measures of Central Tendency and Dispersion

In [13]:
# Arithmetic mean of the price column.
# In the recorded outputs the mean (~152.7) is well above the median (106.0),
# which is consistent with a right-skewed distribution.
df["price"].mean()

152.71132376395533

In [14]:
# Median (50th percentile) of the price column; less sensitive to extreme values than the mean.
df["price"].median()

106.0

In [15]:
# Standard deviation of the price column.
# Note: this is a measure of dispersion, not of central tendency.
df["price"].std()

240.1287131622437

In [16]:
# Arithmetic mean of the minimum_nights column.
df["minimum_nights"].mean()

7.031611663190611

In [17]:
# Median of the minimum_nights column.
df["minimum_nights"].median()

3.0

## Measures of Shape (Skewness and Kurtosis) and Correlation

In [None]:
# Skewness of the price column; a positive value means the right tail is the longer one.
df["price"].skew()

19.120831694826197

In [None]:
# Kurtosis of the price column. Kurtosis describes the "tailedness" of a distribution,
# i.e. how much weight sits in extreme values relative to a normal distribution.
# pandas reports excess kurtosis (Fisher's definition, where a normal distribution scores 0.0).
df["price"].kurt()

585.7930484394186

In [20]:
# Number of rows whose availability_365 equals 365.
# Summing the boolean mask counts the matches without building a filtered copy of df.
int(df["availability_365"].eq(365).sum())

1295

In [33]:
# Pairwise Pearson correlation between the numeric columns; non-numeric columns are left out.
df.corr(method="pearson", numeric_only=True)


Unnamed: 0,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
price,1.0,0.042771,-0.048014,-0.030608,0.057478,0.081817
minimum_nights,0.042771,1.0,-0.080093,-0.121772,0.127917,0.144146
number_of_reviews,-0.048014,-0.080093,1.0,0.549291,-0.072375,0.172002
reviews_per_month,-0.030608,-0.121772,0.549291,1.0,-0.009414,0.185818
calculated_host_listings_count,0.057478,0.127917,-0.072375,-0.009414,1.0,0.22568
availability_365,0.081817,0.144146,0.172002,0.185818,0.22568,1.0


#### Correlation Matrix Explanation:
Correlation coefficients range from -1 to 1:
1. 1 indicates a perfect positive linear correlation (both variables move in the same direction).
2. -1 indicates a perfect negative linear correlation (the variables move in opposite directions).
3. 0 indicates no linear relationship between the variables.