In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py

In [52]:
df = pd.read_csv('Airbnb.csv')


Columns (25) have mixed types. Specify dtype option on import or set low_memory=False.



In [53]:
df.head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,...,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [54]:
df.shape

(102599, 26)

In [55]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,250
host id,0
host_identity_verified,289
host name,406
neighbourhood group,29
neighbourhood,16
lat,8
long,8
country,532


In [56]:
df.drop('license',axis = 1, inplace = True)

In [57]:
df.dtypes

Unnamed: 0,0
id,int64
NAME,object
host id,int64
host_identity_verified,object
host name,object
neighbourhood group,object
neighbourhood,object
lat,float64
long,float64
country,object


In [58]:
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
df['service fee'] = df['service fee'].replace('[\$,]', '', regex=True).astype(float)


In [59]:
df.price.fillna(df['price'], inplace = True)

In [60]:
df['last review'] = pd.to_datetime(df['last review'], errors='coerce')


In [61]:
cat_cols = ['NAME', 'host_identity_verified', 'host name', 'neighbourhood group',
            'neighbourhood', 'country', 'country code', 'instant_bookable',
            'cancellation_policy', 'room type', 'house_rules']

for col in cat_cols:
    df[col] = df[col].astype('category')


In [62]:
df['minimum nights'] = df['minimum nights'].fillna(0).astype(int)
df['number of reviews'] = df['number of reviews'].fillna(0).astype(int)


In [63]:
df.dtypes

Unnamed: 0,0
id,int64
NAME,category
host id,int64
host_identity_verified,category
host name,category
neighbourhood group,category
neighbourhood,category
lat,float64
long,float64
country,category


In [64]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,250
host id,0
host_identity_verified,289
host name,406
neighbourhood group,29
neighbourhood,16
lat,8
long,8
country,532


In [65]:
df['has_reviews'] = df['last review'].notna().astype(int)


In [66]:
df['neighbourhood group'].fillna(df['neighbourhood group'].mode(), inplace = True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [67]:
# Add 'Unknown' to categories first
df['host_identity_verified'] = df['host_identity_verified'].cat.add_categories('Unknown')

# Now fill missing values safely
df['host_identity_verified'] = df['host_identity_verified'].fillna('Unknown')


In [68]:
df['has_house_rules'] = df['house_rules'].notna().astype(int)

In [69]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,250
host id,0
host_identity_verified,0
host name,406
neighbourhood group,29
neighbourhood,16
lat,8
long,8
country,532


In [70]:
df.NAME = df.NAME.cat.add_categories('Unknown')


In [71]:
df.NAME.fillna('Unknown', inplace = True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [72]:
df['host name'] = df['host name'].cat.add_categories('Unknown')

In [73]:
df['host name'].fillna('Unknown', inplace = True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [74]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,0
host id,0
host_identity_verified,0
host name,0
neighbourhood group,29
neighbourhood,16
lat,8
long,8
country,532


In [75]:
# Fill with mode (most frequent value)
for col in ['neighbourhood group', 'neighbourhood', 'country', 'country code',
            'instant_bookable', 'cancellation_policy']:
    if df[col].dtype.name == 'category':
        df[col] = df[col].cat.add_categories('Unknown')
        df[col] = df[col].fillna('Unknown')
    else:
        df[col] = df[col].fillna(df[col].mode()[0])

# Fill coordinates with mean (or drop rows with missing lat/long if preferred)
df['lat'] = df['lat'].fillna(df['lat'].mean())
df['long'] = df['long'].fillna(df['long'].mean())

# Fill numerical columns with median or mean
df['Construction year'] = df['Construction year'].fillna(df['Construction year'].median())
df['price'] = df['price'].fillna(df['price'].median())
df['service fee'] = df['service fee'].fillna(df['service fee'].median())
df['review rate number'] = df['review rate number'].fillna(df['review rate number'].mean())
df['calculated host listings count'] = df['calculated host listings count'].fillna(df['calculated host listings count'].median())
df['availability 365'] = df['availability 365'].fillna(df['availability 365'].median())


In [76]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,0
host id,0
host_identity_verified,0
host name,0
neighbourhood group,0
neighbourhood,0
lat,0
long,0
country,0


In [77]:
print(df.isnull().sum().sort_values(ascending=False))


house_rules                       52131
last review                       15893
reviews per month                 15879
host_identity_verified                0
host name                             0
neighbourhood group                   0
neighbourhood                         0
lat                                   0
id                                    0
NAME                                  0
host id                               0
country code                          0
country                               0
long                                  0
instant_bookable                      0
price                                 0
cancellation_policy                   0
room type                             0
Construction year                     0
number of reviews                     0
minimum nights                        0
service fee                           0
review rate number                    0
calculated host listings count        0
availability 365                      0


In [78]:
# Step 1: Add the new category "Not provided"
df['house_rules'] = df['house_rules'].astype('category')
df['house_rules'] = df['house_rules'].cat.add_categories(["Not provided"])

# Step 2: Now fill the NaNs
df['house_rules'] = df['house_rules'].fillna("Not provided")


In [79]:
df['last review'] = pd.to_datetime(df['last review'], errors='coerce')
df['reviews per month'] = df['reviews per month'].fillna(0)


In [80]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,0
host id,0
host_identity_verified,0
host name,0
neighbourhood group,0
neighbourhood,0
lat,0
long,0
country,0


In [81]:
df = df.drop(columns=['last review'])


In [82]:
df.isnull().sum()

Unnamed: 0,0
id,0
NAME,0
host id,0
host_identity_verified,0
host name,0
neighbourhood group,0
neighbourhood,0
lat,0
long,0
country,0


In [83]:
import plotly.express as px
import plotly.graph_objects as go


In [84]:

print(df.shape)
print(df.info())

# Price distribution
df['price'] = pd.to_numeric(df['price'], errors='coerce')
print(df['price'].describe())

# Room type count
print(df['room type'].value_counts())

# Availability distribution
print(df['availability 365'].value_counts().head())

# Host listings count
print(df['calculated host listings count'].describe())

# Top 10 Neighbourhoods by listing count
top_neigh = df['neighbourhood'].value_counts().nlargest(10)


(102599, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype   
---  ------                          --------------   -----   
 0   id                              102599 non-null  int64   
 1   NAME                            102599 non-null  category
 2   host id                         102599 non-null  int64   
 3   host_identity_verified          102599 non-null  category
 4   host name                       102599 non-null  category
 5   neighbourhood group             102599 non-null  category
 6   neighbourhood                   102599 non-null  category
 7   lat                             102599 non-null  float64 
 8   long                            102599 non-null  float64 
 9   country                         102599 non-null  category
 10  country code                    102599 non-null  category
 11  instant_bookable                102599 non-null  cat

In [85]:
fig = px.pie(df, names='room type', title='Room Type Distribution')
fig.show()


In [86]:
fig = px.histogram(df, x='price', title='Price Distribution (Log Scale)', log_y=True, nbins=50)
fig.show()


In [87]:
fig = px.scatter_mapbox(df, lat="lat", lon="long", hover_name="NAME",
                        color="room type", zoom=10, height=500)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(title='Geographical Distribution of Listings')
fig.show()


In [93]:

# Count listings per neighbourhood group
df_grouped = df['neighbourhood group'].value_counts().reset_index()
df_grouped.columns = ['neighbourhood_group', 'count']

# Create bar chart with shades of blue
fig = px.bar(
    df_grouped,
    x='neighbourhood_group',
    y='count',
    color='count',
    color_continuous_scale='Reds',
    title='Listings by Neighbourhood Group'
)

fig.show()


In [89]:
import pandas as pd
import plotly.express as px
from dash import Dash, dcc, html, Input, Output



# Remove outliers for visualization (optional)
df = df[df['price'] < 1000]  # you can adjust this threshold

# Initialize Dash app
app = Dash(__name__)

# Layout of the dashboard
app.layout = html.Div([
    html.H1("Airbnb EDA Dashboard", style={'textAlign': 'center'}),

    html.Div([
        html.Label("Select Neighbourhood:"),
        dcc.Dropdown(
            options=[{"label": nb, "value": nb} for nb in df["neighbourhood"].unique()],
            value=None,
            id='neighbourhood-filter',
            placeholder="Select a neighbourhood"
        ),
    ], style={'width': '48%', 'display': 'inline-block'}),

    html.Div([
        html.Label("Select Room Type:"),
        dcc.Dropdown(
            options=[{"label": rt, "value": rt} for rt in df["room type"].unique()],
            value=None,
            id='room-filter',
            placeholder="Select a room type"
        ),
    ], style={'width': '48%', 'float': 'right', 'display': 'inline-block'}),

    dcc.Graph(id='price-hist'),
    dcc.Graph(id='room-bar')
])

# Callback to update graphs
@app.callback(
    [Output('price-hist', 'figure'),
     Output('room-bar', 'figure')],
    [Input('neighbourhood-filter', 'value'),
     Input('room-filter', 'value')]
)
def update_graphs(selected_neighbourhood, selected_room):
    filtered_df = df.copy()

    if selected_neighbourhood:
        filtered_df = filtered_df[filtered_df['neighbourhood'] == selected_neighbourhood]

    if selected_room:
        filtered_df = filtered_df[filtered_df['room type'] == selected_room]

    # Price histogram
    fig_price = px.histogram(
        filtered_df, x='price', nbins=50, title='Price Distribution',
        labels={'price': 'Price (USD)'}
    )

    # Room type bar chart
    room_counts = filtered_df['room type'].value_counts().reset_index()
    room_counts.columns = ['Room Type', 'Count']
    fig_room = px.bar(
        room_counts, x='Room Type', y='Count',
        title='Room Type Distribution',
        labels={'Room Type': 'Room Type', 'Count': 'Number of Listings'}
    )

    return fig_price, fig_room

# Run the Dash app
if __name__ == '__main__':
    app.run(debug=True)  # ✅ FIXED


<IPython.core.display.Javascript object>

In [95]:
import pandas as pd
import plotly.express as px
from dash import Dash, dcc, html, Input, Output

# Load and clean data (make sure it's already preprocessed)
# Example:
# df = pd.read_csv('airbnb_data.csv')

# Start Dash app
app = Dash(__name__)

# App layout
app.layout = html.Div([
    html.H1("NYC Airbnb EDA Dashboard", style={'textAlign': 'center', 'marginBottom': '20px'}),

    html.Div([
        html.Div([
            html.Label("Select Neighbourhood Group:"),
            dcc.Dropdown(
                id='neighbourhood-dropdown',
                options=[{'label': grp, 'value': grp} for grp in sorted(df['neighbourhood group'].unique())],
                value=None,
                placeholder="All Neighbourhood Groups"
            )
        ], style={'width': '48%', 'display': 'inline-block'}),

        html.Div([
            html.Label("Select Room Type:"),
            dcc.Dropdown(
                id='roomtype-dropdown',
                options=[{'label': room, 'value': room} for room in sorted(df['room type'].unique())],
                value=None,
                placeholder="All Room Types"
            )
        ], style={'width': '48%', 'display': 'inline-block', 'float': 'right'}),
    ], style={'marginBottom': '30px'}),

    dcc.Graph(id='price-histogram'),
    dcc.Graph(id='neighbourhood-bar')
])

# Callbacks for interactivity
@app.callback(
    Output('price-histogram', 'figure'),
    Output('neighbourhood-bar', 'figure'),
    Input('neighbourhood-dropdown', 'value'),
    Input('roomtype-dropdown', 'value')
)
def update_graphs(selected_group, selected_room):
    filtered_df = df.copy()

    if selected_group:
        filtered_df = filtered_df[filtered_df['neighbourhood group'] == selected_group]
    if selected_room:
        filtered_df = filtered_df[filtered_df['room type'] == selected_room]

    # Histogram
    fig_price = px.histogram(
        filtered_df,
        x='price',
        nbins=50,
        title='Price Distribution',
        color_discrete_sequence=['#1f77b4']
    )
    fig_price.update_layout(
        xaxis_title='Price (USD)',
        yaxis_title='Count',
        font=dict(size=14),
        yaxis_tickformat=',',
        bargap=0.05,
        title_font=dict(size=18)
    )

    # Bar Chart
    group_counts = filtered_df['neighbourhood group'].value_counts().reset_index()
    group_counts.columns = ['neighbourhood_group', 'count']

    fig_group = px.bar(
        group_counts,
        x='neighbourhood_group',
        y='count',
        color='count',
        color_continuous_scale='Blues',
        title='Listings by Neighbourhood Group'
    )
    fig_group.update_layout(
        xaxis_title='Neighbourhood Group',
        yaxis_title='Number of Listings',
        font=dict(size=14),
        xaxis_tickangle=-30,
        yaxis_tickformat=',',
        title_font=dict(size=18)
    )

    return fig_price, fig_group

# Run app
if __name__ == '__main__':
    app.run(debug=True)


<IPython.core.display.Javascript object>