## New York AirBNB Exploratory Analysis 

### Importing Packages and preparing data

In [1]:
## Data source: https://insideairbnb.com/get-the-data/

In [2]:
import os
#Checking for current directory and switching to project folder
current_directory = os.getcwd()
print(current_directory)
os.chdir('/Users/smarr/Documents/Data Sets/AirBNBAnalysis')

/Users/smarr/nba-pandas


In [3]:
# Verifying Directory was switched
os.getcwd()

'/Users/smarr/Documents/Data Sets/AirBNBAnalysis'

In [4]:
import pandas as pd

In [5]:
## Importing listings data and cleaning the price column to ensure successful numerical analysis
listings = pd.read_csv('../AirBNBAnalysis/data/listings_clean.csv')
listings['price']= listings['price'].replace('[\$,]', '', regex=True).astype(float)
listings.to_csv('listings_clean.csv', index=False)

  listings['price']= listings['price'].replace('[\$,]', '', regex=True).astype(float)


In [6]:
# Testing
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,36121,https://www.airbnb.com/rooms/36121,20250301213336,2025-03-03,city scrape,Lg Rm in Historic Prospect Heights,Cozy space share in the heart of a great neigh...,Full of tree-lined streets and beautiful brown...,https://a0.muscache.com/pictures/8776354/35b32...,62165,...,5.0,5.0,5.0,,f,1,0,1,0,0.05
1,36647,https://www.airbnb.com/rooms/36647,20250301213336,2025-03-03,city scrape,"1 Bedroom & your own Bathroom, Elevator Apartment",Private bedroom with your own bathroom in a 2 ...,"Manhattan, SE corner of 2nd Ave/ E. 110th street",https://a0.muscache.com/pictures/miso/Hosting-...,157798,...,4.9,4.38,4.71,,f,1,0,1,0,0.58
2,38663,https://www.airbnb.com/rooms/38663,20250301213336,2025-03-02,city scrape,Luxury Brownstone in Boerum Hill,"Beautiful, large home in great hipster neighbo...","diverse, lively, hip, cool: loaded with restau...",https://a0.muscache.com/pictures/miso/Hosting-...,165789,...,4.88,4.86,4.62,OSE-STRREG-0001784,f,1,0,1,0,0.28
3,38833,https://www.airbnb.com/rooms/38833,20250301213336,2025-03-03,city scrape,Spectacular West Harlem Garden Apt,This is a very large and unique space. An inc...,West Harlem is now packed with great restauran...,https://a0.muscache.com/pictures/7554f9e5-4ab0...,166532,...,4.96,4.79,4.82,OSE-STRREG-0000476,f,1,1,0,0,1.36
4,39282,https://www.airbnb.com/rooms/39282,20250301213336,2025-03-02,city scrape,“Work-from-home” from OUR home.,*Monthly Discount will automatically apply <br...,THE NEIGHBORHOOD:<br />Our apartment is locate...,https://a0.muscache.com/pictures/ef8f43ad-d967...,168525,...,4.88,4.85,4.78,OSE-STRREG-0001150,f,2,0,2,0,1.54


In [7]:
# Importing SQL python package to work with the SQL database, establishing connection between python and SQL
from sqlalchemy import create_engine
engine = create_engine('postgresql://smarr@localhost:5432/airbnb_analysis', echo=False)
conn = engine.connect()

In [8]:
#Submitting our listings dataframe to SQL
listings.to_sql('listings', engine, if_exists='replace', index=False)

264

In [9]:
dfCalendar = pd.read_csv('../AirBNBAnalysis/data/calendar.csv')

  dfCalendar = pd.read_csv('../AirBNBAnalysis/data/calendar.csv')


In [None]:
# chunk processing since this calendar file is extremely large, cleaning string into integers for successful numerical analysis
dfCalendar = pd.read_csv('../AirBNBAnalysis/data/calendar.csv', chunksize=200000) 
for chunk in dfCalendar:
    chunk['price'] = chunk['price'].replace(r'[\$,]', '', regex=True).astype(float)
    chunk.to_sql('calendar', engine, if_exists='append', index=False, method='multi')

In [None]:
# Testing data frame for adjusted price column
dfCalendar

In [None]:
## call engine.dispose() or ensure .close() is used on connections to prevent table lock in postgresql

### Pivot table for understanding price change over time across neighbourhoods

In [None]:
df = pd.read_sql_query("""
    SELECT 
        TO_CHAR(c.date, 'YYYY-MM') AS month,
        l.neighbourhood_cleansed,
        ROUND(AVG(c.price)::numeric, 2) AS avg_price
    FROM calendar c
    JOIN listings l ON c.listing_id = l.id
    WHERE c.available = 't'
    GROUP BY TO_CHAR(c.date, 'YYYY-MM'), l.neighbourhood_cleansed
""", conn)

In [None]:
pivot_df = df.pivot(index='neighbourhood_cleansed', columns='month', values='avg_price')
pivot_df.head()

### Adding reviews csv file for sentiment analysis (Python and PostgreSQL)

In [None]:
dfReviews = pd.read_csv("../AirBNBAnalysis/data/reviews.csv", low_memory=False)

In [None]:
dfReviews.to_sql('reviews', engine, if_exists='replace', index=False)

In [None]:
dfReviews.head()

In [None]:
## Drop Null values as they will make it more difficult to do sentiment analysis
dfReviews=dfReviews.dropna(subset=['comments'])

In [None]:
## Checking length of Dataframe for verification purposes
len(dfReviews)

In [None]:
## Make all comments lower-case for standardizing purposes
dfReviews['comments']=dfReviews['comments'].str.lower()

### Sentiment Scoring

In [None]:
from textblob import TextBlob

In [None]:
## Define Function
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [None]:
## Apply it to Airbnb comments
dfReviews['Sentiment']=dfReviews['comments'].apply(get_sentiment)

In [None]:
dfReviews[['listing_id','Sentiment']].head()
## Doesn't quite get us our average sentiment by listing

In [None]:
dfReviews

### Average sentiment and count by listing ID

In [None]:
sentimentAvgandCount_by_listing=dfReviews.groupby('listing_id').agg(avg_sentiment=('Sentiment','mean'),sentiment_count=('Sentiment','count')).reset_index()
sentimentAvgandCount_by_listing

In [None]:
## Merge with Listings data
CombinedDF=listings.merge(sentimentAvgandCount_by_listing,left_on='id',right_on='listing_id')

In [None]:
CombinedDF

In [None]:
sentimentAvgandCount_by_listing.sort_values(by='avg_sentiment', ascending=False).head(10)
## Want to get listings that have a sentiment count greater than 10, to get a substantive analysis

In [None]:
sentimentAvgandCount_by_listing = sentimentAvgandCount_by_listing[sentimentAvgandCount_by_listing['sentiment_count']>10]

In [None]:
sentimentAvgandCount_by_listing.sort_values(by='avg_sentiment', ascending=False).head(10)
## Listing ID is pretty abstract. I want to see the URl and the neighbourhood that each listing is in.

In [None]:
sentimentAvgandCount_by_listingNew=sentimentAvgandCount_by_listing.merge(listings[['neighbourhood_cleansed', 'listing_url','id']], left_on = 'listing_id', right_on='id')

In [None]:
## Now lets run the group by and get a better idea of the area and general quality of the listing
sentimentAvgandCount_by_listingNew.sort_values(by='avg_sentiment', ascending=False).head(10).drop(columns=['id'], errors='ignore')

### Average sentiment and count by neighbourhood (Additional Merge is needed)

In [None]:
CombinedDF_NeighbourhoodGroupby=dfReviews.merge(listings[['id','neighbourhood_cleansed']], left_on = 'listing_id', right_on='id')

In [None]:
CombinedDF_NeighbourhoodGroupby

In [None]:
## Drop id_x and y values
CombinedDF_NeighbourhoodGroupby = CombinedDF_NeighbourhoodGroupby.drop(columns=['id_x', 'id_y'], errors='ignore')

In [None]:
CombinedDF_NeighbourhoodGroupby

In [None]:
sentimentAvgandCount_by_neighbourhood=CombinedDF_NeighbourhoodGroupby.groupby('neighbourhood_cleansed').agg(avg_sentiment=('Sentiment','mean'),sentiment_count=('Sentiment','count')).reset_index()

In [None]:
sentimentAvgandCount_by_neighbourhood.sort_values(by='avg_sentiment', ascending=False).head(10)