<a href="https://colab.research.google.com/github/SallyPosey/DW_app/blob/main/Real_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 | Dataset Acquisition and Loading


In [29]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize

# Location of the Airbnb Broward CSV on GitHub; kept in a variable so the
# source of the data is easy to spot and change.
airbnb_csv_url = 'https://raw.githubusercontent.com/Lechagria/dataset_lechagria/refs/heads/main/Airbnb_Broward.csv'

# Read the remote CSV into the working DataFrame used by the rest of the notebook.
airbnb_df = pd.read_csv(airbnb_csv_url)

# 2 | Structural Investigation

In [30]:
# First look at the raw data: show the first five rows.
airbnb_df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,57818,50% OFF Monthly 32+days! Snowbird! Walk to be...,275948,Von Inc,,Hollywood,26.01859,-80.1243,Entire home/apt,250.0,31,60,2024-03-08,0.41,2,305,1,
1,69824,2 bd/2ba Oceanfront Condo,351303,Tracy,,Hallandale Beach,25.97641,-80.12052,Entire home/apt,140.0,30,4,2023-06-03,0.09,2,291,0,
2,83449,2 miles from the beach 2/1 sleeps 5 #204,454736,"Jon, Mary Pop Apartments",,Dania Beach,26.03392,-80.14201,Entire home/apt,130.0,7,33,2024-12-14,0.21,16,127,2,
3,129099,MIAMI - AMAZING APARTMENT OCEANVIEW,637272,Bianca,,Hollywood,25.99414,-80.11814,Entire home/apt,328.0,15,28,2024-03-14,0.17,5,365,1,
4,191160,Tropical Beach Paradise Awaits You,924482,Dan,,Deerfield Beach,26.30707,-80.07793,Entire home/apt,119.0,2,25,2019-11-08,0.16,1,118,0,


In [31]:
# Verify that 'id' works as a unique identifier: the count of distinct ids must
# match the number of rows. Prints True when it does, in which case the column
# is kept as is.
distinct_id_count = airbnb_df['id'].nunique()
row_count = airbnb_df.shape[0]
print(distinct_id_count == row_count)

True


In [32]:
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line.
airbnb_df.info()

# Summary statistics for the numeric columns, shown as the cell's rich output.
airbnb_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17167 entries, 0 to 17166
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              17167 non-null  int64  
 1   name                            17167 non-null  object 
 2   host_id                         17167 non-null  int64  
 3   host_name                       17166 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   17167 non-null  object 
 6   latitude                        17167 non-null  float64
 7   longitude                       17167 non-null  float64
 8   room_type                       17167 non-null  object 
 9   price                           15983 non-null  float64
 10  minimum_nights                  17167 non-null  int64  
 11  number_of_reviews               17167 non-null  int64  
 12  last_review                     

In [33]:
# Preview the first five rows again to review the current set of columns.
airbnb_df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,57818,50% OFF Monthly 32+days! Snowbird! Walk to be...,275948,Von Inc,,Hollywood,26.01859,-80.1243,Entire home/apt,250.0,31,60,2024-03-08,0.41,2,305,1,
1,69824,2 bd/2ba Oceanfront Condo,351303,Tracy,,Hallandale Beach,25.97641,-80.12052,Entire home/apt,140.0,30,4,2023-06-03,0.09,2,291,0,
2,83449,2 miles from the beach 2/1 sleeps 5 #204,454736,"Jon, Mary Pop Apartments",,Dania Beach,26.03392,-80.14201,Entire home/apt,130.0,7,33,2024-12-14,0.21,16,127,2,
3,129099,MIAMI - AMAZING APARTMENT OCEANVIEW,637272,Bianca,,Hollywood,25.99414,-80.11814,Entire home/apt,328.0,15,28,2024-03-14,0.17,5,365,1,
4,191160,Tropical Beach Paradise Awaits You,924482,Dan,,Deerfield Beach,26.30707,-80.07793,Entire home/apt,119.0,2,25,2019-11-08,0.16,1,118,0,


# 3 | Quality Investigation

## Dropping columns

In [34]:
# Columns that carry no usable information:
#   - 'neighbourhood_group' is completely empty (0 non-null values).
#   - 'license' is populated for only 6 rows (per the original note).
columns_to_drop = ['neighbourhood_group', 'license']

# errors='ignore' keeps this cell safe to re-run: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
airbnb_df = airbnb_df.drop(columns=columns_to_drop, errors='ignore')

## Handling Nulls
(and creating one new column, `neighborhood_mean_price`)

In [41]:
# 'last_review' is loaded from the CSV as text, and a mean date can only be
# computed on real datetimes. Parse it here so this cell works when the notebook
# is run top to bottom; parsing an already-datetime column leaves it unchanged.
airbnb_df['last_review'] = pd.to_datetime(airbnb_df['last_review'])

# Calculate the mean of 'last_review' (missing dates are skipped)
mean_last_review = airbnb_df['last_review'].mean()

# Fill missing values in 'last_review' with the mean using direct assignment
airbnb_df['last_review'] = airbnb_df['last_review'].fillna(mean_last_review)

# Calculate the mode of 'reviews_per_month' (first value if there are ties)
mode_reviews_per_month = airbnb_df['reviews_per_month'].mode()[0]

# Fill NaN values in 'reviews_per_month' with the mode using direct assignment
airbnb_df['reviews_per_month'] = airbnb_df['reviews_per_month'].fillna(mode_reviews_per_month)

# There is a single null in the host_name column; fill it with a placeholder name
airbnb_df['host_name'] = airbnb_df['host_name'].fillna("Dixie")

# Mean of the observed prices over the whole dataset, taken before any
# imputation so that filled-in values do not influence it.
overall_mean_price = airbnb_df['price'].mean()

# Group by 'neighbourhood' and calculate the mean 'price' for each group
neighborhood_mean_prices = airbnb_df.groupby('neighbourhood')['price'].mean()

# Create a new column to store the mean price, by neighbourhood
airbnb_df['neighborhood_mean_price'] = airbnb_df['neighbourhood'].map(neighborhood_mean_prices)

# Fill NaN values in 'price' with the corresponding 'neighborhood_mean_price'
airbnb_df['price'] = airbnb_df['price'].fillna(airbnb_df['neighborhood_mean_price'])

# A neighbourhood with no observed price at all has a NaN group mean, which
# would leave its rows unfilled; fall back to the overall mean for those rows.
airbnb_df['price'] = airbnb_df['price'].fillna(overall_mean_price)


## Fixing data types

In [35]:
# Make sure 'last_review' holds datetime values rather than text.
last_review_as_dates = pd.to_datetime(airbnb_df['last_review'])
airbnb_df['last_review'] = last_review_as_dates


## Identifying and Treating Outliers in the 'Price' Column

In [46]:
# Which price corresponds to each of the upper percentiles (90th through 99th)?
percentiles = list(range(90, 100))

# quantile() expects fractions in [0, 1], so scale the percentiles down.
quantile_levels = [pct / 100 for pct in percentiles]
percentile_values = airbnb_df['price'].quantile(q=quantile_levels)

# Table pairing each percentile with its price; the index is the quantile level.
percentile_df = pd.DataFrame(
    {
        'Percentile': percentiles,
        'Price': percentile_values,
    }
)

percentile_df

Unnamed: 0,Percentile,Price
0.9,90,653.0
0.91,91,693.0
0.92,92,745.0
0.93,93,800.0
0.94,94,867.04
0.95,95,950.7
0.96,96,1034.36
0.97,97,1217.02
0.98,98,1509.4
0.99,99,2202.74


In [48]:
# Print the frame's column names, non-null counts and dtypes to review the
# current state of the data.
airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17167 entries, 0 to 17166
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              17167 non-null  int64         
 1   name                            17167 non-null  object        
 2   host_id                         17167 non-null  int64         
 3   host_name                       17167 non-null  object        
 4   neighbourhood                   17167 non-null  object        
 5   latitude                        17167 non-null  float64       
 6   longitude                       17167 non-null  float64       
 7   room_type                       17167 non-null  object        
 8   price                           17167 non-null  float64       
 9   minimum_nights                  17167 non-null  int64         
 10  number_of_reviews               17167 non-null  int64         
 11  la

# 4 | Data Integration

# 5 | Data Binning

In [None]:
# Bin edges (in days available per year) and the label for each interval.
bins = [0, 30, 90, 180, 365]
labels = ['Rarely Available', 'Seasonal', 'Half-Year', 'Year-Round']

# pd.cut builds right-closed intervals and, by default, leaves the first one open
# on the left: (0, 30]. A listing with availability_365 == 0 would then fall
# outside every bin and become NaN. include_lowest=True closes the first
# interval to [0, 30], so those listings are labelled 'Rarely Available'.
airbnb_df['availability_cat'] = pd.cut(
    airbnb_df['availability_365'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
airbnb_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,availability_cat
0,57818,50% OFF Monthly 32+days! Snowbird! Walk to be...,275948,Von Inc,Hollywood,26.01859,-80.1243,Entire home/apt,250.0,31,60,2024-03-08,0.41,2,305,1,,Year-Round
1,69824,2 bd/2ba Oceanfront Condo,351303,Tracy,Hallandale Beach,25.97641,-80.12052,Entire home/apt,140.0,30,4,2023-06-03,0.09,2,291,0,,Year-Round
2,83449,2 miles from the beach 2/1 sleeps 5 #204,454736,"Jon, Mary Pop Apartments",Dania Beach,26.03392,-80.14201,Entire home/apt,130.0,7,33,2024-12-14,0.21,16,127,2,,Half-Year
3,129099,MIAMI - AMAZING APARTMENT OCEANVIEW,637272,Bianca,Hollywood,25.99414,-80.11814,Entire home/apt,328.0,15,28,2024-03-14,0.17,5,365,1,,Year-Round
4,191160,Tropical Beach Paradise Awaits You,924482,Dan,Deerfield Beach,26.30707,-80.07793,Entire home/apt,119.0,2,25,2019-11-08,0.16,1,118,0,,Half-Year


# 6 | LAMBDA Function

In [None]:
# Label each listing by price tier: below 100 is 'Low', from 100 up to (but not
# including) 250 is 'Mid', and everything else is 'High'.
airbnb_df['price_category'] = airbnb_df['price'].apply(
    lambda nightly_price: 'Low' if nightly_price < 100
    else 'Mid' if nightly_price < 250
    else 'High'
)


In [None]:
# Show the first five rows to check the newly added column.
airbnb_df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,availability_cat,price_category
0,57818,50% OFF Monthly 32+days! Snowbird! Walk to be...,275948,Von Inc,Hollywood,26.01859,-80.1243,Entire home/apt,250.0,31,60,2024-03-08,0.41,2,305,1,,Year-Round,High
1,69824,2 bd/2ba Oceanfront Condo,351303,Tracy,Hallandale Beach,25.97641,-80.12052,Entire home/apt,140.0,30,4,2023-06-03,0.09,2,291,0,,Year-Round,Mid
2,83449,2 miles from the beach 2/1 sleeps 5 #204,454736,"Jon, Mary Pop Apartments",Dania Beach,26.03392,-80.14201,Entire home/apt,130.0,7,33,2024-12-14,0.21,16,127,2,,Half-Year,Mid
3,129099,MIAMI - AMAZING APARTMENT OCEANVIEW,637272,Bianca,Hollywood,25.99414,-80.11814,Entire home/apt,328.0,15,28,2024-03-14,0.17,5,365,1,,Year-Round,High
4,191160,Tropical Beach Paradise Awaits You,924482,Dan,Deerfield Beach,26.30707,-80.07793,Entire home/apt,119.0,2,25,2019-11-08,0.16,1,118,0,,Half-Year,Mid


# 7 | Feature Engineering

In [None]:
from datetime import datetime

# Whole days elapsed between the moment this cell runs and each listing's most
# recent review. NOTE: the result depends on the current clock, so the values
# change from one run to the next.
review_dates = pd.to_datetime(airbnb_df['last_review'])
time_since_review = datetime.now() - review_dates
airbnb_df['last_review_days_ago'] = time_since_review.dt.days

airbnb_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,availability_cat,price_category,popularity_score,last_review_days_ago
0,57818,50% OFF Monthly 32+days! Snowbird! Walk to be...,275948,Von Inc,Hollywood,26.01859,-80.1243,Entire home/apt,250.0,31,...,2024-03-08,0.41,2,305,1,,Year-Round,High,125.05,351.0
1,69824,2 bd/2ba Oceanfront Condo,351303,Tracy,Hallandale Beach,25.97641,-80.12052,Entire home/apt,140.0,30,...,2023-06-03,0.09,2,291,0,,Year-Round,Mid,26.19,630.0
2,83449,2 miles from the beach 2/1 sleeps 5 #204,454736,"Jon, Mary Pop Apartments",Dania Beach,26.03392,-80.14201,Entire home/apt,130.0,7,...,2024-12-14,0.21,16,127,2,,Half-Year,Mid,26.67,70.0
3,129099,MIAMI - AMAZING APARTMENT OCEANVIEW,637272,Bianca,Hollywood,25.99414,-80.11814,Entire home/apt,328.0,15,...,2024-03-14,0.17,5,365,1,,Year-Round,High,62.05,345.0
4,191160,Tropical Beach Paradise Awaits You,924482,Dan,Deerfield Beach,26.30707,-80.07793,Entire home/apt,119.0,2,...,2019-11-08,0.16,1,118,0,,Half-Year,Mid,18.88,1933.0
