# Airbnb Data Warehousing Data Transformation Script

## Set Up

### Install Required Modules

In [1]:
import pandas as pd


### Read in files

In [2]:
reviews = pd.read_csv("data/reviews.csv")
listings = pd.read_csv("data/listings.csv")

### Getting General Information on Listings Table

In [22]:
listings.info(max_cols=None)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2668 entries, 0 to 2667
Data columns (total 69 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   id                                            2668 non-null   int64         
 1   listing_url                                   2668 non-null   object        
 2   scrape_id                                     2668 non-null   int64         
 3   last_scraped                                  2668 non-null   datetime64[ns]
 4   source                                        2668 non-null   object        
 5   name                                          2668 non-null   object        
 6   neighborhood_overview                         1826 non-null   object        
 7   picture_url                                   2668 non-null   object        
 8   host_id                                       2668 non-null   int64 

### Dropping redundant/empty columns

In [4]:
# These two columns are the same, we can drop one of them
print(pd.to_datetime(listings["last_scraped"]).equals(pd.to_datetime(listings["calendar_last_scraped"])))

# Others have no non-null values, let's remove all the uneccessary fluff

listings = listings.drop("calendar_last_scraped", axis=1)
listings = listings.drop("description",axis=1)
listings = listings.drop("calendar_updated",axis=1)
listings = listings.drop("bedrooms",axis=1)
listings = listings.drop("bathrooms",axis=1)
listings = listings.drop("neighbourhood_group_cleansed",axis=1)



True


### Setting datetime objects to datetime

In [5]:
# Changing the datetime columns to the datetime data type
listings.last_scraped = pd.to_datetime(listings.last_scraped)
listings.host_since = pd.to_datetime(listings.host_since)


## Making all the dimension tables

### Starting Off With Host Dimension Tables

In [6]:
host_df = listings[
    [
        "host_id",
        "host_url",
        "host_name",
        "host_since",
        "host_location",
        "host_about",
        "host_thumbnail_url",
        "host_picture_url",
        "host_neighbourhood",
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
        "host_is_superhost",
        "host_listings_count",
        "host_total_listings_count",
        "host_verifications",
        "host_has_profile_pic",
        "host_identity_verified",
        "calculated_host_listings_count",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
    ]
]

#### The Dimensions of the Hosts Table

In [7]:
host_ld_df = host_df[
    [
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
        "host_is_superhost",
        "host_listings_count",
        "host_total_listings_count",
        "host_verifications",
        "host_has_profile_pic",
        "host_identity_verified",
    ]
]

hqad_df = host_df[
    [
        "calculated_host_listings_count",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
    ]
]

#### Giving Each Host Dimension A Primary Key

In [8]:
host_ld_df["listing_diagnostics_id"] = host_ld_df.index
hqad_df["hqad_id"] = hqad_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  host_ld_df["listing_diagnostics_id"] = host_ld_df.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hqad_df["hqad_id"] = hqad_df.index


#### And Then Reorganizing so the ID is The First Column of Each Host Dimension

In [9]:
host_ld_df = host_ld_df[
    [
        "listing_diagnostics_id",
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
        "host_is_superhost",
        "host_listings_count",
        "host_total_listings_count",
        "host_verifications",
        "host_has_profile_pic",
        "host_identity_verified",
    ]
]

hqad_df = hqad_df[
    [
        "hqad_id",
        "calculated_host_listings_count",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
    ]
]

### We'll Be Doing What We Just Did To The Other Tables

### Next up, property dimension table

In [10]:
property_df = listings[[
"latitude", 
"longitude", 
"property_type", 
"room_type", 
"accommodates", 
"bathrooms_text", 
"beds", 
"amenities", 
"price", 
]]

In [11]:
# Set the index
property_df["property_id"] = property_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  property_df["property_id"] = property_df.index


In [12]:
property_df = property_df[
    [
    "property_id", 
    "latitude", 
    "longitude", 
    "property_type", 
    "room_type", 
    "accommodates", 
    "bathrooms_text", 
    "beds", 
    "amenities", 
    "price", 
    ]
]

### Now the reviews_diagnostics Dimension Table

In [13]:
reviews_diagnostics_df = listings[
    [
        "number_of_reviews", 
        "number_of_reviews_ltm", 
        "number_of_reviews_l30d", 
        "first_review", 
        "last_review", 
        "review_scores_rating", 
        "review_scores_accuracy", 
        "review_scores_cleanliness", 
        "review_scores_checkin", 
        "review_scores_communication", 
        "review_scores_location", 
        "review_scores_value", 
        "reviews_per_month",
    ]
]

In [14]:
reviews_diagnostics_df["rev_diag_id"] = reviews_diagnostics_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_diagnostics_df["rev_diag_id"] = reviews_diagnostics_df.index


In [15]:
reviews_diagnostics_df = reviews_diagnostics_df[
    [
        "rev_diag_id",
        "number_of_reviews", 
        "number_of_reviews_ltm", 
        "number_of_reviews_l30d", 
        "first_review", 
        "last_review", 
        "review_scores_rating", 
        "review_scores_accuracy", 
        "review_scores_cleanliness", 
        "review_scores_checkin", 
        "review_scores_communication", 
        "review_scores_location", 
        "review_scores_value", 
        "reviews_per_month",
    ]
]

### Scrapings Dimension Table

In [16]:
scrapings_df = listings[
    [
        "scrape_id",
        "last_scraped",
        "source" 
    ]
]

In [17]:
scrapings_df["scrapings_id"] = scrapings_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scrapings_df["scrapings_id"] = scrapings_df.index


In [18]:
scrapings_df = scrapings_df[
    [
        "scrapings_id",
        "scrape_id",
        "last_scraped",
        "source" 
    ]
]

### Neighbourhood Dimension

In [19]:
neighbourhood_df = listings[
    [
        "neighbourhood",
        "neighborhood_overview",
        "neighbourhood_cleansed",
    ]
]

#### Fixing Spelling Error and Assigning ID

In [20]:
# Fixing Typo
neighbourhood_df["neighbourhood_overview"] = neighbourhood_df["neighborhood_overview"]
neighbourhood_df.drop("neighborhood_overview", axis=1, inplace=True)

# Assigning ID
neighbourhood_df["neighbourhood_id"] = neighbourhood_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbourhood_df["neighbourhood_overview"] = neighbourhood_df["neighborhood_overview"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbourhood_df.drop("neighborhood_overview", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighbourhood_df["neighbourhood_id"] = neighbourhood_df.index


In [21]:
neighbourhood_df = neighbourhood_df[
    [
        "neighbourhood_id",
        "neighbourhood",
        "neighbourhood_overview",
        "neighbourhood_cleansed",
    ]
]

### MinMax_Insights

In [24]:
minmax_insights_df = listings[
    [
        "maximum_nights",
        "minimum_minimum_nights",
        "maximum_minimum_nights",
        "minimum_maximum_nights",
        "maximum_maximum_nights",
        "minimum_nights_avg_ntm",
        "maximum_nights_avg_ntm",
    ]
]

In [25]:
minmax_insights_df["minmax_insights_id"] = minmax_insights_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  minmax_insights_df["minmax_insights_id"] = minmax_insights_df.index


In [26]:
minmax_insights_df = minmax_insights_df[
    [
        "minmax_insights_id",
        "maximum_nights",
        "minimum_minimum_nights",
        "maximum_minimum_nights",
        "minimum_maximum_nights",
        "maximum_maximum_nights",
        "minimum_nights_avg_ntm",
        "maximum_nights_avg_ntm",
    ]
]

In [27]:
availibility_df = listings[
    [
        "has_availability",
        "availability_30",
        "availability_60",
        "availability_90",
        "availability_365",
    ]
]

In [28]:
availibility_df["avail_id"] = availibility_df.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  availibility_df["avail_id"] = availibility_df.index


In [29]:
availibility_df = availibility_df[
    [
        "avail_id",
        "has_availability",
        "availability_30",
        "availability_60",
        "availability_90",
        "availability_365",
    ]
]

In [32]:
pd.set_option("display.max_columns", None)
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,90676,https://www.airbnb.com/rooms/90676,20231225202549,2023-12-26,city scrape,Home in Columbus · ★4.82 · 3 bedrooms · 3 beds...,The Short North Italianate Cottage is located ...,https://a0.muscache.com/pictures/950e43cd-53f3...,483306,https://www.airbnb.com/users/show/483306,Audra & Lacey,2011-04-04,"Columbus, OH","Active, young professionals who love to travel...",within an hour,100%,100%,t,https://a0.muscache.com/im/users/483306/profil...,https://a0.muscache.com/im/users/483306/profil...,,3,3,"['email', 'phone', 'work_email']",t,t,"Columbus, Ohio, United States",Near North/University,39.98366,-83.00252,Entire home,Entire home/apt,6,2 baths,3.0,[],$132.00,1,365,1,1,365,365,1.0,365.0,t,0,0,0,0,726,101,9,2011-10-11,2023-12-17,4.82,4.85,4.82,4.93,4.88,4.93,4.77,2022-2475,f,3,3,0,0,4.88
1,543140,https://www.airbnb.com/rooms/543140,20231225202549,2023-12-25,city scrape,Home in Columbus · ★4.70 · 1 bedroom · 1 bed ·...,We are close to a lot of things!,https://a0.muscache.com/pictures/e720cdf0-e36b...,2350409,https://www.airbnb.com/users/show/2350409,Edward,2012-05-11,"Columbus, OH","Hello, hello.\r\n\r\nAbout me: pretty easy goi...",within an hour,90%,100%,t,https://a0.muscache.com/im/pictures/user/7d46e...,https://a0.muscache.com/im/pictures/user/7d46e...,,3,4,"['email', 'phone', 'work_email']",t,f,"Columbus, Ohio, United States",Near North/University,40.01114,-83.01005,Private room in home,Private room,1,1 shared bath,1.0,[],$29.00,7,1125,7,7,1125,1125,7.0,1125.0,t,16,25,52,327,133,10,1,2012-07-31,2023-12-09,4.7,4.75,4.33,4.93,4.89,4.77,4.79,2019-1344,f,3,0,3,0,0.96
2,591101,https://www.airbnb.com/rooms/591101,20231225202549,2023-12-26,city scrape,Loft in Columbus · ★4.92 · 1 bedroom · 1 bed ·...,A historic neighborhood of beautiful victorian...,https://a0.muscache.com/pictures/32b28442-ddf3...,2889677,https://www.airbnb.com/users/show/2889677,Gail,2012-07-10,"Columbus, OH","My husband Eric and I are both artists, sharin...",within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/e7975...,https://a0.muscache.com/im/pictures/user/e7975...,,1,1,"['email', 'phone']",t,f,"Columbus, Ohio, United States",Near East,39.96041,-82.98005,Private room in loft,Private room,2,1 private bath,1.0,[],$110.00,2,30,2,2,1125,1125,2.0,1125.0,t,0,0,0,0,296,19,0,2012-08-10,2023-11-12,4.92,4.93,4.93,4.96,4.91,4.89,4.88,2019-1230,f,1,0,1,0,2.14
3,923248,https://www.airbnb.com/rooms/923248,20231225202549,2023-12-25,city scrape,Hostel in Columbus · ★4.74 · 1 bedroom · 5 bed...,We are located in the vibrant University Distr...,https://a0.muscache.com/pictures/29aabf51-4e6f...,4965048,https://www.airbnb.com/users/show/4965048,Mathew,2013-02-04,"Pāhoa, HI",I'm an outgoing type who values the experience...,within an hour,100%,99%,t,https://a0.muscache.com/im/pictures/user/User-...,https://a0.muscache.com/im/pictures/user/User-...,Hilo,24,25,"['email', 'phone', 'work_email']",t,t,"Columbus, Ohio, United States",Near North/University,40.01259,-83.00164,Shared room in hostel,Shared room,1,3 shared baths,5.0,[],$30.00,1,1125,1,1,1,1125,1.0,74.7,t,27,57,80,349,348,54,0,2013-02-26,2023-11-23,4.74,4.79,4.73,4.88,4.9,4.68,4.84,2019-1314,f,8,2,4,2,2.64
4,927867,https://www.airbnb.com/rooms/927867,20231225202549,2023-12-25,city scrape,Hostel in Columbus · ★4.72 · 1 bedroom · 1 bed...,We are located in the vibrant University Distr...,https://a0.muscache.com/pictures/08033ebe-286c...,4965048,https://www.airbnb.com/users/show/4965048,Mathew,2013-02-04,"Pāhoa, HI",I'm an outgoing type who values the experience...,within an hour,100%,99%,t,https://a0.muscache.com/im/pictures/user/User-...,https://a0.muscache.com/im/pictures/user/User-...,Hilo,24,25,"['email', 'phone', 'work_email']",t,t,"Columbus, Ohio, United States",Near North/University,40.01259,-83.00164,Private room in hostel,Private room,2,3 shared baths,1.0,[],$73.00,1,365,1,1,30,365,1.0,38.5,t,20,47,70,336,76,9,0,2013-09-11,2023-10-28,4.72,4.84,4.55,4.89,4.91,4.7,4.74,2019-1314,f,8,2,4,2,0.61
