In [101]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.io as pio

In [102]:
pio.templates.default = 'plotly_white'
BASE_DIR = Path.cwd().parent

In [115]:
def create_df(file_names, cols, drop_cols, merge_col='id'):
  dfs = [pd.read_csv(BASE_DIR / 'row_data' / file) for file in file_names]
  df = dfs[0].merge(dfs[1][cols], on=merge_col, how='inner').drop(drop_cols, axis=1)
  return df

In [116]:
def missing_values(df):
    nan = pd.DataFrame({
    'Missing Values': 
        df.isnull().sum().sort_values(ascending=False),
    'Missing Values, %': 
        (df.isnull().sum()
            .sort_values(ascending=False)/len(df) * 100)
            .astype(int)
        })
    return nan[nan['Missing Values'] > 0]

In [117]:
file_names = ['listings_short.csv', 'listings.csv']
cols = ['id', 'description', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value']
drop_cols = ['license', 'neighbourhood_group', 'minimum_nights', 
             'calculated_host_listings_count', 'availability_365']
df_listing = create_df(file_names, cols, drop_cols)
df_listing.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,...,reviews_per_month,number_of_reviews_ltm,description,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,32984,"City Hall, Stockholm",53396,Chris,Kungsholmens,59.32779,18.04998,Entire home/apt,1350,24,...,0.16,0,<b>The space</b><br />Beautiful very bright 43...,4.23,4.43,3.81,4.29,4.52,4.81,4.2
1,75590,Amazing nature location by a lake,397766,Peter,Skarpnäcks,59.30117,18.12833,Entire home/apt,949,10,...,0.09,0,"Apartment on the top floor, overlooking a lake...",4.8,5.0,4.89,4.89,5.0,4.78,4.78
2,164448,Double room in central Stockholm with Wi-Fi,784312,Li,Södermalms,59.31389,18.06087,Private room,829,358,...,2.52,34,I am renting out a nice double room on the top...,4.85,4.87,4.81,4.96,4.97,4.83,4.77
3,170651,Petit Charm Rooftop next to heaven,814021,Marie,Södermalms,59.31702,18.02946,Entire home/apt,828,42,...,0.3,3,My place is perfect for 1 person or mabey 2 if...,4.68,4.84,4.55,4.89,4.92,4.84,4.74
4,220851,One room in appartement,412283,Fredric,Kungsholmens,59.33351,18.03693,Private room,500,59,...,0.42,8,Welcome!<br /><br /><b>The space</b><br />Welc...,4.7,4.71,4.66,4.91,4.88,4.83,4.71


In [118]:
df_listing.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,2798.0,3500.0,2798.0,2780.0,2780.0,2780.0,2780.0,2780.0,2780.0
mean,2.477073e+17,135140300.0,59.318811,18.031954,1739.584286,26.758857,1.054982,8.879143,4.755425,4.804396,4.736216,4.852939,4.857327,4.800428,4.714601
std,3.442714e+17,148979600.0,0.03,0.063833,11324.378729,57.409084,1.468412,19.706462,0.537539,0.37006,0.424896,0.353636,0.344294,0.321867,0.385781
min,32984.0,4457.0,59.23195,17.77311,0.0,0.0,0.01,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,23033410.0,21072600.0,59.300792,18.004033,714.0,1.0,0.19,0.0,4.72,4.76,4.67,4.83,4.85,4.73,4.62
50%,44029510.0,64917660.0,59.3179,18.04954,1100.0,6.0,0.45,2.0,4.89,4.91,4.86,4.96,4.98,4.895,4.8
75%,6.482125e+17,220860400.0,59.33787,18.076097,1714.0,24.0,1.16,8.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,8.580712e+17,507582600.0,59.417742,18.16607,480500.0,976.0,18.94,571.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [119]:
missing_values(df_listing)

Unnamed: 0,Missing Values,"Missing Values, %"
review_scores_value,720,20
review_scores_location,720,20
review_scores_communication,720,20
review_scores_checkin,720,20
review_scores_cleanliness,720,20
review_scores_accuracy,720,20
reviews_per_month,702,20
review_scores_rating,702,20
last_review,702,20
description,102,2


In [120]:
df_listing[['name', 'description']] = df_listing[['name', 'description']].fillna('')
df_listing = df_listing.fillna(0)
df_listing['host_description'] = df_listing['name'] + ' ' + df_listing['description']
df_listing.drop(['name', 'description'], axis=1, inplace=True)

In [121]:
px.histogram(df_listing, x='price')

In [123]:
df_listing.query('price > 25000 or price == 0')

Unnamed: 0,id,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_description
755,19529551,137155339,Hillerodsgrand,Rinkeby-Tensta,59.39556,17.94929,Entire home/apt,480500,8,2019-08-01,0.12,0,5.0,5.0,5.0,4.71,4.71,4.71,4.86,Sunny apartment 27 mins to Centralen The entir...
1694,43012846,310620582,Brommavik Hotel,Bromma,59.35951,17.96011,Hotel room,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Brommavik Hotel
1796,45571018,368998845,Forenom,Bromma,59.33435,17.97866,Hotel room,0,3,2022-08-09,0.25,3,3.67,4.0,4.0,2.67,2.33,4.33,4.0,Forenom Aparthotel Stockholm Alvik
1797,45571095,368999421,Forenom,Bromma,59.3454,17.97001,Hotel room,0,4,2023-02-27,0.44,4,4.75,4.75,4.75,4.0,4.25,4.75,4.5,Forenom Aparthotel Stockholm Bromma
1803,45680148,370101820,Forenom,Norrmalms,59.33799,18.06619,Hotel room,0,8,2023-02-24,0.28,4,4.13,4.25,4.5,4.13,4.25,4.5,3.88,Forenom Serviced Apartments Stockholm Johannes...
1804,45730331,343919205,Forenom APH Stockholm Kista,Rinkeby-Tensta,59.40813,17.95183,Hotel room,0,5,2022-09-24,0.16,2,3.4,4.4,4.8,3.4,3.2,4.2,4.0,Forenom Aparthotel Stockholm Kista
2202,53198829,430664934,Camilla,Hässelby-Vällingby,59.37333,17.83749,Entire home/apt,450590,11,2022-12-11,0.73,5,5.0,5.0,5.0,5.0,5.0,4.64,5.0,2:a i Hässelby Fin liten lägenhet i lugnt områ...
3110,776549730665441181,46194379,Jakob,Kungsholmens,59.3238,18.0045,Entire home/apt,99999,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Helhet Take it easy at this unique and tranqui...
3453,849709777456195565,505907207,FaVillas,Södermalms,59.32347,18.07524,Entire home/apt,25427,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,stockss place with 1 bedroom stock''s place wi...
3454,849709953466213539,505907207,FaVillas,Södermalms,59.322931,18.073034,Entire home/apt,25427,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,stockss place with 1 bedroom stock''s place wi...


In [124]:
df_calendar = pd.read_csv(BASE_DIR / 'row_data' / 'calendar.csv')
df_calendar['price'] = df_calendar['price'].str.replace('[\$,]', '', regex=True)
df_calendar['price'].agg(['max', 'min'])

max    99999.00
min        0.00
Name: price, dtype: object

In [125]:
min_price_calend = df_calendar.groupby('listing_id')['price'].min().astype(float).to_dict()
df_listing['price'] = df_listing['id'].map(min_price_calend)
df_listing.query('price > 90000')

Unnamed: 0,id,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,number_of_reviews,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_description
3110,776549730665441181,46194379,Jakob,Kungsholmens,59.3238,18.0045,Entire home/apt,99999.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Helhet Take it easy at this unique and tranqui...


In [126]:
df_listing.drop(3110, axis=0, inplace=True)
df_listing.fillna(0, inplace=True)

In [127]:
px.histogram(df_listing, x='price')

In [128]:
df_listing.to_csv(Path(BASE_DIR / 'staging_data' / 'listing_cleaned.csv'), index=False)

In [144]:
df_reviews = pd.read_csv(Path(BASE_DIR / 'row_data' / 'reviews_2023_03.csv'), parse_dates=['date'])
df_host_reviews = (df_reviews.merge(df_listing, left_on='listing_id', right_on='id', how='inner')
                      .drop(['id_x', 'id_y'], axis=1))

In [145]:
df_host_reviews

Unnamed: 0,listing_id,date,reviewer_id,reviewer_name,comments,host_id,host_name,neighbourhood,latitude,longitude,...,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,host_description
0,32984,2010-09-06,200247,Keenan,Great little apartment in the perfect spot in ...,53396,Chris,Kungsholmens,59.32779,18.049980,...,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.20,"City Hall, Stockholm <b>The space</b><br />Bea..."
1,32984,2010-09-12,126543,Olaf,Nice flat in a great area. Chris sorted things...,53396,Chris,Kungsholmens,59.32779,18.049980,...,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.20,"City Hall, Stockholm <b>The space</b><br />Bea..."
2,32984,2011-05-21,294223,Max,"Great location, Chris was responsive by email ...",53396,Chris,Kungsholmens,59.32779,18.049980,...,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.20,"City Hall, Stockholm <b>The space</b><br />Bea..."
3,32984,2011-05-28,577190,Eli,Chris' place is super great and close to every...,53396,Chris,Kungsholmens,59.32779,18.049980,...,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.20,"City Hall, Stockholm <b>The space</b><br />Bea..."
4,32984,2011-06-11,521818,Jonathan,This was a \very nice apartment in a GREAT loc...,53396,Chris,Kungsholmens,59.32779,18.049980,...,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.20,"City Hall, Stockholm <b>The space</b><br />Bea..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93651,825983115454553546,2023-03-26,140580626,Kilian,Nous avons passé un super séjour chez Göran et...,6839482,Göran,Norrmalms,59.34468,18.032352,...,3.43,4,5.00,4.75,5.00,4.75,4.75,4.75,4.75,Room2stay Lev det enkla livet i detta fridfull...
93652,826818491548997858,2023-02-22,52396317,Ivar,Beautiful apartment. Central. Everything funct...,51002540,Fredrik,Östermalms,59.33784,18.081570,...,3.16,4,3.75,3.50,4.75,3.75,4.25,4.50,3.50,Spacious & luxurious 2br Apt on Östermalms-tor...
93653,826818491548997858,2023-03-02,183686765,Luc,Bonjour<br/>Appartement conforme au descriptif...,51002540,Fredrik,Östermalms,59.33784,18.081570,...,3.16,4,3.75,3.50,4.75,3.75,4.25,4.50,3.50,Spacious & luxurious 2br Apt on Östermalms-tor...
93654,826818491548997858,2023-03-10,112662633,Leonora,This apartment was absolutely beautiful! It wa...,51002540,Fredrik,Östermalms,59.33784,18.081570,...,3.16,4,3.75,3.50,4.75,3.75,4.25,4.50,3.50,Spacious & luxurious 2br Apt on Östermalms-tor...


In [146]:
le = LabelEncoder()
df_host_reviews['listing_id_encod'] = le.fit_transform(df_host_reviews['listing_id'])

In [147]:
df_host_reviews = df_host_reviews.iloc[:, np.r_[1, 24, 0, 2:7, 23, 7:23]]
df_host_reviews.head(1)

Unnamed: 0,date,listing_id_encod,listing_id,reviewer_id,reviewer_name,comments,host_id,host_name,host_description,neighbourhood,...,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2010-09-06,0,32984,200247,Keenan,Great little apartment in the perfect spot in ...,53396,Chris,"City Hall, Stockholm <b>The space</b><br />Bea...",Kungsholmens,...,2016-06-28,0.16,0,4.23,4.43,3.81,4.29,4.52,4.81,4.2


In [148]:
df_host_reviews.describe()

Unnamed: 0,listing_id_encod,listing_id,reviewer_id,host_id,latitude,longitude,price,number_of_reviews,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0,93656.0
mean,970.009877,4.739066e+16,133369500.0,95301740.0,59.319395,18.04902,1261.341409,149.890472,2.797927,40.848947,4.75469,4.8109,4.750516,4.861461,4.838456,4.826085,4.694639
std,665.72111,1.710297e+17,134726200.0,116442900.0,0.022302,0.04854,1088.596961,158.725379,2.711734,64.173547,0.226551,0.19699,0.25334,0.165376,0.201699,0.188714,0.208919
min,0.0,32984.0,81.0,4457.0,59.23195,17.79731,0.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,394.0,10222510.0,26234100.0,8547277.0,59.31176,18.03108,730.0,41.0,0.83,7.0,4.64,4.74,4.66,4.81,4.78,4.75,4.59
50%,862.0,24074860.0,81293980.0,46720320.0,59.32294,18.06604,1035.0,103.0,2.29,29.0,4.81,4.86,4.81,4.9,4.9,4.88,4.72
75%,1442.0,40344030.0,203061500.0,138864700.0,59.33407,18.07506,1450.0,200.0,3.97,56.0,4.9,4.93,4.91,4.95,4.96,4.95,4.83
max,2797.0,8.478345e+17,507030200.0,505546100.0,59.41744,18.16607,22995.0,976.0,18.94,571.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [149]:
df_host_reviews.describe(include='object')

Unnamed: 0,reviewer_name,comments,host_name,host_description,neighbourhood,room_type,last_review
count,93656,93654,93656,93656,93656,93656,93656
unique,19810,91384,1067,2784,14,4,752
top,Anna,.,Hedvig,Studio Apartment Double bed These studio apart...,Södermalms,Entire home/apt,2023-03-26
freq,576,151,4189,976,42496,70334,8027


In [150]:
missing_values(df_host_reviews)

Unnamed: 0,Missing Values,"Missing Values, %"
comments,2,0


In [151]:
df_host_reviews[df_host_reviews['comments'].isna() == True]

Unnamed: 0,date,listing_id_encod,listing_id,reviewer_id,reviewer_name,comments,host_id,host_name,host_description,neighbourhood,...,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
47701,2019-04-23,902,25051575,244070000,David,,26990407,Moa,Stor villa med sol hela dagen Stor villa på 17...,Skarpnäcks,...,2023-01-03,0.42,7,5.0,4.88,4.96,4.96,4.92,4.83,4.79
62929,2019-05-30,1090,31301631,263373046,Deepak,,362541894,ApartDirect,Superior Studio Apartment with Sofa Bed Locate...,Älvsjö,...,2023-03-13,1.63,24,4.38,4.52,4.34,4.85,4.46,4.41,4.41


In [152]:
df_host_reviews.dropna(subset=['comments'], inplace=True)

In [153]:
df_host_reviews['reviewer_id'].value_counts()

348992094    31
24622700     25
110641325    16
10805888     11
280949885    11
             ..
32023642      1
59702442      1
38577770      1
42229921      1
77261435      1
Name: reviewer_id, Length: 88204, dtype: int64

In [155]:
df_host_reviews.query('reviewer_id == 348992094').head(10)

Unnamed: 0,date,listing_id_encod,listing_id,reviewer_id,reviewer_name,comments,host_id,host_name,host_description,neighbourhood,...,last_review,reviews_per_month,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
53092,2020-11-30,1099,31347264,348992094,Roger,Bra läge och mycket smidig in och utcheckning,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmens,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
53097,2021-02-12,1099,31347264,348992094,Roger,Bra läge och alltid bra kommunikation. Allt fu...,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmens,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
53099,2021-03-20,1099,31347264,348992094,Roger,Bra läge och mycket bra boende,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmens,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
53132,2021-12-01,1099,31347264,348992094,Roger,Rent och fint. Allt man behöver finns. Lätt at...,8098890,Robin,# 12 Studiolägenhet utan fönster(no window) Ni...,Kungsholmens,...,2023-03-30,4.39,58,4.73,4.84,4.78,4.92,4.79,4.75,4.68
54397,2021-04-28,1062,30553531,348992094,Roger,Had yet another good stayover. Easy to find an...,8098890,Robin,# 3 Studiolägenhet (3 pers) Niro Hotel Apartme...,Kungsholmens,...,2023-03-19,4.26,72,4.78,4.85,4.86,4.91,4.87,4.8,4.78
54410,2021-08-03,1062,30553531,348992094,Roger,"Very nice place, clean and easy to check in an...",8098890,Robin,# 3 Studiolägenhet (3 pers) Niro Hotel Apartme...,Kungsholmens,...,2023-03-19,4.26,72,4.78,4.85,4.86,4.91,4.87,4.8,4.78
54468,2022-07-08,1062,30553531,348992094,Roger,Bra läge. Väldigt smidig in och utcheckning. A...,8098890,Robin,# 3 Studiolägenhet (3 pers) Niro Hotel Apartme...,Kungsholmens,...,2023-03-19,4.26,72,4.78,4.85,4.86,4.91,4.87,4.8,4.78
54786,2020-08-30,1063,30579247,348992094,Roger,"Mycket trevligt och centralt boende, smidig in...",8098890,Robin,# 2 Studiolägenhet (3 pers) Niro Hotel Apartme...,Kungsholmens,...,2023-03-28,5.51,91,4.76,4.85,4.78,4.87,4.79,4.75,4.73
55609,2021-02-23,1065,30685213,348992094,Roger,"Good location and nice, clean rooms, easy to c...",8098890,Robin,# 8 Studiolägenhet utan fönster (no window) Ni...,Kungsholmens,...,2023-03-27,3.44,53,4.79,4.91,4.84,4.89,4.82,4.76,4.72
55615,2021-05-31,1065,30685213,348992094,Roger,Bra läge med smidig in och utcheckning. Mycket...,8098890,Robin,# 8 Studiolägenhet utan fönster (no window) Ni...,Kungsholmens,...,2023-03-27,3.44,53,4.79,4.91,4.84,4.89,4.82,4.76,4.72


In [156]:
px.histogram(df_host_reviews, x='listing_id_encod')