In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('listings.db')


In [2]:
df = pd.read_csv('airbnb.csv')
df.to_sql('listings', conn, if_exists='append', index=False)

6893

In [3]:
import sqlalchemy
sqlalchemy.create_engine('sqlite:///listings.db')

Engine(sqlite:///listings.db)

In [4]:
%load_ext sql

In [5]:
%%capture 
%load_ext sql 
%sql sqlite:///listings.db

In [6]:
# %%sql

# SELECT * FROM listings
# LIMIT 10

In [7]:
%%sql

SELECT sql FROM sqlite_master
WHERE tbl_name = 'listings' AND type = 'table'

 * sqlite:///listings.db
Done.


sql
"CREATE TABLE ""listings"" ( ""id"" INTEGER,  ""listing_url"" TEXT,  ""scrape_id"" INTEGER,  ""last_scraped"" TEXT,  ""source"" TEXT,  ""name"" TEXT,  ""description"" TEXT,  ""neighborhood_overview"" TEXT,  ""picture_url"" TEXT,  ""host_id"" INTEGER,  ""host_url"" TEXT,  ""host_name"" TEXT,  ""host_since"" TEXT,  ""host_location"" TEXT,  ""host_about"" TEXT,  ""host_response_time"" TEXT,  ""host_response_rate"" TEXT,  ""host_acceptance_rate"" TEXT,  ""host_is_superhost"" TEXT,  ""host_thumbnail_url"" TEXT,  ""host_picture_url"" TEXT,  ""host_neighbourhood"" TEXT,  ""host_listings_count"" INTEGER,  ""host_total_listings_count"" INTEGER,  ""host_verifications"" TEXT,  ""host_has_profile_pic"" TEXT,  ""host_identity_verified"" TEXT,  ""neighbourhood"" TEXT,  ""neighbourhood_cleansed"" TEXT,  ""neighbourhood_group_cleansed"" REAL,  ""latitude"" REAL,  ""longitude"" REAL,  ""property_type"" TEXT,  ""room_type"" TEXT,  ""accommodates"" INTEGER,  ""bathrooms"" REAL,  ""bathrooms_text"" TEXT,  ""bedrooms"" REAL,  ""beds"" REAL,  ""amenities"" TEXT,  ""price"" TEXT,  ""minimum_nights"" INTEGER,  ""maximum_nights"" INTEGER,  ""minimum_minimum_nights"" REAL,  ""maximum_minimum_nights"" REAL,  ""minimum_maximum_nights"" REAL,  ""maximum_maximum_nights"" REAL,  ""minimum_nights_avg_ntm"" REAL,  ""maximum_nights_avg_ntm"" REAL,  ""calendar_updated"" REAL,  ""has_availability"" TEXT,  ""availability_30"" INTEGER,  ""availability_60"" INTEGER,  ""availability_90"" INTEGER,  ""availability_365"" INTEGER,  ""calendar_last_scraped"" TEXT,  ""number_of_reviews"" INTEGER,  ""number_of_reviews_ltm"" INTEGER,  ""number_of_reviews_l30d"" INTEGER,  ""first_review"" TEXT,  ""last_review"" TEXT,  ""review_scores_rating"" REAL,  ""review_scores_accuracy"" REAL,  ""review_scores_cleanliness"" REAL,  ""review_scores_checkin"" REAL,  ""review_scores_communication"" REAL,  ""review_scores_location"" REAL,  ""review_scores_value"" REAL,  ""license"" TEXT,  ""instant_bookable"" TEXT,  ""calculated_host_listings_count"" INTEGER,  ""calculated_host_listings_count_entire_homes"" INTEGER,  ""calculated_host_listings_count_private_rooms"" INTEGER,  ""calculated_host_listings_count_shared_rooms"" INTEGER,  ""reviews_per_month"" REAL )"


Распределение количества спален

In [8]:
%%sql

SELECT bedrooms, COUNT(*) as apartm_count FROM listings
GROUP BY bedrooms

 * sqlite:///listings.db
Done.


bedrooms,apartm_count
,5040
1.0,64064
2.0,25568
3.0,10816
4.0,3808
5.0,736
6.0,96
7.0,16
8.0,64
10.0,64


In [9]:
%%sql

SELECT accommodates, COUNT(*) as apartm_count FROM listings
GROUP BY accommodates

 * sqlite:///listings.db
Done.


accommodates,apartm_count
0,80
1,2736
2,56976
3,8304
4,34960
5,2352
6,3296
7,480
8,432
9,48


In [10]:
%%sql

update listings
set price = CAST (replace(price, '$', '') as INTEGER)

 * sqlite:///listings.db
110288 rows affected.


[]

In [17]:
%%sql


CREATE TEMP VIEW IF NOT EXISTS listings_price_rounded
AS     select *, ROUND(price/10.0)*10 as price_rounded from listings;



 * sqlite:///listings.db
Done.


[]

In [20]:
%%sql

DROP VIEW IF EXISTS price_groups;
CREATE TEMP VIEW IF NOT EXISTS price_groups
AS     
SELECT price_rounded, COUNT(*) as aprt_cnt,
 ROUND(CAST(COUNT(*) as float) / (SELECT COUNT(*) as total_cnt FROM listings), 2) as ratio_of_total
    FROM listings_price_rounded
    GROUP BY price_rounded
    
    



 * sqlite:///listings.db
Done.
Done.


[]

In [21]:
%%sql

select * from price_groups
ORDER BY aprt_cnt DESC
LIMIT 5

 * sqlite:///listings.db
Done.


price_rounded,aprt_cnt,ratio_of_total
150.0,6928,0.06
200.0,6816,0.06
100.0,6416,0.06
130.0,5584,0.05
180.0,5072,0.05


Самые популярные ценовые категории: `$150`,  `$200`, `$100`

In [23]:
%%sql

select *, sum(ratio_of_total) OVER () as sum_ratio from price_groups
WHERE price_rounded BETWEEN 100 AND 200
ORDER BY price_rounded ASC


 * sqlite:///listings.db
Done.


price_rounded,aprt_cnt,ratio_of_total,sum_ratio
100.0,6416,0.06,0.49
110.0,4112,0.04,0.49
120.0,4864,0.04,0.49
130.0,5584,0.05,0.49
140.0,4672,0.04,0.49
150.0,6928,0.06,0.49
160.0,3792,0.03,0.49
170.0,3024,0.03,0.49
180.0,5072,0.05,0.49
190.0,3408,0.03,0.49


Квартиры от `$100` до `$200` составляют половину предложений на сайте.

In [24]:
%%sql
DROP VIEW IF EXISTS revs_cnt_rounded;
CREATE TEMP VIEW IF NOT EXISTS revs_cnt_rounded
AS  
SELECT ROUND(number_of_reviews/100.0)*100 as rev_cnt_rounded
FROM listings;

 * sqlite:///listings.db
Done.
Done.


[]

In [35]:
%%sql

with reviews_cnt_distr
AS (
SELECT rev_cnt_rounded, COUNT(*) as aprtm_cnt FROM revs_cnt_rounded
GROUP BY rev_cnt_rounded
ORDER BY aprtm_cnt DESC
)

SELECT *, (aprtm_cnt+0.0)/(SUM(aprtm_cnt) OVER ()) as ratio_of_total FROM reviews_cnt_distr


 * sqlite:///listings.db
Done.


rev_cnt_rounded,aprtm_cnt,ratio_of_total
0.0,83952,0.7612070216161323
100.0,16192,0.1468156100391701
200.0,4832,0.0438125634701871
300.0,2880,0.0261134484259393
400.0,1232,0.0111707529377629
500.0,592,0.0053677643986653
600.0,256,0.002321195415639
700.0,160,0.0014507471347744
800.0,64,0.0005802988539097
1100.0,32,0.0002901494269548


Чаще всего встречается около 100 ревью о квартире.

76% объявлений имеют меньше 50 ревью.

(Примечание: rev_cnt_rounded - значение, полученное округлением до сотен)

Более детально. 
Насколько много обьявлений имеют меньше 5 ревью?

In [58]:
%%sql
with rev_cnt_distr 
AS
(
    SELECT number_of_reviews , count(*) as aprtm_cnt FROM listings
    GROUP BY  number_of_reviews
),

ratios
as
(SELECT *, (aprtm_cnt+0.0)/(SUM (aprtm_cnt) OVER ()) as ratio_from_total   
FROM rev_cnt_distr
ORDER BY number_of_reviews)

SELECT *, SUM(ratio_from_total) OVER (ORDER BY number_of_reviews) as ratio_cum_sum 
FROM ratios
LIMIT 10





 * sqlite:///listings.db
Done.


number_of_reviews,aprtm_cnt,ratio_from_total,ratio_cum_sum
0,10336,0.0937182649064268,0.0937182649064268
1,6176,0.0559988394022921,0.1497171043087189
2,5200,0.0471492818801682,0.1968663861888872
3,4800,0.0435224140432322,0.2403888002321195
4,4480,0.0406209197736834,0.281009720005803
5,3808,0.0345277818076309,0.3155375018134339
6,3168,0.0287247932685332,0.3442622950819672
7,2960,0.0268388219933265,0.3711011170752937
8,2368,0.0214710575946612,0.392572174669955
9,2016,0.0182794138981575,0.4108515885681125


20% объявлений имеют не больше 2 ревью

25% объявлений имеют не больше 3 ревью

28% объявлений имеют меньше 5 ревью