In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split

from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [2]:
import re

from scipy import sparse

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [4]:
train_path = 'data/train.csv'
test_path = 'data/test.csv'
review_path = 'data/reviews.csv'
calendar_path = 'data/calendar.csv'

In [5]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
review_df = pd.read_csv(review_path)
calendar_df = pd.read_csv(calendar_path)

In [6]:
calendar_df['date'] = pd.to_datetime(calendar_df['date'])
max_date = calendar_df['date'].max()
max_date

Timestamp('2019-11-06 00:00:00')

In [7]:
review_df['date'] = pd.to_datetime(review_df['date'])
review_df['days'] = (max_date - review_df['date']).apply(lambda x: x.days)

# all dataset agg

In [8]:
train_df.loc[train_df.price <= 10, 'price'] = 10
# train_df.loc[train_df.price >= 1000, 'price'] = 1000

full_df = pd.concat([train_df, test_df])
full_df['host_since'] = pd.to_datetime(full_df['host_since'])
full_df['host_active_days'] = (max_date - full_df['host_since']).apply(lambda x: x.days)
host_list_count = full_df.groupby('host_id')['id'].count().rename('host_id_counts').reset_index()

mid_active_days = full_df['host_active_days'].median()
mid_bathrooms = full_df['bathrooms'].median()
mid_bedrooms = full_df['bedrooms'].median()
mid_beds = full_df['beds'].median()
mid_active_days, mid_bathrooms, mid_bedrooms, mid_beds 

(1594.0, 1.0, 1.0, 1.0)

# name description

In [9]:
stop_words = set(stopwords.words('english'))
stemmer = WordNetLemmatizer()

In [10]:
def clean_text(text):
    # Converting to Lowercase
    text = text.lower()
    # Remove all the special characters
    text = re.sub(r'\W', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # remove word less then 2 characters
    text = re.sub(r'\b\w{1,2}\b', '', text)

    tokens = word_tokenize(text)
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [11]:
y = train_df.price
X = train_df.drop('price', axis=1)

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=23)

In [13]:
def lr_data_prepare(data_df, vect, f_train=True):
    amenities = (data_df['amenities'].str.replace('"', '').str.replace('{', '')
                 .str.replace('}', '').str.replace(' ', ''))
    union_text = (
        data_df['name'].fillna('unknown_name')
        + ' ' + data_df['description'].fillna('unknown_desc') 
        + ' ' + data_df['space'].fillna('unknown_space')
        + ' ' + data_df['access'].fillna('unknown_acess') 
        + ' ' + data_df['transit'].fillna('unknown_trans')
        + ' ' + data_df['neighborhood_overview'].fillna('unknown_neigh')
        + ' ' + amenities.fillna('unknown_amen')
        + ' ' + data_df['house_rules'].fillna('unknown_rules')
        + ' ' + data_df['interaction'].fillna('unknown_inter')
        + ' ' + data_df['notes'].fillna('unknown_note')
        + ' ' + data_df['host_about'].fillna('unknown_about')
    ).apply(lambda x: clean_text(x))

    
    if f_train:
        vect.fit(union_text)
    data_sparse = vect.transform(union_text)

    return data_sparse

In [14]:
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [16]:
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.7, max_features=20_000)

In [670]:
train_sparse = lr_data_prepare(X_train, vectorizer, f_train=True)

In [685]:
lr_reg = Ridge(alpha=4, random_state=23)
lr_reg.fit(train_sparse, np.log1p(y_train))

Ridge(alpha=4, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=23, solver='auto', tol=0.001)

In [675]:
test_sparse = lr_data_prepare(X_valid,  vectorizer, f_train=False)

In [686]:
mape(y_valid, np.expm1(lr_reg.predict(test_sparse)))

38.63568163583526

In [18]:
lr_reg = Ridge(alpha=4, random_state=23)
train_sparse = lr_data_prepare(X, vectorizer, f_train=True)
lr_reg.fit(train_sparse, np.log1p(y))

Ridge(alpha=4, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=23, solver='auto', tol=0.001)

In [19]:
full_sparse = lr_data_prepare(full_df, vectorizer, f_train=False)

In [20]:
full_df['lr_pred'] = np.expm1(lr_reg.predict(full_sparse))

In [21]:
max(full_df['lr_pred']), min(full_df['lr_pred'])

(773.3101743140637, 12.49484280360154)

# Boosting

In [25]:
def agg_reviews(data_df):
    days_std = review_df.groupby('listing_id')['days'].std()
    days_mean = review_df.groupby('listing_id')['days'].median()
    review_count = review_df.groupby('listing_id')['reviewer_id'].nunique()
    grouped_df = pd.concat([review_count, days_mean, days_std], axis=1).reset_index()
    grouped_df.columns = ['id', 'rev_count', 'rev_median', 'rev_std']
    return grouped_df

In [26]:
agg_df = agg_reviews(review_df)

In [27]:
#init

property_type_set = set(['House', 'Apartment', 'Other'])
property_replace_dict = {
    'Townhouse': 'House',
    'Serviced apartment': 'Apartment',
    'Loft': 'Apartment',
    'Bungalow': 'House',
    'Cottage': 'House',
    'Villa': 'House',
    'Tiny house': 'House',
    'Earth house': 'House',
    'Chalet': 'House'  
    }

In [28]:
def dist_to(lat_series, long_series, to_lat, to_long):
    "return distance to center in meters"
    R = 6373000
    
    lat_1, long_1 = np.radians(to_lat),  np.radians(to_long)
    lat_2, long_2 =  np.radians(lat_series),  np.radians(long_series)

    d_lat = lat_2 - lat_1
    d_long = long_2 - long_1

    a = np.sin(d_lat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(d_long / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    
    return distance

In [29]:
def neigh_features(data_df):
    ans = []
    MAX_DIST_1 = 400
    MAX_DIST_2 = 800
    MAX_DIST_3 = 2000
    prices = data_df.price.values
    latitude = data_df.latitude.values
    longitude = data_df.longitude.values
    for ind,(_, row) in enumerate(data_df[['latitude', 'longitude']].iterrows()):
        distances = dist_to(latitude, longitude, row.latitude, row.longitude)
        distances[ind] = 10_000
        ans.append([ 
                    np.nanmedian(prices[distances < MAX_DIST_1]),
                    (distances < MAX_DIST_2).sum(), 
                    np.nanmedian(prices[distances < MAX_DIST_2]),
                    np.nanmedian(prices[distances < MAX_DIST_3]),
                    ])
    ans_df = pd.DataFrame(ans, columns=[
        f'mid_{MAX_DIST_1}_price',
        f'neigh_{MAX_DIST_2}_count', f'mid_{MAX_DIST_2}_price',
        f'mid_{MAX_DIST_3}_price',
    ])
    
    ans_df['id'] = data_df['id'].values
    return ans_df

In [30]:
MID_PRICE = 80.0

neigh_df = neigh_features(full_df)
neigh_df['mid_2000_price'] = neigh_df['mid_2000_price'].fillna(MID_PRICE)
neigh_df['mid_800_price'] = neigh_df['mid_800_price'].fillna(neigh_df['mid_2000_price'])
neigh_df['mid_400_price'] = neigh_df['mid_400_price'].fillna(neigh_df['mid_800_price'])

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  overwrite_input=overwrite_input)


In [119]:
def make_features(data_df, vect):
    features_df = pd.DataFrame(data_df[['id', 'host_id']])
    
    host_active_days = (max_date - pd.to_datetime(data_df['host_since'])).apply(lambda x: x.days)
    features_df['host_active_days'] = host_active_days.fillna(mid_active_days).astype(int)
    
    features_df['accommodates'] = data_df.accommodates.astype(int)
    
    features_df['bathrooms'] = data_df.bathrooms.fillna(mid_bathrooms).astype(int)
    bedrooms = data_df.bedrooms.fillna(mid_bedrooms).astype(int)
    beds = data_df.beds.fillna(mid_beds).astype(int)

    features_df['guests_included'] = data_df.guests_included.astype(int)
#     features_df['beds_on_one_guest'] = (beds / data_df.guests_included).astype(int)
    features_df['bedrooms_on_one_guest'] = (bedrooms / data_df.guests_included).astype(int)
#     features_df['bathrooms_on_one_guest'] = features_df['bathrooms'] / data_df.guests_included
    
    features_df['extra_people'] = data_df.extra_people.fillna(0).astype(int)
    features_df['cleaning_fee'] = data_df.cleaning_fee.fillna(0).astype(int)
    features_df['security_deposit'] = data_df.security_deposit.fillna(0).astype(int)
    features_df['minimum_nights'] = data_df.minimum_nights.fillna(1).astype(int)
    
    features_df['cleaning_fee_by_ppl'] = (features_df['cleaning_fee'] / data_df.guests_included).astype(int)
    features_df['security_deposit_by_ppl'] = (features_df['security_deposit'] / data_df.guests_included).astype(int)
    
    features_df['id_verified'] = data_df.host_identity_verified.fillna('f') == 'f'
    # cat features

    
    #distances
    _BUCKINGHAM = (51.50057377643661, -0.143291856948126)
    _CHELSEA = (51.494652, -0.183631)
    _BRICKSTONE = (51.517750470473615, -0.09634183673107204)
    features_df['dist_to_buck'] = \
        dist_to(train_df['latitude'], train_df['longitude'], *_BUCKINGHAM)
#     features_df['dist_to_chelsea'] = \
#         dist_to(train_df['latitude'], train_df['longitude'], *_CHELSEA)
#     features_df['dist_to_brick'] = \
#         dist_to(train_df['latitude'], train_df['longitude'], *_BRICKSTONE)
    
    property_type = data_df.property_type.replace(property_replace_dict)
    property_type = property_type.apply(lambda x: x if x in property_type_set else 'Other')
    
    amenities = (data_df['amenities'].str.replace('"', '').str.replace('{', '')
                 .str.replace('}', '').str.replace(' ', '').str.lower())


    
    property_type_df = pd.get_dummies(property_type)
    room_type = pd.get_dummies(data_df.room_type)
    features_df = pd.concat([features_df, room_type, property_type_df],axis=1)

    # merge other dfs
    features_df = features_df.merge(full_df[['id', 'lr_pred']], on='id')
    features_df = features_df.merge(neigh_df.fillna(80), on='id')
    features_df = features_df.merge(agg_df, on='id', how='left')
    features_df['rev_count'] = features_df['rev_count'].fillna(0)
    features_df['rev_by_day'] = features_df['rev_count'] / features_df['host_active_days']
    features_df['rev_median'] = features_df['rev_median'].fillna(-1)
    features_df['rev_std'] = features_df['rev_std'].fillna(-1)
    features_df = features_df.merge(host_list_count, on='host_id', how='left')
    features_df['host_id_counts'] = features_df['host_id_counts']
    
#     features_df['pred_divide_mid_price'] = features_df['lr_pred'] / features_df['mid_400_price']
    
    union_text = (
        data_df['name'].fillna('unknown_name')
        + ' ' + data_df['description'].fillna('unknown_desc') 
        + ' ' + data_df['space'].fillna('unknown_space')
        + ' ' + data_df['access'].fillna('unknown_acess') 
        + ' ' + data_df['transit'].fillna('unknown_trans')
        + ' ' + data_df['neighborhood_overview'].fillna('unknown_neigh')
        + ' ' + amenities.fillna('unknown_amen')
        + ' ' + data_df['house_rules'].fillna('unknown_rules')
        + ' ' + data_df['interaction'].fillna('unknown_inter')
        + ' ' + data_df['notes'].fillna('unknown_note')
        + ' ' + data_df['host_about'].fillna('unknown_about')
    ).apply(lambda x: clean_text(x))
    
    # words ebmedings
    word_inx = np.concatenate([np.argsort(lr_reg.coef_)[::-1][:200], np.argsort(lr_reg.coef_)[:200]])
    text_features_df = pd.DataFrame(vect.transform(union_text)[:,word_inx].toarray(), 
                                    columns=[f'text_{vect.get_feature_names()[ind]}' for ind in word_inx])
    features_df = pd.concat([features_df, text_features_df], axis=1)
    
    
    return features_df.drop(['mid_400_price', 'mid_2000_price', 'id', 'host_id'], axis=1)

In [120]:
df = make_features(train_df, vectorizer)
X = df
y = np.log1p(train_df.price)

In [None]:
def multi_collinearity_heatmap(df, figsize=(11,9)):
    
    sns.set(style="white")
    corr = df.corr()
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize=figsize)
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True, 
                linewidths=.5, cbar_kws={"shrink": .5}, vmax=corr[corr != 1.0].max().max());

In [None]:
multi_collinearity_heatmap(X)

# test exper

In [121]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=23)

In [122]:
def mape_scorer(est, X_test, y_test):
    y_test = np.expm1(y_test)
    y_pred =  np.expm1(est.predict(X_test))
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

In [123]:
regressor = CatBoostRegressor(verbose=1000, random_state=23, 
                              loss_function='MAE',
#                               cat_features=cat_features, 
                              thread_count=10,
                             )
regressor.fit(X_train, y_train)

print(mape_scorer(regressor, X_valid, y_valid))

0:	learn: 0.5942803	total: 49.9ms	remaining: 49.8s
999:	learn: 0.2233110	total: 34.1s	remaining: 0us
24.158828522780528


In [124]:
pd.DataFrame(sorted(list(zip(regressor.get_feature_importance(), X.columns)),
                    key=lambda x: x[0], reverse=True))

Unnamed: 0,0,1
0,22.15556,lr_pred
1,16.073948,Entire home/apt
2,9.738431,accommodates
3,8.564398,mid_800_price
4,8.00484,bedrooms
5,3.428491,bathrooms
6,2.132402,cleaning_fee
7,1.556549,extra_people
8,1.274979,beds
9,1.229637,minimum_nights


# cross val

In [755]:
regressor = CatBoostRegressor(verbose=1000, random_state=23, loss_function='MAE',
#                               cat_features=cat_features, 
                              thread_count=10,
                             )

In [756]:
score = cross_val_score(regressor, X, y, scoring=mape_scorer, cv=5)

print(f"score mean= {score.mean():.3f}, std={score.std():.3f}, {score}")

0:	learn: 0.6000151	total: 8.97ms	remaining: 8.96s
999:	learn: 0.2252048	total: 6.54s	remaining: 0us
0:	learn: 0.5970550	total: 9.85ms	remaining: 9.84s
999:	learn: 0.2266754	total: 8.37s	remaining: 0us
0:	learn: 0.5927662	total: 11ms	remaining: 10.9s
999:	learn: 0.2238265	total: 7.88s	remaining: 0us
0:	learn: 0.5917784	total: 9.18ms	remaining: 9.17s
999:	learn: 0.2267092	total: 7.77s	remaining: 0us
0:	learn: 0.5955605	total: 9.66ms	remaining: 9.65s
999:	learn: 0.2261515	total: 7.88s	remaining: 0us
score mean= 25.549, std=1.599, [24.04699644 25.25507219 28.2355387  23.91279338 26.29310782]


# submition

In [125]:
regressor.fit(X, y)

0:	learn: 0.5959557	total: 59.1ms	remaining: 59s
999:	learn: 0.2256016	total: 39.5s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f930fc873d0>

In [758]:
X_test_features = make_features(test_df)

y_test = regressor.predict(X_test_features)
submition_df = pd.DataFrame(test_df['id'])
submition_df['price'] = np.expm1(y_test)
submition_df.to_csv('submit.csv', index=False)
train_df.price.median(), train_df.price.std(), submition_df.price.median(), submition_df.price.std()

(80.0, 201.59477087683024, 71.76195326915621, 61.424159023721906)

In [526]:
!head 'submit.csv'

id,price
9554,28.43516497570186
11076,62.119209988808166
13913,45.98813817889058
17402,198.72996538046996
24328,120.05334453166778
25023,82.54987977174932
25123,33.68372376440658
26223,126.83074210305953
26682,48.70221324078447
