# IMPORTS

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import BaggingRegressor

# Seed passed as random_state further down, and the share of the training
# rows held out for validation.
RANDOM_SEED = 42
VAL_SIZE = 0.2

# Lift the pandas display limits so every column and every row is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# pin the package versions so that the experiments are reproducible
!pip freeze > requirements.txt

def get_metric(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction.

    Each error is taken relative to ``y_true``, so a zero in ``y_true``
    yields an infinite or NaN term.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error))


def get_boxplot(df, col):
    """Draw and show a seaborn box plot for one column of ``df``.

    'per_square_meter_price' is placed on the x-axis and ``col`` on the
    y-axis. ``col`` must be a string because it is concatenated into the
    plot title.
    """
    _, ax = plt.subplots(figsize=(14, 4))
    sns.boxplot(data=df, x='per_square_meter_price', y=col, ax=ax)
    ax.set_title('Boxplot for ' + col)
    plt.show()


def getFeaturesInfo(df):
    """Summarise every column of ``df`` in a single table.

    :param df: DataFrame to describe
    :return: DataFrame indexed by the column names of ``df`` with the columns
        'nunique' (number of distinct non-null values), '<lambda>' (array with
        the first 10 unique values), 'null' (number of missing values) and
        'dtype' (column dtype)
    """
    columns = df.columns

    # Fill an object array cell by cell so that each array of sample values
    # stays one cell instead of being expanded into extra dimensions.
    unique_head = np.empty(len(columns), dtype=object)
    for position, name in enumerate(columns):
        unique_head[position] = df[name].unique()[:10]

    # Built directly instead of chaining DataFrame.append (removed in
    # pandas 2.0) onto agg() over a set, whose order was not deterministic.
    # The '<lambda>' label is what agg() produced for the anonymous function
    # and is kept so the column names stay the same.
    return pd.DataFrame(
        {
            'nunique': df.nunique(),
            '<lambda>': pd.Series(unique_head, index=columns),
            'null': df.isnull().sum(),
            'dtype': df.dtypes,
        },
        index=columns,
    )


import typing
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Custom hackathon metric for a single prediction.

    The relative deviation is ``d = (y_pred - y_true) / max(1e-8, y_true)``
    and the penalty is piecewise in ``d`` (``T`` is THRESHOLD):

    * ``|d| <= T``: 0
    * ``d <= -4T``: ``9 * NEGATIVE_WEIGHT``
    * ``-4T < d < -T``: ``NEGATIVE_WEIGHT * (d / T + 1) ** 2``
    * ``T < d < 4T``: ``(d / T - 1) ** 2``
    * otherwise: 9

    :param y_true: float, actual price
    :param y_pred: float, predicted price
    :return: float, metric value
    """
    # The denominator is floored at 1e-8 to avoid dividing by zero.
    relative_deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)

    if np.abs(relative_deviation) <= THRESHOLD:
        return 0

    scaled = relative_deviation / THRESHOLD
    if relative_deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    if relative_deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * (scaled + 1) ** 2
    if relative_deviation < 4 * THRESHOLD:
        return (scaled - 1) ** 2
    return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """Return the mean of the per-sample hackathon metric.

    :param y_true: actual prices
    :param y_pred: predicted prices, paired with ``y_true`` by position
    :return: float, mean of ``deviation_metric_one_sample`` over all pairs
    """
    # Convert to arrays first: ``y_true[n]`` on a pandas Series is a label
    # lookup and fails (or picks the wrong row) when the index is not 0..n-1.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    return np.array([
        deviation_metric_one_sample(true_value, predicted[position])
        for position, true_value in enumerate(actual)
    ]).mean()

def median_absolute_percentage_error(y_true: np.array, y_pred: np.array) -> float:
    """Return the median absolute error relative to ``y_true`` (as a fraction)."""
    absolute_error = np.abs(y_pred - y_true)
    return np.median(absolute_error / y_true)

def metrics_stat(y_true: np.array, y_pred: np.array) -> typing.Dict[str,float]:
    """Compute a set of regression quality metrics.

    :param y_true: actual values (1-D)
    :param y_pred: predicted values (1-D)
    :return: dict with the keys 'mape', 'mdape', 'rmse', 'r2' and 'raif_metric'
    """
    mape = mean_absolute_percentage_error(y_true, y_pred)
    mdape = median_absolute_percentage_error(y_true, y_pred)
    # Take the root explicitly: the ``squared=False`` keyword is deprecated
    # and removed in newer scikit-learn releases. For 1-D targets the result
    # is the same RMSE.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    raif_metric = deviation_metric(y_true, y_pred)
    return {'mape':mape, 'mdape':mdape, 'rmse': rmse, 'r2': r2, 'raif_metric':raif_metric}


# Training data: drop exact duplicate rows and rows that have no value in
# 'osm_city_nearest_population'.
train = (
    pd.read_csv('train.csv', low_memory=False)
    .drop_duplicates()
    .dropna(subset=['osm_city_nearest_population'])
)
# train = train.query('price_type == 1')

# Test features and the submission template.
test = pd.read_csv('test.csv')
sample = pd.read_csv('test_submission.csv')


OSError: dlopen(/Users/forzarossa/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/forzarossa/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [None]:
# train.head(5).T

In [None]:
# train.info()

In [None]:
# test.head()

In [None]:
# test.info()

In [None]:
# sample.head()

In [None]:
# sample.info()

# EDA

In [None]:
getFeaturesInfo(train)

In [None]:
getFeaturesInfo(test)

## per_square_meter_price

In [None]:
train['per_square_meter_price'].isna().sum()

In [None]:
train['per_square_meter_price'].sort_values().head()

In [None]:
train['per_square_meter_price'].sort_values().tail()

In [None]:
# Ratio of the mean price per square metre for price_type 0 to that for
# price_type 1.
mean_price_by_type = train.groupby('price_type')['per_square_meter_price'].mean()
PRICE_CORRECTOR = mean_price_by_type[0] / mean_price_by_type[1]
PRICE_CORRECTOR


## city

In [None]:
train['city'].isna().sum()

In [None]:
test['city'].isna().sum()

In [None]:
# Keep the 15 most frequent training cities; every other value (missing
# values included) is replaced with 'other' in both frames.
top_city = list(train['city'].value_counts().head(15).index)

train['city'] = train['city'].where(train['city'].isin(top_city), 'other')
test['city'] = test['city'].where(test['city'].isin(top_city), 'other')

In [None]:
train['city'].value_counts()

In [None]:
test['city'].value_counts()

## region

In [None]:
# Keep the 15 most frequent training regions; every other value (missing
# values included) is replaced with 'other' in both frames.
top_region = list(train['region'].value_counts().head(15).index)

train['region'] = train['region'].where(train['region'].isin(top_region), 'other')
test['region'] = test['region'].where(test['region'].isin(top_region), 'other')

In [None]:
train['region'].value_counts()

In [None]:
test['region'].value_counts()

## floor

In [None]:
# Binary flag: 1 where 'floor' is missing, 0 otherwise.
train['floor_empty'] = train['floor'].isna().astype('uint8')
test['floor_empty'] = test['floor'].isna().astype('uint8')



In [None]:
# Binary flag: 1 where the textual form of 'floor' contains a comma.
train['floor_multi'] = train['floor'].map(lambda value: int(',' in str(value)))
test['floor_multi'] = test['floor'].map(lambda value: int(',' in str(value)))


In [None]:
# Assign the result back to the column. fillna(..., inplace=True) on a column
# selection works on an intermediate object, which under pandas Copy-on-Write
# leaves the frame itself unchanged.
train['floor'] = train['floor'].fillna(0.0)
test['floor'] = test['floor'].fillna(0.0)


In [None]:
# Substrings that send a 'floor' value to 0 when found in its lower-cased text.
# NOTE(review): '-' also matches negative numbers such as "-1.0", so those
# become 0 as well — confirm this is intended.
_FLOOR_TEXT_MARKERS = ('цок', 'подва', 'этаж', 'манс', 'антрес', 'мезон', 'техн', ',', '-')


def _normalize_floor(value):
    """Return ``value`` as an int floor number, or 0 if its text has a marker."""
    # Lower-case once instead of once per marker.
    text = str(value).lower()
    if any(marker in text for marker in _FLOOR_TEXT_MARKERS):
        return 0
    return int(float(str(value)))


# One shared helper replaces the two copies of the nine-clause lambda.
train['floor'] = train['floor'].apply(_normalize_floor)
test['floor'] = test['floor'].apply(_normalize_floor)


## total_square

In [None]:
train['total_square'].isna().sum()

In [None]:
test['total_square'].isna().sum()

In [None]:
# train['total_square'].sort_values()

In [None]:
# test['total_square'].sort_values()

## price_type

In [None]:
train['price_type'].value_counts()

In [None]:
test['price_type'].value_counts()

## reform

In [None]:
# 'reform_*' columns whose missing values are imputed below.
reform_cols = [
    'reform_house_population_1000',
    'reform_house_population_500',
    'reform_mean_floor_count_1000',
    'reform_mean_floor_count_500',
    'reform_mean_year_building_1000',
    'reform_mean_year_building_500'
]

# Fill the gaps with the per-city median; each frame uses its own medians.
for frame in (train, test):
    for col in reform_cols:
        city_median = frame.groupby('city')[col].transform('median')
        frame[col] = frame[col].fillna(city_median)

# LABEL ENCODING

In [None]:
# IMPORTANT: train and test are merged into one dataset so that the features
# are processed consistently.
train['sample'] = 1  # marks the training rows
test['sample'] = 0  # marks the test rows

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (test rows first, then train rows, with a fresh 0..n-1 index).
data = pd.concat([test, train], sort=False).reset_index(drop=True)

In [None]:
# Binary indicator features.
bin_cols = [
    'floor_empty',
    'floor_multi'
]

# Categorical features; label-encoded at the end of this cell.
cat_cols = [
    'city',
    'realty_type',
]

# Numeric features.
num_cols = [
    'floor',
    'lat',
    'lng',
    'osm_amenity_points_in_0.01',
    'osm_catering_points_in_0.001',
    'osm_city_closest_dist',
    'osm_city_nearest_population',
    'osm_crossing_closest_dist',
    'osm_crossing_points_in_0.01',
    'osm_finance_points_in_0.01',
    'osm_healthcare_points_in_0.01',
    'osm_historic_points_in_0.005',
    'osm_hotels_points_in_0.01',
    'osm_leisure_points_in_0.01',
    'osm_offices_points_in_0.01',
    'osm_shops_points_in_0.01',
    'osm_subway_closest_dist',
    'osm_train_stop_closest_dist',
    'osm_train_stop_points_in_0.01',
    'osm_transport_stop_points_in_0.01',
    'reform_count_of_houses_1000',
    'reform_house_population_1000',
    'reform_mean_floor_count_1000',
    'reform_mean_floor_count_500',
    'reform_mean_year_building_1000',
]

# Replace each categorical column with its integer category codes
# (missing values get the code -1).
for name in cat_cols:
    data[name] = pd.Categorical(data[name]).codes

## Feature Importance

In [None]:
# Training rows only, without the train/test marker column.
data_temp = data[data['sample'] == 1].drop(columns=['sample'])

In [None]:
# The target is a continuous price, so the univariate F-test for regression is
# the matching score; f_classif would treat every distinct price as a class.
from sklearn.feature_selection import f_regression

# f_regression returns (F-statistics, p-values); only the F-statistics are kept.
imp_num = pd.Series(f_regression(data_temp[num_cols], data_temp['per_square_meter_price'])[0], index=num_cols)
imp_num.sort_values(inplace=True)
imp_num.plot(kind = 'barh', color='pink', title='Numeric Features Importance')

In [None]:
imp_num

# osm_building_points is less important than the other features

## Get Dummies

In [None]:
# One-hot encode the label-encoded categorical columns; each new column is
# named '<original column>_<code>'.
data = pd.get_dummies(data, columns=cat_cols, prefix=cat_cols)

In [None]:
# Names of the one-hot columns: codes 0..15 for 'city', then codes 0..2 for
# 'realty_type'.
cat_cols = [f'city_{code}' for code in range(16)] + [f'realty_type_{code}' for code in range(3)]

## Outliers and Standardization

In [None]:
# Clip each numeric feature (except floor and the coordinates) to the
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] interval, computed on the combined data.
for col in num_cols:
    if col in ('floor', 'lat', 'lng'):
        continue

    # Each quantile is computed once; the unused median is no longer computed.
    perc25 = data[col].quantile(0.25)
    perc75 = data[col].quantile(0.75)
    IQR = perc75 - perc25
    lower_bound = perc25 - 1.5*IQR
    upper_bound = perc75 + 1.5*IQR

    print("Column: ", col)
    print(' 25%: {:.4},\n'.format(perc25), '75%: {:.4},\n'.format(perc75),
          "IQR: {:.4}, \n".format(IQR), "Borderline: [{f:.4}, {l:.4}].\n".format(f=lower_bound, l=upper_bound))
    print()

    # replace outliers with border-values
    data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])
    data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])

In [None]:
# Rescale the numeric features to [0, 1]; the scaler is fitted on the
# combined train + test data.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[num_cols].to_numpy())
data[num_cols] = scaled_values

In [None]:
# Final column order: target, train/test marker, then the numeric, binary and
# one-hot feature columns.
feat_cols = ['per_square_meter_price', 'sample', *num_cols, *bin_cols, *cat_cols]

In [None]:
# Keep only the selected columns, in the order given by feat_cols.
data = data.loc[:, feat_cols]

In [None]:
data.info()

## Correlation Matrix

In [None]:
# Set the figure size before anything is drawn: rcParams only applies to
# figures created afterwards, and plt.title() already creates one.
plt.rcParams['figure.figsize'] = (30,20)
plt.title('Correlation Matrix of dataset features')
sns.heatmap(data.corr(), vmin=-1, vmax=1, annot = False)

In [None]:
# show feature pairs with absolute correlation >= 0.7 (entries equal to 1 are excluded)
corr = data.corr().abs()
corr_table_sorted = corr.unstack().sort_values(kind="quicksort", ascending=False)
high_corr = corr_table_sorted[(corr_table_sorted != 1) & (corr_table_sorted >= 0.7)]
high_corr

# MODEL

In [None]:
# Split the combined frame back into train and test using the marker column.
train = data[data['sample'] == 1]
test = data[data['sample'] == 0]

# Feature matrix and target for training.
X = train.drop(columns=['sample', 'per_square_meter_price']).to_numpy()
y = train['per_square_meter_price'].to_numpy()

# Feature matrix for the submission.
X_sub = test.drop(columns=['sample', 'per_square_meter_price']).to_numpy()

In [None]:
# Shuffled hold-out split: VAL_SIZE of the rows go to validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [None]:
start = datetime.now()

lgbmr = LGBMRegressor(objective='regression', max_depth=12, num_leaves=1000,
                      learning_rate=0.02, n_estimators=2000, metric='mape')

# The model is fitted on log-prices; predictions are mapped back with exp.
lgbmr.fit(X_train, np.log(y_train))

# Evaluate on the validation split. The printed value is the custom
# deviation_metric (not MAPE and not a percentage), so the label now says so.
predict_lgbmr = np.exp(lgbmr.predict(X_val))
print(f"Значение метрики deviation_metric: {deviation_metric(y_val, predict_lgbmr):0.2f}")
print('Время выполнения - ', datetime.now() - start)

In [None]:
# lgbm for total_square = 200.19%
# lgbm for total_square w query price_type 1 = 46.11%
# lgbm for total_square and city = 168.25%
# lgbm with all features = 99.18%
# lgbm with n_estimators 2000 = 95.64% ~1m
# lgbm with custom metric = 2.68 ~1m

In [None]:
start = datetime.now()

lgbmr_raif = LGBMRegressor(n_estimators=2000,
                      learning_rate=0.01,
                      reg_alpha=1,
                      num_leaves=40,
                      min_child_samples=5,
                      importance_type="gain",
                      n_jobs=1,
                      random_state=RANDOM_SEED)

# The model is fitted on log-prices; predictions are mapped back with exp.
lgbmr_raif.fit(X_train, np.log(y_train))

# Evaluate on the validation split. The printed value is the custom
# deviation_metric (not MAPE and not a percentage), so the label now says so.
predict_lgbmr_raif = np.exp(lgbmr_raif.predict(X_val))
print(f"Значение метрики deviation_metric: {deviation_metric(y_val, predict_lgbmr_raif):0.2f}")
print('Время выполнения - ', datetime.now() - start)

In [None]:
# lgbm with raif baseline params = 118.29% ~27s
# raif lgbm with custom metric = 3.66 ~ 30s

In [None]:
import xgboost as xgb

start = datetime.now()

xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',
                           colsample_bytree=0.5,
                           learning_rate=0.05,
                           max_depth=12,
                           alpha=1,
                           n_estimators=1000)

# The model is fitted on log-prices; predictions are mapped back with exp.
xgb_reg.fit(X_train, np.log(y_train))

# Evaluate on the validation split. The printed value is the custom
# deviation_metric (not MAPE and not a percentage), so the label now says so.
predict_xgb = np.exp(xgb_reg.predict(X_val))
print(f"Значение метрики deviation_metric: {deviation_metric(y_val, predict_xgb):0.2f}")
print('Время выполнения - ', datetime.now() - start)

In [None]:
# xgb = 93.33% ~ 3.5m
# xgb with custom metric = 2.58% ~ 3.5m

In [None]:
start = datetime.now()

# Bagging ensemble of three copies of the lgbmr_raif configuration, fitted on
# log-prices; predictions are mapped back with exp.
bagg_lgbm = BaggingRegressor(lgbmr_raif, n_estimators=3, n_jobs=4, random_state=RANDOM_SEED)
bagg_lgbm.fit(X_train, np.log(y_train))
predict_bagg_lgbm = np.exp(bagg_lgbm.predict(X_val))
# The printed value is the custom deviation_metric (not MAPE and not a
# percentage), so the label now says so.
print(f"Значение метрики deviation_metric: {deviation_metric(y_val, predict_bagg_lgbm):0.2f}")
print('Время выполнения - ', datetime.now() - start)

In [None]:
# 

# SUBMISSION

In [None]:
# Predict in log space and convert back to prices.
predict_lgbmr = np.exp(lgbmr.predict(X_sub))

# Store the predictions scaled by 0.94 and write the submission file.
# NOTE(review): the origin of the 0.94 factor is not shown here — confirm.
sample['per_square_meter_price'] = predict_lgbmr * 0.94
sample.to_csv('submission_lgbmr.csv', index=False)
sample.head()

# 2.12

In [None]:
# Predict in log space and convert back to prices.
predict_lgbmr_raif = np.exp(lgbmr_raif.predict(X_sub))

# Store the predictions scaled by 0.94 and write the submission file.
# NOTE(review): the origin of the 0.94 factor is not shown here — confirm.
sample['per_square_meter_price'] = predict_lgbmr_raif * 0.94
sample.to_csv('submission_lgbmr_raif.csv', index=False)
sample.head()

# 1.91

In [None]:
# Predict in log space and convert back to prices.
predict_xgb = np.exp(xgb_reg.predict(X_sub))

# Store the predictions scaled by 0.94 and write the submission file.
# NOTE(review): the origin of the 0.94 factor is not shown here — confirm.
sample['per_square_meter_price'] = predict_xgb * 0.94
sample.to_csv('submission_xgb.csv', index=False)
sample.head()

# 2.03