In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
pd.set_option('display.max_columns', None)
from os.path import exists

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold

import mlflow
import missingno as msno
import gc

from sklearn.dummy import DummyRegressor
import xgboost as xgb
import catboost

import eli5
from tqdm import tqdm

In [60]:
def get_or_create_experiment(name):
    experiment = mlflow.get_experiment_by_name(name)
    if not experiment:
        mlflow.create_experiment(name)
        return mlflow.get_experiment_by_name(name)
        
    return experiment


def _eid(name):
    return get_or_create_experiment(name).experiment_id


def get_filename(basename):           
    counter=0
    name = '{}_{}'.format(basename, counter)
    while exists('../output/{}.csv'.format(name)):
        counter +=1
        name = '{}_{}'.format(basename, counter)
        
    return name

def get_X_y_log(df_train, feats):

    X = df_train[feats]
    y = df_train['price']
    y_log = np.log(y)
    
    return X, y, y_log
    

def get_feats(df):
    
    blacklist = [
        'price',
        'id', 
        'price_m2',  
       # 'tarasy',
       # 'ochrona',
       # 'created_at_cat',                          
       # 'gara≈º',       
       # 'plan zagospodarowania:',
       # 'area_num',
       # 'loc0_cat',        
        ]
    
                         
    num_bool_feats = df.select_dtypes(['number', 'bool'])
    return [x for x in num_bool_feats if x not in blacklist]


def get_model(model_or_id):
    
    if model_or_id == 'catboost_hyper':
        model_params = dict( 
            
            max_depth=8,
            n_estimators=1000,
            learning_rate=0.3,
            random_state=0,
            silent=True,
        )
        return catboost.CatBoostRegressor(**model_params)
    
    elif model_or_id == 'catboost_light':
        model_params = dict( 
            
            max_depth=5,
            n_estimators=100,
            random_state=0,
            silent=True,
        )
        return catboost.CatBoostRegressor(**model_params)
    else:
        return model_or_id


def mlflow_experiment(run_name, model, X, feats, result, eli5_result):
    
    with mlflow.start_run(experiment_id=_eid('dw_solution_property'), run_name=run_name) as run:
    
        mlflow.log_params(model.get_params())
        mlflow.log_param("model", str(model).split("(")[0])
        mlflow.log_param("feats", feats)
        mlflow.log_param('X.shape', X.shape)
        
        #artifacts
        with open('../outputs/eli5.html', 'w') as f:
            f.write('<html>{}</html>'.format(eli5_result.data))
        mlflow.log_artifact('../outputs/eli5.html', 'plot')
                
        #metrics
        mlflow.log_metric('mae_mean', result[0])
        mlflow.log_metric('mae_std', result[1])
              
    print(f'Experiment {run_name} recorded')

def check_model(X, y, model, scoring):
    
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = []
    for train_idx, test_idx in tqdm(cv.split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_log_train, y_test = np.log(y).iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_log_train)
        y_log_pred = model.predict(X_test)
        y_pred = np.exp(y_log_pred)

        score = scoring(y_test, y_pred)
        scores.append(score)
        
        return np.mean(scores), np.std(scores)
    
def save_forecast_to_csv(model, X_test, df_test, filename):

    y_pred = model.predict(X_test)
    df_test['price'] = np.exp(y_pred)
    
    try:
        df_test[ ['id', 'price'] ].to_csv('../output/{}.csv'.format(filename), index=False)
        print('{}.csv save successfully in "output" folder'.format(filename))
    except:
        print('{}.csv saving ERROR in "output" folder'.format(filename))
    
    
def start_experiment(df,
                     model_or_id,
                     scoring=mean_absolute_error,
                     filename='catboost',
                     results=True, export=False, mlflow_save=True):
    
    filename = get_filename(filename)    
    print(f'Working on "{filename}" experiment')
    
    #df
    df_train = df[ df['price'].notnull() ].fillna(-1)
    df_test = df[ df['price'].isnull() ].fillna(-1)
    
    #x Y
    feats = get_feats(df)
    X, y, y_log = get_X_y_log(df_train, feats)
    
    X = X
    y = y
 
    #model
    model = get_model(model_or_id)
    
    #results
    if results:
        result = check_model(X, y, model, scoring)
        
    model.fit(X.values, y_log.values)
    
    #export forecast
    if export:
        X_test = df_test[feats].values
        save_forecast_to_csv(model, X_test, df_test, filename)    
  
    eli5_result = eli5.show_weights(model, feature_names=feats)
    
    #mlflow
    if mlflow_save and result:
        mlflow_experiment(filename, model, X, feats, result, eli5_result)
    
    #return
    if results:
        print(result)
    return eli5_result  

In [58]:
def feature_engineering(df):
    
    def parse_czynsz(val):
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        if val[-1] == '≈Ç':
            return float(val.split('z≈Ç')[0].replace(' ', '').replace(',','.'))
        if val[-1] == 'r':
            return float(val.split('eur')[0].replace(' ', '').replace(',','.'))*4.5
    
    
    def parse_area(val):
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        return float(val.split('m')[0].replace(',','.').replace(' ',''))
    

    def parse_location_city(val):
        all_city = city_stats['city'].to_list()
        for city in reversed(val):
            if city in ['Dobra', 'J√≥zef√≥w']:
                continue
            if city in all_city:
                return city
        return 'other'    
    

    def parse_floors_in_building(val):
        if isinstance(val, int): return -1
        if isinstance(val, float): return -1    
        floor = float(val.replace(')','').split()[1])
        return floor if floor < 20 else 25
    
    def df_groupby_feat(df, groupby_feats, feat):
        agg_params={
            'mean_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'mean'),
            'median_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'median'),
            #'sum_{}_price'.format('_'.join(groupby_feats)): ('price', 'sum')
        }
        return df[groupby_feats + [feat]].groupby(groupby_feats).agg(
            **agg_params
        ).reset_index()

    df['czynsz_num'] = df.czynsz.map(parse_czynsz)
    
    df['area_num'] = df.area.map(parse_area)
    area_num_99 = np.percentile(df['area_num'], 99)
    df['area_norm'] = df['area_num'].map(lambda x: x if x <= area_num_99 else area_num_99)
    df['area_num_log'] = np.log(df['area_num'])
    
    df['price_m2'] = df['price'] / df['area_num']
    
    df['province'] = df['location'].map(lambda x: x[0])
    df['city'] = df['location'].map(parse_location_city)
    df['floors_in_building_num'] = df['floors_in_building'].map(parse_floors_in_building)
    df['build_year'] = df['rok budowy'].fillna(-1).astype('int')
    
    floors_dict = {'parter': 0, '> 10': 11, 'poddasze': -2, 'suterena': -1}
    df['floor_num'] = df['floor'].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype('int')
    
    
    if 'city_area' not in df.columns:
        df = pd.merge(df, city_stats, on='city', how='left')
    if 'province_population' not in df.columns:
        df = pd.merge(df, province_stats, on='province', how='left')
    
    miasta_woj = ['Bia≈Çystok', 'Bydgoszcz', 'Gda≈Ñsk', 'Gorz√≥w Wielkopolski', 'Katowice', 'Kielce', 'Krak√≥w', 'Lublin',
    '≈Å√≥d≈∫', 'Olsztyn', 'Opole', 'Pozna≈Ñ', 'Rzesz√≥w', 'Szczecin', 'Toru≈Ñ', 'Warszawa', 'Wroc≈Çaw', 'Zielona G√≥ra']
    
    df['miasta_woj'] = df['city'].isin(miasta_woj)
    
    #Agregacje
    groupby_city_price = df_groupby_feat(df, ['city'], 'price')        
    if 'median_city_price' not in df:
        df = pd.merge(df, groupby_city_price, on='city', how='left')
        
    groupby_county_price = df_groupby_feat(df, ['county'], 'price')        
    if 'median_county_price' not in df:
        df = pd.merge(df, groupby_county_price, on='county', how='left')
    
    #groupby_city_pricem2 = df_groupby_feat(df, ['city'], 'price_m2')        
    #if 'median_county_price_m2' not in df:
    #    df = pd.merge(df, groupby_city_pricem2, on='city', how='left')
        
    #groupby_county_pricem2 = df_groupby_feat(df, ['county'], 'price_m2')        
    #if 'median_county_price_m2' not in df:
    #    df = pd.merge(df, groupby_county_pricem2, on='county', how='left')
        
    #groupby_province_price = df_group_price(df, ['province'])        
    #if 'median_province_price' not in df:
    #    df = pd.merge(df, groupby_province_price, on='province', how='left') 
    

    df['city_cat'] = df['city'].factorize()[0]
    df['county_cat'] = df['county'].factorize()[0]
    df['province_cat'] = df['province'].factorize()[0]
    
    
    cat_feats = {
        "materia≈Ç budynku": "build_material_cat",
        "okna": "window_cat",
        "stan wyko≈Ñczenia": "property_completion_cat",
        "rodzaj zabudowy": "property_type_cat",
        "ogrzewanie": "property_heating_cat",
        "forma w≈Çasno≈õci": "own_property_cat"
     }    
    
    for feat_name, feat_new_name in cat_feats.items():
        df[feat_new_name] = df[feat_name].factorize()[0]
        
        #OHE
        df_dummies = pd.get_dummies(df[feat_name])
        df_dummies.columns = ['{0}_{1}'.format(feat_new_name, x) for x in df_dummies.columns]
        pd.concat([df, df_dummies], axis=1)
        
    stats = df['stats'].apply(pd.Series)
    df = pd.concat([df, stats], axis=1)
    dict_created_at={
        'ponad 14 dni temu':18,  
        '23.10.2018':17,
        '24.10.2018':16,
        '25.10.2018':15,
        '26.10.2018':14, 
        '27.10.2018':13,
        '28.10.2018':12,
        '29.10.2018':11,
        '30.10.2018':10,
        '31.10.2018':9,
        '01.11.2018':8,    
        '02.11.2018':7,
        '03.11.2018':6,
        '04.11.2018':5,
        '05.11.2018':4,
        '06.11.2018':3,
        '07.11.2018':2,
        '08.11.2018':1,
    }
        
    df['created_at_cat'] = df['created_at'].map(dict_created_at)
    #df['created_at_cat'] = df['created_at'].factorize()[0]
    #df['visit_ads_num'] = df.visit_ads.fillna(-1).astype('int')  
    df['visit_ads_num'] = df.visit_ads.map(lambda x: np.log(int(x) + 10) if not isinstance(x, float)  else -1)
    
    
    #Starter3
    #Location
    for i in range(5):
        df["loc{}".format(i)] = df["location"].map(lambda x: x[i] if len(x) > i else "")      
    
    df['loc01'] = df['loc0'] + df['loc1']
    df['loc012'] = df['loc0'] + df['loc1'] + df['loc2']
    df['loc12'] = df['loc1'] + df['loc2']
    
    #cat location
    for i in range(5):
        df["loc{}_cat".format(i)] = df["loc{}".format(i)].factorize()[0]
    df["loc01_cat"] = df['loc01'].factorize()[0]
    df["loc012_cat"] = df['loc012'].factorize()[0] 
    df["loc12_cat"] = df['loc12'].factorize()[0] 
    
    def groupby_location(df, loc, feat):
        groupby_feat = df_groupby_feat(df, [loc], feat)        
        if 'median_{}_price'.format(loc) not in df:
            return pd.merge(df, groupby_feat, on='{}'.format(loc), how='left')
    
    #df = groupby_location(df, 'loc01', 'price')    
    #df = groupby_location(df, 'loc12')
    
    #Starter 4
    
    df['security'] = df['system alarmowy'] | df['rolety antyw≈Çamaniowe'] | df['drzwi / okna antyw≈Çamaniowe']
    
    df['area_per_room'] = df['area_norm'] / df["rooms"]
    
    #start build year    
    years = [1970, 1980, 1990, 2000, 2005, 2010, 2012, 2014, 2016, 2017]
    #for idx in range(len(years) - 1):
    #    df["build_year_{0}-{1}".format(years[idx], years[idx+1])] = (df["build_year"] >= years[idx]) & (df["build_year"] < years[idx+1])
    
    #df["build_year_before_1970"] = df["build_year"] < 1970
    #df["build_year_after_2017"] = df["build_year"] > 2017

    def build_year_norm(year):
        if year < 1970: return 1900
        if year > 2017: return 2018

        for idx in range(len(years) - 1):
            if year >= years[idx] and year < years[idx+1]:
                return years[idx]

    df["build_year_norm"] = df["build_year"].map(build_year_norm)
    #end build year  
    
    big_cities = {'Pozna≈Ñ', 'Sopot', 'Wroc≈Çaw', 'Krak√≥w', 'Gda≈Ñsk', 'Gdynia', 'Opole', 'Katowice',  'Czƒôstochowa', 'Szczecin', 'Kalisz', '≈Å√≥d≈∫', 'Olsztyn', 'Warszawa'}
    for city in big_cities:
        df[city] = df['city'] == city
        df['big_city'] = df['city'].map(lambda x: x in big_cities)
        
    
    #location
    df_val_cnts = df['loc12'].value_counts()
    loc12_vals = set(df_val_cnts[ df_val_cnts > 100].index.values)
    for item in loc12_vals:
        df[item] = df['loc12'] == item
        
    
    #primary_market
    
    def is_primary_market_conc(df, feat):
        df['is_primary_market_{}'.format(feat)] = df[ ['is_primary_market', feat] ].apply(
            lambda x: '{}_{}'.format(x['is_primary_market'], x[feat]), axis=1
        )
        df['is_primary_market_{}_cat'.format(feat)] = df['is_primary_market_{}'.format(feat)].factorize()[0]

        return df
    
    df = is_primary_market_conc(df, 'rooms')
    df = is_primary_market_conc(df, 'city')
    df = is_primary_market_conc(df, 'rodzaj zabudowy') 
    
    #agregacje price_m2
    groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_rooms'], 'price_m2')
    df = pd.merge(df, groupby_price_m2, on='is_primary_market_rooms', how='left')
    
    #groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_city'], 'price_m2')
    #df = pd.merge(df, groupby_price_m2, on='is_primary_market_city', how='left')
    
    groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_rodzaj zabudowy'], 'price_m2')
    df = pd.merge(df, groupby_price_m2, on='is_primary_market_rodzaj zabudowy', how='left')
    

                                                                   
    return df

In [6]:
ls ../input_ext/

city_stats.csv  region_stats.csv


In [142]:
ls ../input/ 

big_train_warsaw_property.h5  [0m[01;36mtrain_data.h5[0m@
test_warsaw_property.h5       train_warsaw_property.h5


In [8]:
df_train = pd.read_hdf('../input/train_warsaw_property.h5')
df_test = pd.read_hdf('../input/test_warsaw_property.h5')

#big data set
df_train = pd.read_hdf('../input/big_train_warsaw_property.h5')

df_org = pd.concat([df_train, df_test])

#External city and region data (Wikipedia)
city_stats = pd.read_csv('../input_ext/city_stats.csv')
city_stats.drop('Wojew√≥dztwo', axis=1, inplace=True)
city_stats.columns = ['city', 'county', 'city_area', 'city_population', 'city_density']

province_stats = pd.read_csv('../input_ext/province_stats.csv')
province_stats.drop('Lp.', axis=1, inplace=True)
province_stats.columns = ['province', 'province_population', 'province_men_population', 'province_women_population']

df_train.shape, df_test.shape, df_org.shape

((46489, 53), (46275, 52), (92764, 53))

In [13]:
df = feature_engineering(df_org.copy())
print(df.shape)
df.sample(3)

(92764, 343)


Unnamed: 0,id,location,is_private,piekarnik,gara≈º,monitoring / ochrona,rolety antyw≈Çamaniowe,kuchenka,taras,balkon,ogr√≥dek,dwupoziomowe,system alarmowy,pom. u≈ºytkowe,klimatyzacja,tarasy,teren zamkniƒôty,internet,winda,telefon,pralka,piwnica,ochrona,telewizja kablowa,telewizor,lod√≥wka,domofon / wideofon,oddzielna kuchnia,zmywarka,gara≈º/miejsce parkingowe,meble,drzwi / okna antyw≈Çamaniowe,plan zagospodarowania:,price,area,rooms,floor,floors_in_building,dostƒôpne od,rok budowy,materia≈Ç budynku,okna,data rozpoczƒôcia,stan inwestycji,liczba kondygnacji,stan wyko≈Ñczenia,czynsz,rodzaj zabudowy,ogrzewanie,forma w≈Çasno≈õci,stats,text,is_primary_market,czynsz_num,area_num,area_norm,area_num_log,price_m2,province,city,floors_in_building_num,build_year,floor_num,county,city_area,city_population,city_density,province_population,province_men_population,province_women_population,miasta_woj,mean_city_price,median_city_price,mean_county_price,median_county_price,city_cat,county_cat,province_cat,build_material_cat,window_cat,property_completion_cat,property_type_cat,property_heating_cat,own_property_cat,ID,created_at,updated_at,visit_ads,created_at_cat,visit_ads_num,loc0,loc1,loc2,loc3,loc4,loc01,loc012,loc12,loc0_cat,loc1_cat,loc2_cat,loc3_cat,loc4_cat,loc01_cat,loc012_cat,loc12_cat,security,area_per_room,build_year_norm,Sopot,big_city,Wroc≈Çaw,Pozna≈Ñ,Czƒôstochowa,Gda≈Ñsk,Kalisz,Opole,Olsztyn,Gdynia,Katowice,Warszawa,≈Å√≥d≈∫,Krak√≥w,Szczecin,tarnog√≥rskiTarnowskie G√≥ry,SzczecinWarszewo,Katowice≈ör√≥dmie≈õcie,Zielona G√≥raCentrum,Wroc≈ÇawZ≈Çotniki,Krak√≥wWola Justowska,wejherowskiRumia,Bia≈ÇystokAntoniuk,Gda≈ÑskJelitkowo,KatowiceJ√≥zefowiec,KatowiceKostuchna,stargardzkiStargard,Bia≈ÇystokCentrum,Gda≈Ñsk≈Åostowice,Pozna≈ÑJe≈ºyce,≈Å√≥d≈∫Widzew,GdyniaRed≈Çowo,Bia≈ÇystokWysoki Stoczek,WarszawaBielany,Krak√≥wWzg√≥rza Krzes≈Çawickie,WarszawaUrsus,Wroc≈Çaw≈ör√≥dmie≈õcie,Tychy≈ªwak√≥w,≈õwidnicki≈öwidnica,Warszawa≈ªoliborz,SopotDolny,LublinWƒôglinek,pozna≈ÑskiLubo≈Ñ,Krak√≥wDƒôbniki,BydgoszczGlinki,Warszawa≈ör√≥dmie≈õcie,Opole≈ör√≥dmie≈õcie,Krak√≥wMistrzejowice,namys≈ÇowskiNamys≈Ç√≥w,Pozna≈ÑPiƒÖtkowo,Wroc≈ÇawPsie Pole,SzczecinGumie≈Ñce,WarszawaTarg√≥wek,Gda≈ÑskUje≈õcisko,BydgoszczOsiedle Le≈õne,Gda≈Ñsk≈ör√≥dmie≈õcie,BydgoszczFordon,≈Å√≥d≈∫G√≥rna,Gda≈ÑskKowale,Wroc≈ÇawStare Miasto,Gda≈ÑskOliwa,GliwiceCentrum,pozna≈ÑskiZalasewo,ostrowskiOstr√≥w Wielkopolski,BydgoszczBielawy,Toru≈ÑJakubskie Przedmie≈õcie,Gda≈ÑskPrzymorze,Krak√≥wKurdwan√≥w,WarszawaUrsyn√≥w,Toru≈ÑKoniuchy,wejherowskiWejherowo,DƒÖbrowa G√≥rniczaGo≈Çon√≥g,Zielona G√≥ra,pozna≈ÑskiSwarzƒôdz,Gliwice,Pozna≈ÑWilczak,LublinLSM,KatowicePiotrowice,SosnowiecPogo≈Ñ,WarszawaWola,LublinCentrum,GdyniaOb≈Çu≈ºe,bƒôdzi≈ÑskiBƒôdzin,Krak√≥wBronowice Ma≈Çe,SosnowiecCentrum,≈Å√≥d≈∫Polesie,Pozna≈ÑStare Miasto,WarszawaBia≈Ço≈Çƒôka,SosnowiecZag√≥rze,LublinCzuby,WarszawaMokot√≥w,CzƒôstochowaParkitka,Krak√≥wPodg√≥rze,krakowskiSkawina,Wroc≈ÇawMa≈õlice,LublinSzerokie,SzczecinNiebuszewo,Wroc≈ÇawKlecina,wroc≈ÇawskiKie≈Çcz√≥w,e≈ÇckiE≈Çk,LublinWrotk√≥w,BydgoszczG√≥rzyskowo,Pozna≈ÑWilda,Gda≈ÑskMorena,g≈ÇogowskiG≈Çog√≥w,Krak√≥wBie≈ºan√≥w-Prokocim,Krak√≥wBronowice,Rzesz√≥wS≈Çocina,piaseczy≈ÑskiPiaseczno,GdyniaOksywie,KielceCentrum,Pozna≈ÑRataje,Wroc≈ÇawStab≈Çowice,Wroc≈ÇawGaj,KatowiceOsiedle Paderewskiego,tczewskiTczew,wroc≈ÇawskiSiechnice,Toru≈ÑNa Skarpie,Wroc≈ÇawPlac Grunwaldzki,BydgoszczB≈Çonie,Krak√≥wRuczaj,piaseczy≈ÑskiJ√≥zefos≈Çaw,WarszawaPraga-P√≥≈Çnoc,gda≈ÑskiPruszcz Gda≈Ñski,Wroc≈Çawul. Brzoskwiniowa,SzczecinCentrum,≈Å√≥d≈∫Ba≈Çuty,Pozna≈ÑWinogrady,GdyniaOr≈Çowo,Pozna≈ÑGrunwald,lƒôborskiLƒôbork,GdyniaWitomino,WarszawaBemowo,WarszawaOchota,Pozna≈ÑBu≈Çgarska 59,WarszawaSaska Kƒôpa,pruszkowskiPruszk√≥w,Toru≈ÑMokre,BydgoszczCentrum,Krak√≥wSalwator,WarszawaGoc≈Çaw,BydgoszczKapu≈õciska,Krak√≥wPrƒÖdnik Czerwony,Szczecin≈ör√≥dmie≈õcie,KatowiceBryn√≥w,Krak√≥wKrowodrza,Krak√≥wNowa Huta,Pozna≈ÑG√≥rczyn,≈Å√≥d≈∫≈ör√≥dmie≈õcie,Wroc≈ÇawTarnogaj,Bia≈ÇystokBojary,Krak√≥w≈ör√≥dmie≈õcie,Gda≈ÑskJasie≈Ñ,KatowiceOsiedle TysiƒÖclecia,dzier≈ºoniowskiDzier≈ºoni√≥w,kamie≈ÑskiMiƒôdzyzdroje,Wroc≈ÇawSwojczyce,Bia≈ÇystokNowe Miasto,Toru≈ÑChe≈Çmi≈Ñskie Przedmie≈õcie,GrudziƒÖdz,puckiJastarnia,inowroc≈ÇawskiInowroc≈Çaw,lubi≈ÑskiLubin,Pozna≈ÑNowe Miasto,wejherowskiReda,Krak√≥wGrzeg√≥rzki,Krak√≥wStare Podg√≥rze,BytomCentrum,BydgoszczWy≈ºyny,tatrza≈ÑskiZakopane,wo≈Çomi≈ÑskiMarki,Kielce,bƒôdzi≈ÑskiSiewierz,Krak√≥wCzy≈ºyny,WarszawaWawer,Gda≈ÑskWrzeszcz,SzczecinPogodno,Gorz√≥w WielkopolskiG√≥rczyn,Toru≈ÑRubinkowo,Toru≈ÑBydgoskie Przedmie≈õcie,Pozna≈ÑPodolany,Wroc≈ÇawMuchob√≥r Wielki,Wroc≈ÇawJagodno,Krak√≥w≈Åobz√≥w,KatowiceDolina Trzech Staw√≥w,Gdynia≈ör√≥dmie≈õcie,wo≈Çomi≈ÑskiZƒÖbki,WarszawaW≈Çochy,Bydgoszcz≈ör√≥dmie≈õcie,Pozna≈ÑCentrum,LublinFelin,GliwiceStare Gliwice,CzƒôstochowaTysiƒÖclecie,Kielce≈ölichowice,Wroc≈ÇawHuby,Pozna≈ÑNaramowice,Gda≈ÑskChe≈Çm,BydgoszczSzwederowo,Wroc≈ÇawKrzyki,WarszawaPowi≈õle,Krak√≥wStare Miasto,Gda≈ÑskSiedlce,Rzesz√≥w,Krak√≥wKazimierz,LublinDziesiƒÖta,wielickiWieliczka,BydgoszczBartodzieje,Krak√≥wKliny-Zacisze,Gda≈ÑskStare Miasto,Wroc≈ÇawFabryczna,Pozna≈Ñ≈Åazarz,Krak√≥wOlsza,Rzesz√≥wDrabinianka,ko≈ÇobrzeskiKo≈Çobrzeg,WarszawaWilan√≥w,SopotG√≥rny,Gda≈ÑskZaspa,WarszawaPraga-Po≈Çudnie,Krak√≥wPrƒÖdnik Bia≈Çy,GdyniaMa≈Çy Kack,Krak√≥wWola Duchacka,KatowiceWe≈Çnowiec,ZabrzeCentrum,Lublin≈ör√≥dmie≈õcie,is_primary_market_rooms,is_primary_market_rooms_cat,is_primary_market_city,is_primary_market_city_cat,is_primary_market_rodzaj zabudowy,is_primary_market_rodzaj zabudowy_cat,mean_is_primary_market_rooms_price_m2,median_is_primary_market_rooms_price_m2,mean_is_primary_market_rodzaj zabudowy_price_m2,median_is_primary_market_rodzaj zabudowy_price_m2
74040,55250,"[≈õlƒÖskie, Bytom, Miechowice]",0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,60 m¬≤,3,4,(z 4),,,inne,,,,,do zamieszkania,,blok,miejskie,sp√≥≈Çdzielcze w≈Ç. z kw,"{'ID': 113014296, 'visit_ads': '55', 'created_...",Opis Oferujemy do sprzeda≈ºy przestronne trzypo...,False,,60.0,60.0,4.094345,,≈õlƒÖskie,Bytom,4.0,-1,4,Bytom[a],69.44,168394.0,2425.0,4570849,2204972,2365877,False,148897.3125,145000.0,148897.3125,145000.0,371,222,11,4,-1,1,1,1,1,113014296,07.11.2018,07.11.2018,55.0,2,4.174387,≈õlƒÖskie,Bytom,Miechowice,,,≈õlƒÖskieBytom,≈õlƒÖskieBytomMiechowice,BytomMiechowice,11,217,2289,1,0,223,2548,2548,False,20.0,1900.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False_3,4,False_Bytom,543,False_blok,3,6013.719309,5395.683453,5587.835882,5201.773009
83926,75161,"[≈õlƒÖskie, Gliwice, ≈ªerniki]",0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,,"51,43 m¬≤",2,parter,(z 2),,,,plastikowe,,,,,530 z≈Ç,,,,"{'ID': 111452864, 'visit_ads': '175', 'created...",Opis Na sprzeda≈º wyremontowane i umeblowane mi...,False,530.0,51.43,51.43,3.940222,,≈õlƒÖskie,Gliwice,2.0,-1,0,Gliwice[a],133.88,181309.0,1354.0,4570849,2204972,2365877,False,266079.625,237380.0,266079.625,237380.0,365,217,11,-1,0,-1,-1,-1,-1,111452864,ponad 14 dni temu,ponad 14 dni temu,175.0,18,5.220356,≈õlƒÖskie,Gliwice,≈ªerniki,,,≈õlƒÖskieGliwice,≈õlƒÖskieGliwice≈ªerniki,Gliwice≈ªerniki,11,212,173,1,0,218,3343,3343,False,25.715,1900.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False_2,3,False_Gliwice,530,False_nan,2,6437.640518,5813.953488,6265.400057,5605.633803
68156,43530,"[ma≈Çopolskie, Krak√≥w, Stare Miasto, ul. Krowod...",0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,True,False,False,False,True,False,False,,"67,30 m¬≤",2,1,(z 4),,1900.0,ceg≈Ça,plastikowe,,,,do zamieszkania,300 z≈Ç,kamienica,miejskie,pe≈Çna w≈Çasno≈õƒá,"{'ID': 108889064, 'created_at': 'ponad 14 dni ...",Opis Biuro Nieruchomo≈õci Stare Miasto prezentu...,False,300.0,67.3,67.3,4.20916,,ma≈Çopolskie,Krak√≥w,4.0,1900,1,Krak√≥w[a],326.85,767348.0,2348.0,3372618,1636707,1735911,True,529502.5,416305.0,529502.5,416305.0,162,100,5,1,0,1,2,1,0,108889064,ponad 14 dni temu,04.11.2018,,18,-1.0,ma≈Çopolskie,Krak√≥w,Stare Miasto,ul. Krowoderska,,ma≈ÇopolskieKrak√≥w,ma≈ÇopolskieKrak√≥wStare Miasto,Krak√≥wStare Miasto,5,95,26,2623,0,97,933,933,False,33.65,1900.0,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False_2,3,False_Krak√≥w,222,False_kamienica,5,6437.640518,5813.953488,6034.568709,4906.693992


# BASIC MODEL - Kaggle Score: 182576.93612

In [11]:
feats = ['id']
model = DummyRegressor()
model.fit(df_train[feats].values, df_train['price'])
y_pred = model.predict(df_test[feats].values)

df_test['price'] = y_pred
df_test[ ['id', 'price'] ].to_csv('../output/DummyRegressor.csv', index=False)

In [265]:
#msno.bar(df_train.iloc[:,40:50])

# ‚≠ê DAY 4 MODEL = Score: 51517.18706 Public score: 54286.18086

## ü§ñ MLflow

In [47]:
!mlflow ui --backend-store-uri file:///home/jovyan/dwsolutions/property_warsaw/notebooks/mlruns

[2022-06-19 19:10:06 +0000] [193] [INFO] Starting gunicorn 20.1.0
[2022-06-19 19:10:06 +0000] [193] [INFO] Listening at: http://127.0.0.1:5000 (193)
[2022-06-19 19:10:06 +0000] [193] [INFO] Using worker: sync
[2022-06-19 19:10:06 +0000] [195] [INFO] Booting worker with pid: 195
^C
[2022-06-19 19:13:52 +0000] [193] [INFO] Handling signal: int
[2022-06-19 19:13:53 +0000] [195] [INFO] Worker exiting (pid: 195)


 ### [Dashboard](/hub/user-redirect/proxy/5000/) 

In [35]:
start_experiment(df, 'catboost_hyper', scoring=mean_absolute_error, filename='catboost', results=True, export=False, mlflow_save=False)

Working on "catboost_0" experiment


0it [00:30, ?it/s]


(50700.473806202695, 0.0)


Weight,Feature
0.1042,area_num_log
0.1014,area_num
0.0839,median_city_price
0.0556,mean_county_price
0.0527,mean_city_price
0.0480,median_is_primary_market_rodzaj zabudowy_price_m2
0.0427,area_norm
0.0366,build_year
0.0300,median_county_price
0.0277,visit_ads_num


In [61]:
start_experiment(df,
                 'catboost_hyper',
                 scoring=mean_absolute_error,
                 filename='catboost',
                 results=False,
                 export=True,
                 mlflow_save=False)

Working on "catboost_5" experiment
catboost_5.csv save successfully in "output" folder


Weight,Feature
0.1058,area_num_log
0.1006,area_num
0.0843,median_city_price
0.0540,mean_county_price
0.0532,mean_city_price
0.0511,median_is_primary_market_rodzaj zabudowy_price_m2
0.0429,area_norm
0.0359,build_year
0.0306,median_county_price
0.0275,visit_ads_num


In [52]:
start_experiment(df,
                 'catboost_hyper',
                 scoring=mean_absolute_error,
                 filename='catboost',
                 results=True,
                 export=False,
                 mlflow_save=True)

Working on "catboost_3" experiment


0it [00:00, ?it/s]


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [12]:
df['winda_test'] = df["text"].map(lambda x: True if ('winda' in x) | ('windƒÖ' in x) | ('windy' in x) | ('windƒô' in x) else False)
df['winda_test']

0        False
1        False
2        False
3         True
4         True
         ...  
92759    False
92760    False
92761     True
92762    False
92763     True
Name: winda_test, Length: 92764, dtype: bool

In [19]:
df['winda_ext'] = df.apply(lambda x: True if (x['winda']) | (x['winda_test']) else False, axis=1)