In [12]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.dummy import DummyRegressor

import eli5
import catboost

import mlflow
import missingno as msno
import gc
from os.path import exists
from tqdm import tqdm

pd.set_option('display.max_columns', None)

### Defining function used for experiments

In [24]:
def get_or_create_experiment(experiment_name):
    """Checks if mlflow experiment already exist (if not creates it) and return it."""
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if not experiment:
        mlflow.create_experiment(name)
        return mlflow.get_experiment_by_name(experiment_name)        
    return experiment


def _eid(experiment_name):
    return get_or_create_experiment(experiment_name).experiment_id


def get_filename(basename):
    """Creates unique filename to prevent overwriting files in target directory when exporting the forecast."""
    counter=0
    name = '{}_{}'.format(basename, counter)  # filename in format <basename>_<number>.csv
    # Adds +1 to <number> until it gets unique filename
    while exists('../output/{}.csv'.format(name)):
        counter +=1
        filename = '{}_{}'.format(basename, counter)        
    return filename


def get_X_y_log(df, feats):
    """Creates feature matrix, target and logarihtmic vector for model training."""
    X = df[feats]
    y = df['price']
    y_log = np.log(y)    
    return X, y, y_log
    

def get_feats(df, blacklist_extended):
    """Creates training features by taking all number and boolean columns 
    from provided dataframe, reduced by those in blacklist.
    """    
    blacklist = [
        'price',
        'id', 
        'price_m2',
    ]
    # Extra blacklist feature can be added as list in function argument. 
    if blacklist_extended:
        blacklist = blacklist + blacklist_extended       
                         
    num_bool_feats = df.select_dtypes(['number', 'bool'])
    return [x for x in num_bool_feats if x not in blacklist]


def get_model(model_or_id):
    """Returns one of two default models, or alternative one, provided as function argument."""
    if model_or_id == 'catboost_hyper':  # hyper parameters tuned for better forecast, but longer model training.
        model_params = dict(             
            max_depth=8,
            n_estimators=1000,
            learning_rate=0.3,
            random_state=0,
            silent=True,
        )
        return catboost.CatBoostRegressor(**model_params)
    
    elif model_or_id == 'catboost_light':  # hyper parameters tuned for quick results.
        model_params = dict(            
            max_depth=5,
            n_estimators=100,
            random_state=0,
            silent=True,
        )
        return catboost.CatBoostRegressor(**model_params)
    else:
        return model_or_id


def mlflow_experiment(run_name, model, X, feats, result, eli5_result):
    """Uses MLflow tool to record and track important information about experiments.
    Such as model parameters, shape of feature and targe matrix, feats used for model training or eli5 importance of the features
    """
    with mlflow.start_run(experiment_id=_eid('dw_solution_property'), run_name=run_name) as run:
    
        mlflow.log_params(model.get_params())
        mlflow.log_param("model", str(model).split("(")[0])
        mlflow.log_param("feats", feats)
        mlflow.log_param('X.shape', X.shape)
        
        # artifacts
        with open('../outputs/eli5.html', 'w') as f:
            f.write('<html>{}</html>'.format(eli5_result.data))
        mlflow.log_artifact('../outputs/eli5.html', 'plot')
                
        # metrics
        mlflow.log_metric('mae_mean', result[0])
        mlflow.log_metric('mae_std', result[1])
              
    print(f'Experiment {run_name} recorded')

    
def check_model(X, y, model, scoring):
    """Evaluate model based on selected metric."""
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = []
    for train_idx, test_idx in tqdm(cv.split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_log_train, y_test = np.log(y).iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_log_train)
        y_log_pred = model.predict(X_test)
        y_pred = np.exp(y_log_pred)

        score = scoring(y_test, y_pred)
        scores.append(score)
        
        return np.mean(scores), np.std(scores)
    
    
def save_forecast_to_csv(model, X_test, df_test, filename):
    """Exports forecast to csv file"""
    y_pred = model.predict(X_test)
    df_test['price'] = np.exp(y_pred)
    
    try:
        df_test[ ['id', 'price'] ].to_csv('../output/{}.csv'.format(filename), index=False)
        print('{}.csv save successfully in "output" folder'.format(filename))
    except:
        print('{}.csv saving ERROR in "output" folder'.format(filename))
    
    
def start_experiment(df, model_or_id, scoring=mean_absolute_error,
                     filename='catboost',
                     results=True, export=False, mlflow_save=True, blacklist_extended=None):
    """Main function to conducts experiment.
    
    Main feature are:
    record experiment data using MLflow - "mlflow_save" argument,
    showing results based on selected metric - "results" argument,
    exporting forecast to csv file - "export" argument.
    All above argument are boolean and can be set independently according to expected output.
    Mlflow_save argument can be set to True only if results argument is also set to True.
    """
    if mlflow_save:
        assert results, '"mlflow_save" argument can be set to True only if "results" argument is also set to True'
        
    filename = get_filename(filename)    
    print(f'Working on "{filename}" experiment')
    
    # Splits dataset into train and test dataframes.     
    df_train = df[ df['price'].notnull() ].fillna(-1)  # takes all rows with "price" column NOT empty
    df_test = df[ df['price'].isnull() ].fillna(-1)  # # takes all rows with "price" column empty
    
    # Gets X and y (feature matrix and target vector). 
    feats = get_feats(df, blacklist_extended )
    X, y, y_log = get_X_y_log(df_train, feats)

    # Gets model
    model = get_model(model_or_id)
    
    # Returning results
    if results:
        result = check_model(X, y, model, scoring)
        
    model.fit(X.values, y_log.values)
    
    # Export forecast to csv.
    if export:
        X_test = df_test[feats].values
        save_forecast_to_csv(model, X_test, df_test, filename)    
  
    eli5_result = eli5.show_weights(model, feature_names=feats)
    
    #MLflow tracking
    if mlflow_save and results:
        mlflow_experiment(filename, model, X, feats, result, eli5_result)

    if results:
        print(result)
    return eli5_result  

## Feature engineering

In [14]:
def feature_engineering(df):
    
    def parse_czynsz(val):
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        if val[-1] == 'ł':
            return float(val.split('zł')[0].replace(' ', '').replace(',','.'))
        if val[-1] == 'r':
            return float(val.split('eur')[0].replace(' ', '').replace(',','.'))*4.5
    
    
    def parse_area(val):
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        return float(val.split('m')[0].replace(',','.').replace(' ',''))
    

    def parse_location_city(val):
        all_city = city_stats['city'].to_list()
        for city_ in reversed(val):
            if city_ in ['Dobra', 'Józefów']:
                continue
            if city_ in all_city:
                return city_
        return 'other'    
    

    def parse_floors_in_building(val):
        if isinstance(val, int): return -1
        if isinstance(val, float): return -1    
        floor = float(val.replace(')','').split()[1])
        return floor if floor < 20 else 25
    
    def df_groupby_feat(df, groupby_feats, feat):
        agg_params={
            'mean_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'mean'),
            'median_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'median'),
            #'sum_{}_price'.format('_'.join(groupby_feats)): ('price', 'sum')
        }
        return df[groupby_feats + [feat]].groupby(groupby_feats).agg(
            **agg_params
        ).reset_index()

    df['czynsz_num'] = df.czynsz.map(parse_czynsz)
    
    df['area_num'] = df.area.map(parse_area)
    area_num_99 = np.percentile(df['area_num'], 99)
    df['area_norm'] = df['area_num'].map(lambda x: x if x <= area_num_99 else area_num_99)
    df['area_num_log'] = np.log(df['area_num'])
    
    df['price_m2'] = df['price'] / df['area_num']
    
    df['province'] = df['location'].map(lambda x: x[0])
    df['city'] = df['location'].map(parse_location_city)
    df['floors_in_building_num'] = df['floors_in_building'].map(parse_floors_in_building)
    df['build_year'] = df['rok budowy'].fillna(-1).astype('int')
    
    floors_dict = {'parter': 0, '> 10': 11, 'poddasze': -2, 'suterena': -1}
    df['floor_num'] = df['floor'].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype('int')
    
    
    if 'city_area' not in df.columns:
        df = pd.merge(df, city_stats, on='city', how='left')
    if 'province_population' not in df.columns:
        df = pd.merge(df, province_stats, on='province', how='left')
    
    miasta_woj = ['Białystok', 'Bydgoszcz', 'Gdańsk', 'Gorzów Wielkopolski', 'Katowice', 'Kielce', 'Kraków', 'Lublin',
    'Łódź', 'Olsztyn', 'Opole', 'Poznań', 'Rzeszów', 'Szczecin', 'Toruń', 'Warszawa', 'Wrocław', 'Zielona Góra']
    
    df['miasta_woj'] = df['city'].isin(miasta_woj)
    
    #Agregacje
    groupby_city_price = df_groupby_feat(df, ['city'], 'price')        
    if 'median_city_price' not in df:
        df = pd.merge(df, groupby_city_price, on='city', how='left')
        
    groupby_county_price = df_groupby_feat(df, ['county'], 'price')        
    if 'median_county_price' not in df:
        df = pd.merge(df, groupby_county_price, on='county', how='left')
    
    #groupby_city_pricem2 = df_groupby_feat(df, ['city'], 'price_m2')        
    #if 'median_county_price_m2' not in df:
    #    df = pd.merge(df, groupby_city_pricem2, on='city', how='left')
        
    #groupby_county_pricem2 = df_groupby_feat(df, ['county'], 'price_m2')        
    #if 'median_county_price_m2' not in df:
    #    df = pd.merge(df, groupby_county_pricem2, on='county', how='left')
        
    #groupby_province_price = df_group_price(df, ['province'])        
    #if 'median_province_price' not in df:
    #    df = pd.merge(df, groupby_province_price, on='province', how='left') 
    

    df['city_cat'] = df['city'].factorize()[0]
    df['county_cat'] = df['county'].factorize()[0]
    df['province_cat'] = df['province'].factorize()[0]
    
    
    cat_feats = {
        "materiał budynku": "build_material_cat",
        "okna": "window_cat",
        "stan wykończenia": "property_completion_cat",
        "rodzaj zabudowy": "property_type_cat",
        "ogrzewanie": "property_heating_cat",
        "forma własności": "own_property_cat"
     }    
    
    for feat_name, feat_new_name in cat_feats.items():
        df[feat_new_name] = df[feat_name].factorize()[0]
        
        #OHE
        df_dummies = pd.get_dummies(df[feat_name])
        df_dummies.columns = ['{0}_{1}'.format(feat_new_name, x) for x in df_dummies.columns]
        pd.concat([df, df_dummies], axis=1)
        
    stats = df['stats'].apply(pd.Series)
    df = pd.concat([df, stats], axis=1)
    dict_created_at={
        'ponad 14 dni temu':18,  
        '23.10.2018':17,
        '24.10.2018':16,
        '25.10.2018':15,
        '26.10.2018':14, 
        '27.10.2018':13,
        '28.10.2018':12,
        '29.10.2018':11,
        '30.10.2018':10,
        '31.10.2018':9,
        '01.11.2018':8,    
        '02.11.2018':7,
        '03.11.2018':6,
        '04.11.2018':5,
        '05.11.2018':4,
        '06.11.2018':3,
        '07.11.2018':2,
        '08.11.2018':1,
    }
        
    df['created_at_cat'] = df['created_at'].map(dict_created_at)
    #df['created_at_cat'] = df['created_at'].factorize()[0]
    #df['visit_ads_num'] = df.visit_ads.fillna(-1).astype('int')  
    df['visit_ads_num'] = df.visit_ads.map(lambda x: np.log(int(x) + 10) if not isinstance(x, float)  else -1)
    
    
    #Starter3
    #Location
    for i in range(5):
        df["loc{}".format(i)] = df["location"].map(lambda x: x[i] if len(x) > i else "")      
    
    df['loc01'] = df['loc0'] + df['loc1']
    df['loc012'] = df['loc0'] + df['loc1'] + df['loc2']
    df['loc12'] = df['loc1'] + df['loc2']
    
    #cat location
    for i in range(5):
        df["loc{}_cat".format(i)] = df["loc{}".format(i)].factorize()[0]
    df["loc01_cat"] = df['loc01'].factorize()[0]
    df["loc012_cat"] = df['loc012'].factorize()[0] 
    df["loc12_cat"] = df['loc12'].factorize()[0] 
    
    def groupby_location(df, loc, feat):
        groupby_feat = df_groupby_feat(df, [loc], feat)        
        if 'median_{}_price'.format(loc) not in df:
            return pd.merge(df, groupby_feat, on='{}'.format(loc), how='left')
    
    #df = groupby_location(df, 'loc01', 'price')    
    #df = groupby_location(df, 'loc12')
    
    #Starter 4
    
    df['security'] = df['system alarmowy'] | df['rolety antywłamaniowe'] | df['drzwi / okna antywłamaniowe']
    
    df['area_per_room'] = df['area_norm'] / df["rooms"]
    
    years = [1970, 1980, 1990, 2000, 2005, 2010, 2012, 2014, 2016, 2017]
    
    def build_year_norm(year):
        if year < 1970: return 1900
        if year > 2017: return 2018

        for idx in range(len(years) - 1):
            if years[idx+1] > year >= years[idx]:
                return years[idx]

    df["build_year_norm"] = df["build_year"].map(build_year_norm)

    
    big_cities = {'Poznań', 'Sopot', 'Wrocław', 'Kraków', 'Gdańsk', 'Gdynia', 'Opole', 'Katowice',  'Częstochowa', 'Szczecin', 'Kalisz', 'Łódź', 'Olsztyn', 'Warszawa'}
    for city in big_cities:
        df[city] = df['city'] == city
        df['big_city'] = df['city'].map(lambda x: x in big_cities)
        
    
    #location
    df_val_cnts = df['loc12'].value_counts()
    loc12_vals = set(df_val_cnts[ df_val_cnts > 100].index.values)
    for item in loc12_vals:
        df[item] = df['loc12'] == item
        
    
    #primary_market
    
    def is_primary_market_conc(df, feat):
        df['is_primary_market_{}'.format(feat)] = df[ ['is_primary_market', feat] ].apply(
            lambda x: '{}_{}'.format(x['is_primary_market'], x[feat]), axis=1
        )
        df['is_primary_market_{}_cat'.format(feat)] = df['is_primary_market_{}'.format(feat)].factorize()[0]

        return df
    
    df = is_primary_market_conc(df, 'rooms')
    df = is_primary_market_conc(df, 'city')
    df = is_primary_market_conc(df, 'rodzaj zabudowy') 
    
    #agregacje price_m2
    groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_rooms'], 'price_m2')
    df = pd.merge(df, groupby_price_m2, on='is_primary_market_rooms', how='left')
    
    #groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_city'], 'price_m2')
    #df = pd.merge(df, groupby_price_m2, on='is_primary_market_city', how='left')
    
    groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_rodzaj zabudowy'], 'price_m2')
    df = pd.merge(df, groupby_price_m2, on='is_primary_market_rodzaj zabudowy', how='left')
    

                                                                   
    return df

In [6]:
ls ../input_ext/

city_stats.csv  region_stats.csv


In [142]:
ls ../input/ 

big_train_warsaw_property.h5  [0m[01;36mtrain_data.h5[0m@
test_warsaw_property.h5       train_warsaw_property.h5


In [15]:
df_train = pd.read_hdf('../input/big_train_warsaw_property.h5')
df_test = pd.read_hdf('../input/test_warsaw_property.h5')
df_org = pd.concat([df_train, df_test])

#External city and region data (Wikipedia)
city_stats = pd.read_csv('../input_ext/city_stats.csv')
city_stats.drop('Województwo', axis=1, inplace=True)
city_stats.columns = ['city', 'county', 'city_area', 'city_population', 'city_density']

province_stats = pd.read_csv('../input_ext/province_stats.csv')
province_stats.drop('Lp.', axis=1, inplace=True)
province_stats.columns = ['province', 'province_population', 'province_men_population', 'province_women_population']

print(df_train.shape, df_test.shape, df_org.shape)

del df_train, df_test
gc.collect

(46489, 53) (46275, 52) (92764, 53)


<function gc.collect(generation=2)>

# BASIC MODEL - Kaggle Score: 182576.93612

In [16]:
df = feature_engineering(df_org.copy())
print(df.shape)
df.sample(3)

(92764, 343)


Unnamed: 0,id,location,is_private,piekarnik,garaż,monitoring / ochrona,rolety antywłamaniowe,kuchenka,taras,balkon,ogródek,dwupoziomowe,system alarmowy,pom. użytkowe,klimatyzacja,tarasy,teren zamknięty,internet,winda,telefon,pralka,piwnica,ochrona,telewizja kablowa,telewizor,lodówka,domofon / wideofon,oddzielna kuchnia,zmywarka,garaż/miejsce parkingowe,meble,drzwi / okna antywłamaniowe,plan zagospodarowania:,price,area,rooms,floor,floors_in_building,dostępne od,rok budowy,materiał budynku,okna,data rozpoczęcia,stan inwestycji,liczba kondygnacji,stan wykończenia,czynsz,rodzaj zabudowy,ogrzewanie,forma własności,stats,text,is_primary_market,czynsz_num,area_num,area_norm,area_num_log,price_m2,province,city,floors_in_building_num,build_year,floor_num,county,city_area,city_population,city_density,province_population,province_men_population,province_women_population,miasta_woj,mean_city_price,median_city_price,mean_county_price,median_county_price,city_cat,county_cat,province_cat,build_material_cat,window_cat,property_completion_cat,property_type_cat,property_heating_cat,own_property_cat,ID,created_at,updated_at,visit_ads,created_at_cat,visit_ads_num,loc0,loc1,loc2,loc3,loc4,loc01,loc012,loc12,loc0_cat,loc1_cat,loc2_cat,loc3_cat,loc4_cat,loc01_cat,loc012_cat,loc12_cat,security,area_per_room,build_year_norm,Wrocław,big_city,Opole,Warszawa,Kalisz,Częstochowa,Olsztyn,Sopot,Katowice,Poznań,Szczecin,Kraków,Gdynia,Łódź,Gdańsk,LublinCentrum,KrakówKazimierz,poznańskiSwarzędz,WarszawaWilanów,CzęstochowaTysiąclecie,wrocławskiKiełczów,ostrowskiOstrów Wielkopolski,GdańskOliwa,BiałystokWysoki Stoczek,wrocławskiSiechnice,KatowiceBrynów,wejherowskiWejherowo,PoznańBułgarska 59,GliwiceStare Gliwice,LublinSzerokie,OpoleŚródmieście,KrakówBronowice,wejherowskiRumia,BiałystokNowe Miasto,WrocławKrzyki,KatowiceDolina Trzech Stawów,SosnowiecZagórze,LublinWęglinek,KatowiceKostuchna,GdyniaOksywie,WrocławHuby,ToruńMokre,GdańskChełm,GdańskŚródmieście,KrakówBronowice Małe,WrocławŚródmieście,BydgoszczBłonie,Zielona Góra,BiałystokCentrum,gdańskiPruszcz Gdański,WarszawaBiałołęka,piaseczyńskiJózefosław,WarszawaTargówek,WrocławMaślice,KrakówDębniki,GdyniaWitomino,RzeszówSłocina,ToruńBydgoskie Przedmieście,PoznańGórczyn,ToruńKoniuchy,BydgoszczKapuściska,WarszawaPraga-Północ,RzeszówDrabinianka,kamieńskiMiędzyzdroje,świdnickiŚwidnica,Zielona GóraCentrum,WrocławStare Miasto,KielceŚlichowice,GdańskSiedlce,WrocławSwojczyce,WrocławKlecina,BydgoszczŚródmieście,WarszawaSaska Kępa,BydgoszczBielawy,GdyniaObłuże,PoznańWinogrady,stargardzkiStargard,PoznańŁazarz,PoznańPodolany,GdyniaŚródmieście,PoznańCentrum,tatrzańskiZakopane,WarszawaOchota,SzczecinGumieńce,WrocławPlac Grunwaldzki,GliwiceCentrum,WarszawaBemowo,KrakówBieżanów-Prokocim,KielceCentrum,PoznańNowe Miasto,GdyniaMały Kack,WarszawaŻoliborz,SzczecinWarszewo,KrakówKliny-Zacisze,głogowskiGłogów,GdańskUjeścisko,WrocławJagodno,WarszawaGocław,Dąbrowa GórniczaGołonóg,WrocławGaj,LublinDziesiąta,BiałystokBojary,inowrocławskiInowrocław,wołomińskiZąbki,ToruńNa Skarpie,ŁódźPolesie,KrakówRuczaj,GdańskJelitkowo,PoznańWilczak,GdańskStare Miasto,KrakówSalwator,KrakówMistrzejowice,BiałystokAntoniuk,Grudziądz,GdyniaRedłowo,PoznańRataje,BydgoszczBartodzieje,PoznańStare Miasto,PoznańWilda,WarszawaPowiśle,tczewskiTczew,WrocławFabryczna,WarszawaWawer,będzińskiBędzin,pruszkowskiPruszków,GdańskJasień,KrakówŁobzów,ŁódźWidzew,krakowskiSkawina,lęborskiLębork,KrakówWzgórza Krzesławickie,KrakówPodgórze,puckiJastarnia,WarszawaUrsynów,kołobrzeskiKołobrzeg,WarszawaMokotów,ToruńRubinkowo,KrakówStare Miasto,BydgoszczSzwederowo,BydgoszczGlinki,KatowiceOsiedle Paderewskiego,ToruńChełmińskie Przedmieście,dzierżoniowskiDzierżoniów,GdyniaOrłowo,Gliwice,CzęstochowaParkitka,LublinCzuby,SopotDolny,tarnogórskiTarnowskie Góry,ZabrzeCentrum,KrakówStare Podgórze,WrocławZłotniki,Gorzów WielkopolskiGórczyn,piaseczyńskiPiaseczno,KrakówWola Duchacka,BydgoszczOsiedle Leśne,LublinLSM,LublinFelin,KrakówKrowodrza,GdańskWrzeszcz,będzińskiSiewierz,KatowiceŚródmieście,PoznańGrunwald,ŁódźGórna,KatowiceOsiedle Tysiąclecia,TychyŻwaków,PoznańNaramowice,KrakówŚródmieście,BytomCentrum,WrocławStabłowice,ełckiEłk,KatowicePiotrowice,KrakówWola Justowska,KrakówNowa Huta,BydgoszczFordon,SzczecinCentrum,wołomińskiMarki,KrakówCzyżyny,ToruńJakubskie Przedmieście,WarszawaPraga-Południe,KrakówPrądnik Biały,GdańskŁostowice,lubińskiLubin,SzczecinPogodno,WrocławTarnogaj,GdańskKowale,WarszawaŚródmieście,SosnowiecCentrum,Rzeszów,KrakówOlsza,WarszawaWłochy,KatowiceWełnowiec,LublinŚródmieście,namysłowskiNamysłów,WarszawaBielany,Kielce,GdańskPrzymorze,BydgoszczCentrum,BydgoszczGórzyskowo,WarszawaUrsus,GdańskZaspa,wejherowskiReda,ŁódźBałuty,LublinWrotków,GdańskMorena,BydgoszczWyżyny,KrakówGrzegórzki,ŁódźŚródmieście,PoznańJeżyce,poznańskiZalasewo,SosnowiecPogoń,wielickiWieliczka,WrocławPsie Pole,SzczecinŚródmieście,KrakówPrądnik Czerwony,SzczecinNiebuszewo,PoznańPiątkowo,KatowiceJózefowiec,WarszawaWola,KrakówKurdwanów,SopotGórny,poznańskiLuboń,Wrocławul. Brzoskwiniowa,WrocławMuchobór Wielki,is_primary_market_rooms,is_primary_market_rooms_cat,is_primary_market_city,is_primary_market_city_cat,is_primary_market_rodzaj zabudowy,is_primary_market_rodzaj zabudowy_cat,mean_is_primary_market_rooms_price_m2,median_is_primary_market_rooms_price_m2,mean_is_primary_market_rodzaj zabudowy_price_m2,median_is_primary_market_rodzaj zabudowy_price_m2
22279,44366,"[małopolskie, Kraków, Bronowice, Gabrieli Zapo...",0,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,True,False,False,False,False,True,False,False,False,True,False,True,True,False,False,420000.0,58 m²,2,1,(z 10),,,,plastikowe,,,,do zamieszkania,370 zł,blok,gazowe,,"{'ID': 108403032, 'visit_ads': '463', 'created...",Opis Nowodworski Estates ma przyjemność zaprez...,False,370.0,58.0,58.0,4.060443,7241.37931,małopolskie,Kraków,10.0,-1,1,Kraków[a],326.85,767348.0,2348.0,3372618,1636707,1735911,True,529502.5,416305.0,529502.5,416305.0,162,100,5,-1,0,1,1,0,-1,108403032,ponad 14 dni temu,01.11.2018,463.0,18,6.159095,małopolskie,Kraków,Bronowice,Gabrieli Zapolskiej,,małopolskieKraków,małopolskieKrakówBronowice,KrakówBronowice,5,95,553,4298,0,97,961,961,False,29.0,1900.0,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False_2,3,False_Kraków,222,False_blok,3,6437.640518,5813.953488,5587.835882,5201.773009
6975,13911,"[kujawsko-pomorskie, Grudziądz, Rządz, ul. Syb...",-1,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,True,False,False,False,True,False,False,False,False,False,True,False,False,False,279461.5,"63,95 m²",3,parter,(z 4),,2019.0,,plastikowe,,,,do wykończenia,450 zł,blok,miejskie,pełna własność,"{'ID': 109429584, 'created_at': 'ponad 14 dni ...",Opis NOWY RZĄDZ- NOWA INWESTYCJA!!! Nowe miesz...,True,450.0,63.95,63.95,4.158102,4370.0,kujawsko-pomorskie,Grudziądz,4.0,2019,0,Grudziądz[a],57.76,95629.0,1656.0,2086210,1010973,1075237,False,236839.328125,219226.5,236839.328125,219226.5,17,16,1,-1,0,0,1,1,0,109429584,ponad 14 dni temu,02.11.2018,,18,-1.0,kujawsko-pomorskie,Grudziądz,Rządz,ul. Sybiraków,,kujawsko-pomorskieGrudziądz,kujawsko-pomorskieGrudziądzRządz,GrudziądzRządz,1,16,129,1750,0,16,139,139,False,21.316667,2018.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True_3,0,True_Grudziądz,22,True_blok,8,6057.419592,5600.0,5915.493739,5690.000186
56388,19892,"[dolnośląskie, Wrocław, Stabłowice]",0,False,False,True,False,False,True,True,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,,"45,42 m²",2,2,(z 4),,2019.0,,plastikowe,,,,,,apartamentowiec,miejskie,,"{'ID': 112062736, 'visit_ads': '186', 'created...",Opis Rodzinna inwestycja w najbardziej zielone...,True,,45.42,45.42,3.815953,,dolnośląskie,Wrocław,4.0,2019,2,Wrocław[a],292.82,638586.0,2181.0,2904207,1396318,1507889,True,396989.6875,361227.0,396989.6875,361227.0,4,3,0,-1,0,-1,4,1,-1,112062736,ponad 14 dni temu,03.11.2018,186.0,18,5.278115,dolnośląskie,Wrocław,Stabłowice,,,dolnośląskieWrocław,dolnośląskieWrocławStabłowice,WrocławStabłowice,0,3,17,1,0,3,17,17,False,22.71,2018.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True_2,5,True_Wrocław,8,True_apartamentowiec,7,6727.439805,6046.991315,7203.191746,6103.313169


In [17]:
feats = ['id']
model = DummyRegressor()
model.fit(df_train[feats].values, df_train['price'])
y_pred = model.predict(df_test[feats].values)

df_test['price'] = y_pred
df_test[ ['id', 'price'] ].to_csv('../output/DummyRegressor.csv', index=False)

NameError: name 'df_train' is not defined

In [265]:
#msno.bar(df_train.iloc[:,40:50])

# ⭐ DAY 4 MODEL = Score: 51517.18706 Public score: 54286.18086

## 🤖 MLflow

In [None]:
!mlflow ui --backend-store-uri file:///home/jovyan/dwsolutions/property_warsaw/notebooks/mlruns

 ### [Dashboard](/hub/user-redirect/proxy/5000/) 

In [7]:
start_experiment(df, 'catboost_hyper', scoring=mean_absolute_error,
                 filename='catboost',
                 results=True,
                 export=True,
                 mlflow_save=True)

Working on "catboost_6" experiment


0it [00:31, ?it/s]


catboost_6.csv save successfully in "output" folder
Experiment catboost_6 recorded
(50682.823719676235, 0.0)


Weight,Feature
0.1612,area_num
0.0865,median_city_price
0.0802,mean_city_price
0.0588,area_num_log
0.0509,mean_is_primary_market_rodzaj zabudowy_price_m2
0.0364,build_year
0.0360,median_county_price
0.0349,mean_county_price
0.0301,area_norm
0.0293,visit_ads_num


In [61]:
start_experiment(df,
                 'catboost_hyper',
                 scoring=mean_absolute_error,
                 filename='catboost',
                 results=False,
                 export=True,
                 mlflow_save=False)

Working on "catboost_5" experiment
catboost_5.csv save successfully in "output" folder


Weight,Feature
0.1058,area_num_log
0.1006,area_num
0.0843,median_city_price
0.0540,mean_county_price
0.0532,mean_city_price
0.0511,median_is_primary_market_rodzaj zabudowy_price_m2
0.0429,area_norm
0.0359,build_year
0.0306,median_county_price
0.0275,visit_ads_num


In [52]:
start_experiment(df,
                 'catboost_hyper',
                 scoring=mean_absolute_error,
                 filename='catboost',
                 results=True,
                 export=False,
                 mlflow_save=True)

Working on "catboost_3" experiment


0it [00:00, ?it/s]


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [3]:
import future.utils

In [4]:
utils.__version__

NameError: name 'utils' is not defined

In [5]:
import future

In [6]:
future.__version__

'0.18.2'

In [7]:
future.utils.__version__

AttributeError: module 'future.utils' has no attribute '__version__'

In [8]:
future

<module 'future' from '/opt/conda/lib/python3.8/site-packages/future/__init__.py'>

In [10]:
import future
from future import utils

print(utils.PY2)
print(utils.PY34_PLUS)
print(future.__version__)

False
True
0.18.2


In [25]:
start_experiment(df, 'catboost_light', scoring=mean_absolute_error,
                 filename='catboost',
                 results=False,
                 export=False,
                 mlflow_save=True)

AssertionError: "mlflow_save" argument can be set to True only if "results" argument is also set to True

In [27]:
help(start_experiment)

Help on function start_experiment in module __main__:

start_experiment(df, model_or_id, scoring=<function mean_absolute_error at 0x7f5b01144550>, filename='catboost', results=True, export=False, mlflow_save=True, blacklist_extended=None)
    Main function to conducts experiment.
    
    Main feature are:
    record experiment data using MLflow - "mlflow_save" argument,
    showing results based on selected metric - "results" argument,
    exporting forecast to csv file - "export" argument.
    All above argument are boolean and can be set independently according to expected output.
    Mlflow_save argument can be set to True only if results argument is also set to True.

