In [29]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.dummy import DummyRegressor

import eli5
import catboost

import mlflow
import missingno as msno
import gc
from os.path import exists
from tqdm import tqdm

pd.set_option('display.max_columns', None)
mlflow.set_tracking_uri("file:///mlruns")

### Defining function used for experiments

In [30]:
def get_or_create_experiment(experiment_name):
    """Checks if mlflow experiment already exist (if not creates it) and return it."""
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if not experiment:
        mlflow.create_experiment(experiment_name)
        return mlflow.get_experiment_by_name(experiment_name)        
    return experiment


def _eid(experiment_name):
    return get_or_create_experiment(experiment_name).experiment_id


def get_filename(basename):
    """Creates unique filename to prevent overwriting files in target directory when exporting the forecast."""
    counter=0
    filename = '{}_{}'.format(basename, counter)  # filename in format <basename>_<number>.csv
    # Adds +1 to <number> until it gets unique filename
    while exists('output/{}.csv'.format(filename)):
        counter +=1
        filename = '{}_{}'.format(basename, counter)        
    return filename


def get_X_y_log(df, feats):
    """Creates feature matrix, target and logarihtmic vector for model training."""
    X = df[feats]
    y = df['price']
    y_log = np.log(y)    
    return X, y, y_log
    

def get_feats(df, blacklist_extended):
    """Creates training features by taking all number and boolean columns 
    from provided dataframe, reduced by those in blacklist.
    """    
    blacklist = [
        'price',
        'id', 
        'price_m2',
    ]
    # Extra blacklist feature can be added as list in function argument. 
    if blacklist_extended:
        blacklist = blacklist + blacklist_extended       
                         
    num_bool_feats = df.select_dtypes(['number', 'bool'])
    return [x for x in num_bool_feats if x not in blacklist]


def get_model(model_or_id):
    """Returns one of two default models, or alternative one, provided as function argument."""
    if model_or_id == 'catboost_hyper':  # hyper parameters tuned for better forecast, but longer model training.
        model_params = dict(             
            max_depth=8,
            n_estimators=1000,
            learning_rate=0.3,
            random_state=0,
            silent=True,
        )
        return catboost.CatBoostRegressor(**model_params)
    
    elif model_or_id == 'catboost_light':  # hyper parameters tuned for quick results.
        model_params = dict(            
            max_depth=5,
            n_estimators=100,
            random_state=0,
            silent=True,
        )
        return catboost.CatBoostRegressor(**model_params)
    else:
        return model_or_id


def mlflow_experiment(run_name, model, X, feats, result, eli5_result):
    """Uses MLflow tool to record and track important information about experiments.
    Such as model parameters, shape of feature and targe matrix, feats used for model training or eli5 importance of the features
    """
    with mlflow.start_run(experiment_id=_eid('property_forecast'), run_name=run_name) as run:
    
        mlflow.log_params(model.get_params())
        mlflow.log_param("model", str(model).split("(")[0])
        mlflow.log_param("feats", feats)
        mlflow.log_param('X.shape', X.shape)
        
        # artifacts
        with open('outputs/eli5.html', 'w') as f:
            f.write('<html>{}</html>'.format(eli5_result.data))
        mlflow.log_artifact('outputs/eli5.html', 'plot')
                
        # metrics
        mlflow.log_metric('mae_mean', result[0])
        mlflow.log_metric('mae_std', result[1])
              
    print(f'Experiment {run_name} recorded')

    
def check_model(X, y, model, scoring):
    """Evaluate model based on selected metric."""
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = []    
    for train_idx, test_idx in tqdm(cv.split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_log_train, y_test = np.log(y).iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_log_train)
        y_log_pred = model.predict(X_test)
        y_pred = np.exp(y_log_pred)

        score = scoring(y_test, y_pred)
        scores.append(score)
        
    return (np.mean(scores), np.std(scores))
    
    
def save_forecast_to_csv(model, X_test, df_test, filename):
    """Exports forecast to csv file"""
    y_pred = model.predict(X_test)
    df_test['price'] = np.exp(y_pred)
    
    try:
        df_test[ ['id', 'price'] ].to_csv('output/{}.csv'.format(filename), index=False)
        print('{}.csv save successfully in "output" folder.'.format(filename))
    except:
        print('{}.csv saving ERROR in "output" folder.'.format(filename))
    
    
def start_experiment(df, model_or_id, scoring=mean_absolute_error,
                     filename='catboost',
                     results=True, export=False, mlflow_save=True, blacklist_extended=None):
    """Main function to conducts experiment.
    
    Main feature are:
    record experiment data using MLflow - "mlflow_save" argument,
    showing results based on selected metric - "results" argument,
    exporting forecast to csv file - "export" argument.
    All above argument are boolean and can be set independently according to expected output.
    Mlflow_save argument can be set to True only if results argument is also set to True.
    """
    if mlflow_save:
        assert results, '"mlflow_save" argument can be set to True only if "results" argument is also set to True.'
        
    filename = get_filename(filename)    
    print(f'Working on "{filename}" experiment.')
    
    # Splits dataset into train and test dataframes.     
    df_train = df[ df['price'].notnull() ].fillna(-1)  # takes all rows with "price" column NOT empty
    df_test = df[ df['price'].isnull() ].fillna(-1)  # # takes all rows with "price" column empty
    
    # Gets training features
    feats = get_feats(df, blacklist_extended )
    
    # Gets X and y (feature matrix and target vector).
    X, y, y_log = get_X_y_log(df_train, feats)

    # Gets model
    model = get_model(model_or_id)
    
    # Returning results
    if results:
        result = check_model(X, y, model, scoring)
        
    model.fit(X.values, y_log.values)
    
    # Export forecast to csv.
    if export:
        X_test = df_test[feats].values
        save_forecast_to_csv(model, X_test, df_test, filename)    
  
    eli5_result = eli5.show_weights(model, feature_names=feats, top=100)
    
    #MLflow tracking
    if mlflow_save and results:
        mlflow_experiment(filename, model, X, feats, result, eli5_result)

    if results:
        print(f'MAE SCORE: mean {result[0]}, std {result[1]}')
    return eli5_result  

## Load data

In [31]:
df_train = pd.read_hdf('data/train_property.h5')
df_test = pd.read_hdf('data/test_property.h5')
df_org = pd.concat([df_train, df_test])

# External city and province data (Wikipedia)
city_stats = pd.read_csv('external_data/city_stats_wiki.csv')
city_stats.drop('Województwo', axis=1, inplace=True)
city_stats.columns = ['city', 'county', 'city_area', 'city_population', 'city_density']

province_stats = pd.read_csv('external_data/province_stats_wiki.csv')
province_stats.drop('Lp.', axis=1, inplace=True)
province_stats.columns = ['province', 'province_population', 'province_men_population', 'province_women_population']

print(df_train.shape, df_test.shape, df_org.shape)

del df_train, df_test
gc.collect;

(46489, 53) (46275, 52) (92764, 53)


In [4]:
df_org.sample(5)

Unnamed: 0,id,location,is_private,piekarnik,garaż,monitoring / ochrona,rolety antywłamaniowe,kuchenka,taras,balkon,ogródek,dwupoziomowe,system alarmowy,pom. użytkowe,klimatyzacja,tarasy,teren zamknięty,internet,winda,telefon,pralka,piwnica,ochrona,telewizja kablowa,telewizor,lodówka,domofon / wideofon,oddzielna kuchnia,zmywarka,garaż/miejsce parkingowe,meble,drzwi / okna antywłamaniowe,plan zagospodarowania:,price,area,rooms,floor,floors_in_building,dostępne od,rok budowy,materiał budynku,okna,data rozpoczęcia,stan inwestycji,liczba kondygnacji,stan wykończenia,czynsz,rodzaj zabudowy,ogrzewanie,forma własności,stats,text,is_primary_market
86154,86154,"[wielkopolskie, poznański, Swarzędz]",0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,"70,50 m²",4,4,(z 4),,,wielka płyta,plastikowe,,,,,,blok,,spółdzielcze wł. z kw,"{'ID': 108913320, 'visit_ads': '973', 'created...",Opis Polecam na sprzedaż mieszkanie czteropoko...,False
16904,16904,"[dolnośląskie, Wrocław, Fabryczna]",0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,False,False,False,True,False,False,True,False,False,False,False,False,False,268236.0,"37,47 m²",3,3,(z 4),,,,,,,,,,blok,,,"{'ID': 112363296, 'created_at': 'ponad 14 dni ...","Opis Kupujący nie ponosi żadnych kosztów , tak...",True
76365,76365,"[pomorskie, Gdańsk, Siedlce, ul. Jacka Malczew...",0,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,True,False,False,False,False,True,True,False,False,True,True,False,False,,56 m²,3,1,(z 3),13 marca 2018,2015.0,pustak,plastikowe,,,,do zamieszkania,550 zł,,miejskie,pełna własność,"{'ID': 105989310, 'visit_ads': '879', 'created...",Opis LOKALIZACJA: Gdańsk Siedlce. Kilka minut ...,False
63840,63840,"[świętokrzyskie, Kielce, Nowy Folwark, Domaszo...",0,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,,"63,10 m²",3,parter,(z 1),,,silikat,,,,,do wykończenia,,dom wolnostojący,gazowe,pełna własność,"{'ID': 106885440, 'visit_ads': '3201', 'create...",Opis 24 mieszkania w nowej inwestycji przy uli...,True
13944,13944,"[kujawsko-pomorskie, Grudziądz, Rządz, ul. Syb...",-1,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,True,False,False,False,True,False,False,False,False,False,True,False,False,False,319796.59375,"73,18 m²",4,3,(z 4),,2019.0,,plastikowe,,,,do wykończenia,500 zł,blok,miejskie,pełna własność,"{'ID': 109430352, 'created_at': 'ponad 14 dni ...",Opis NOWY RZĄDZ- NOWA INWESTYCJA!!! Nowe miesz...,True


## Feature engineering

In [32]:
def feature_engineering(df):
    """Main feature engineering function. Returned dataframe is ready to model training."""
    
    def parse_czynsz(val):
        """Parses "czynsz" feature object/string to numeric value."""
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        if val[-1] == 'ł':
            return float(val.split('zł')[0].replace(' ', '').replace(',','.'))
        if val[-1] == 'r':
            return float(val.split('eur')[0].replace(' ', '').replace(',','.'))*4.5
    
    
    def parse_area(val):
        """Parses "area" feature from object/string to numeric value."""
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        return float(val.split('m')[0].replace(',','.').replace(' ',''))
    

    def parse_floors_in_building(val):
        """Parses "floors_in_building" feature from object/string to numeric value."""
        if isinstance(val, int): return -1
        if isinstance(val, float): return -1    
        floor = float(val.replace(')','').split()[1])
        return floor if floor < 20 else 25
    
    
    def parse_location_city(val):
        """Using external data from wikipedia checks if value parsed from location feature is city."""
        all_city = city_stats['city'].to_list()
        for city_ in reversed(val):
        # "Józefów" apears more then one time on on all cities list and "Dobra" appears also as a street name.
        # I decided to exclude them, but it can be improved.
            if city_ in ['Dobra', 'Józefów']:
                continue
            if city_ in all_city:
                    return city_
        return 'other' 
        

    def build_year_norm(year):
        """Normalize 'build_year' feature"""
        years = [1970, 1980, 1990, 2000, 2005, 2010, 2012, 2014, 2016, 2017]
        if year < 1970: return 1900
        if year > 2017: return 2018
    
        for idx in range(len(years) - 1):
            if years[idx+1] > year >= years[idx]:
                return years[idx]
    
    
    def df_groupby_feat(df, groupby_feats, feat):
        """Retuns aggregated feature by selected one or more features"""
        agg_params={
            'mean_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'mean'),
            'median_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'median'),   
        }
        return df[groupby_feats + [feat]].groupby(groupby_feats).agg(
            **agg_params
        ).reset_index()
    
    
    def concat_and_factorize(df, feat1, feat2):        
        """Concatenate two features and categorize the results"""
        # Concatenate part.
        df['{}_{}'.format(feat1, feat2)] = df[ [feat1, feat2] ].apply(
            lambda x: '{}_{}'.format(x[feat1], x[feat2]), axis=1
        )
        # Categorize part.
        df['{}_{}_cat'.format(feat1, feat2)] = df['{}_{}'.format(feat1, feat2)].factorize()[0]

        return df

    
    # Area
    df['area_num'] = df.area.map(parse_area)
    area_num_99 = np.percentile(df['area_num'], 99)
    df['area_norm'] = df['area_num'].map(lambda x: x if x <= area_num_99 else area_num_99)
    df['area_num_log'] = np.log(df['area_num'])
    df['price_m2'] = df['price'] / df['area_num'] 
    
    # Rooms
    df['area_per_room'] = df['area_norm'] / df["rooms"]    

    # Location 
    province_cities = ['Białystok', 'Bydgoszcz', 'Gdańsk', 'Gorzów Wielkopolski', 'Katowice', 'Kielce', 'Kraków', 'Lublin',
    'Łódź', 'Olsztyn', 'Opole', 'Poznań', 'Rzeszów', 'Szczecin', 'Toruń', 'Warszawa', 'Wrocław', 'Zielona Góra']
    
    df['province'] = df['location'].map(lambda x: x[0])
    df['city'] = df['location'].map(parse_location_city)
    df['province_city'] = df['city'].isin(province_cities)
    
    # Merging main dataframe with external data about cities.
    if 'city_area' not in df.columns:
        df = pd.merge(df, city_stats, on='city', how='left')
    # Merging main dataframe with external data about provinces.    
    if 'province_population' not in df.columns:
        df = pd.merge(df, province_stats, on='province', how='left')
        
    """'Location' feature is list cointaining elements describing property location in order
    from general to specyfic, which could be [<province>, <county>, <city>, <district> and <street>].
    """    
    for i in range(5):
        # We can assume that "loc1" is likely province, "loc2" is likely county and so on.
        df["loc{}".format(i)] = df["location"].map(lambda x: x[i] if len(x) > i else "")      
    
    df['loc01'] = df['loc0'] + df['loc1']
    df['loc012'] = df['loc0'] + df['loc1'] + df['loc2']
    df['loc12'] = df['loc1'] + df['loc2']
    
    # Categorize location features
    for i in range(5):
        df["loc{}_cat".format(i)] = df["loc{}".format(i)].factorize()[0]
    df["loc01_cat"] = df['loc01'].factorize()[0]
    df["loc012_cat"] = df['loc012'].factorize()[0] 
    df["loc12_cat"] = df['loc12'].factorize()[0]
    
    df['city_cat'] = df['city'].factorize()[0]
    df['county_cat'] = df['county'].factorize()[0]
    df['province_cat'] = df['province'].factorize()[0]
     
    big_cities = {'Poznań', 'Sopot', 'Wrocław', 'Kraków', 'Gdańsk', 'Gdynia', 'Opole', 'Katowice',  'Częstochowa', 'Szczecin', 'Kalisz', 'Łódź', 'Olsztyn', 'Warszawa'}
    for city in big_cities:
        df[city] = df['city'] == city
        df['big_city'] = df['city'].map(lambda x: x in big_cities)
            
    # loc1 is likely to be "city", and loc2 is likely to be "district", so with combining this two
    # we could get for example: WrocławKrzyki, WarszawaŚródmieście, SopotGórny and so on.
    df_val_cnts = df['loc12'].value_counts()
    
    # We takes combinations only if they occur more then 100 times in dataset.
    loc12_vals = set(df_val_cnts[ df_val_cnts > 100].index.values)
    for item in loc12_vals:
        df[item] = df['loc12'] == item    

    # Floor
    floors_dict = {'parter': 0, '> 10': 11, 'poddasze': -2, 'suterena': -1}
    df['floor_num'] = df['floor'].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype('int')
   
    # Czynsz (Rent)
    df['rent_num'] = df['czynsz'].map(parse_czynsz)
    
    # Floors_in_building
    df['floors_in_building_num'] = df['floors_in_building'].map(parse_floors_in_building)
  
    # Rok budowy (build_year)
    df['build_year'] = df['rok budowy'].fillna(-1).astype('int')
    df["build_year_norm"] = df["build_year"].map(build_year_norm)
    
    # OHE: materiał budynku, okna, stan wykończenia, rodzaj zabudowy, ogrzewanie, forma własności.
    cat_feats = {
        "materiał budynku": "build_material_cat",
        "okna": "window_cat",
        "stan wykończenia": "property_completion_cat",
        "rodzaj zabudowy": "property_type_cat",
        "ogrzewanie": "property_heating_cat",
        "forma własności": "own_property_cat"
     }    
    
    for feat_name, feat_new_name in cat_feats.items():
        df[feat_new_name] = df[feat_name].factorize()[0] 
        df_dummies = pd.get_dummies(df[feat_name])
        df_dummies.columns = ['{0}_{1}'.format(feat_new_name, x) for x in df_dummies.columns]
        df = pd.concat([df, df_dummies], axis=1)
        

    df['security'] = df['system alarmowy'] | df['rolety antywłamaniowe'] | df['drzwi / okna antywłamaniowe']


    # Price agreggations.
    groupby_city_price = df_groupby_feat(df, ['city'], 'price')        
    if 'median_city_price' not in df:
        df = pd.merge(df, groupby_city_price, on='city', how='left')
        
    groupby_county_price = df_groupby_feat(df, ['county'], 'price')        
    if 'median_county_price' not in df:
        df = pd.merge(df, groupby_county_price, on='county', how='left')
    
    
    # Primary_market
    df = concat_and_factorize(df, 'is_primary_market', 'rooms')
    df = concat_and_factorize(df, 'is_primary_market', 'city')
    df = concat_and_factorize(df, 'is_primary_market', 'rodzaj zabudowy')
    
    
    # Price for m2 aggregation(is_primaru_market).
    groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_rooms'], 'price_m2')
    if 'median_is_primary_market_rooms_price_m2' not in df:
        df = pd.merge(df, groupby_price_m2, on='is_primary_market_rooms', how='left')
 
    groupby_price_m2 = df_groupby_feat(df, ['is_primary_market_rodzaj zabudowy'], 'price_m2')
    if 'median_is_primary_market_rodzaj zabudowy_price_m2' not in df:
        df = pd.merge(df, groupby_price_m2, on='is_primary_market_rodzaj zabudowy', how='left')

        
    # Stats
    stats = df['stats'].apply(pd.Series)
    df = pd.concat([df, stats], axis=1)
    dict_created_at={
        'ponad 14 dni temu':18,  
        '23.10.2018':17,
        '24.10.2018':16,
        '25.10.2018':15,
        '26.10.2018':14, 
        '27.10.2018':13,
        '28.10.2018':12,
        '29.10.2018':11,
        '30.10.2018':10,
        '31.10.2018':9,
        '01.11.2018':8,    
        '02.11.2018':7,
        '03.11.2018':6,
        '04.11.2018':5,
        '05.11.2018':4,
        '06.11.2018':3,
        '07.11.2018':2,
        '08.11.2018':1,
    }
        
    df['created_at_cat'] = df['created_at'].map(dict_created_at)
    df['visit_ads_num'] = df.visit_ads.map(lambda x: np.log(int(x) + 10) if not isinstance(x, float)  else -1)
                                                                   
    return df

In [33]:
df_fe = feature_engineering(df_org.copy())
print(df_fe.shape)
df_fe.sample(3)

  df[item] = df['loc12'] == item
  df['floor_num'] = df['floor'].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype('int')
  df['rent_num'] = df['czynsz'].map(parse_czynsz)
  df['floors_in_building_num'] = df['floors_in_building'].map(parse_floors_in_building)
  df['build_year'] = df['rok budowy'].fillna(-1).astype('int')
  df["build_year_norm"] = df["build_year"].map(build_year_norm)
  df[feat_new_name] = df[feat_name].factorize()[0]
  df['security'] = df['system alarmowy'] | df['rolety antywłamaniowe'] | df['drzwi / okna antywłamaniowe']


(92764, 376)


Unnamed: 0,id,location,is_private,piekarnik,garaż,monitoring / ochrona,rolety antywłamaniowe,kuchenka,taras,balkon,ogródek,dwupoziomowe,system alarmowy,pom. użytkowe,klimatyzacja,tarasy,teren zamknięty,internet,winda,telefon,pralka,piwnica,ochrona,telewizja kablowa,telewizor,lodówka,domofon / wideofon,oddzielna kuchnia,zmywarka,garaż/miejsce parkingowe,meble,drzwi / okna antywłamaniowe,plan zagospodarowania:,price,area,rooms,floor,floors_in_building,dostępne od,rok budowy,materiał budynku,okna,data rozpoczęcia,stan inwestycji,liczba kondygnacji,stan wykończenia,czynsz,rodzaj zabudowy,ogrzewanie,forma własności,stats,text,is_primary_market,area_num,area_norm,area_num_log,price_m2,area_per_room,province,city,province_city,county,city_area,city_population,city_density,province_population,province_men_population,province_women_population,loc0,loc1,loc2,loc3,loc4,loc01,loc012,loc12,loc0_cat,loc1_cat,loc2_cat,loc3_cat,loc4_cat,loc01_cat,loc012_cat,loc12_cat,city_cat,county_cat,province_cat,Wrocław,big_city,Kraków,Gdańsk,Gdynia,Warszawa,Katowice,Sopot,Szczecin,Poznań,Olsztyn,Kalisz,Opole,Częstochowa,Łódź,świdnickiŚwidnica,KrakówWola Justowska,KrakówGrzegórzki,głogowskiGłogów,SzczecinWarszewo,ToruńNa Skarpie,RzeszówDrabinianka,Zielona Góra,GdyniaObłuże,WrocławTarnogaj,Gliwice,Rzeszów,BydgoszczBielawy,PoznańBułgarska 59,WrocławPlac Grunwaldzki,wołomińskiMarki,gdańskiPruszcz Gdański,inowrocławskiInowrocław,wejherowskiWejherowo,piaseczyńskiJózefosław,KatowiceBrynów,WrocławStare Miasto,PoznańPodolany,WarszawaGocław,WrocławKlecina,WarszawaTargówek,Grudziądz,WrocławGaj,ostrowskiOstrów Wielkopolski,ŁódźŚródmieście,GdyniaŚródmieście,KrakówWzgórza Krzesławickie,PoznańStare Miasto,krakowskiSkawina,WrocławKrzyki,BiałystokWysoki Stoczek,ToruńMokre,PoznańGrunwald,BiałystokCentrum,GdańskJelitkowo,KrakówMistrzejowice,LublinFelin,GdańskJasień,PoznańWilda,KatowiceKostuchna,WarszawaWilanów,ŁódźGórna,KatowiceOsiedle Tysiąclecia,ŁódźBałuty,Dąbrowa GórniczaGołonóg,KrakówStare Podgórze,będzińskiSiewierz,tczewskiTczew,BydgoszczBartodzieje,SzczecinCentrum,WarszawaPraga-Północ,LublinWęglinek,WarszawaOchota,wejherowskiRumia,WarszawaWola,KrakówŁobzów,ŁódźPolesie,tatrzańskiZakopane,BydgoszczFordon,GdańskMorena,WrocławJagodno,BydgoszczBłonie,GdańskŚródmieście,KatowiceWełnowiec,WrocławŚródmieście,Gorzów WielkopolskiGórczyn,będzińskiBędzin,GdańskStare Miasto,GdańskChełm,WarszawaWłochy,GdańskPrzymorze,SopotDolny,LublinCzuby,WarszawaMokotów,GdyniaWitomino,BydgoszczWyżyny,Kielce,Wrocławul. Brzoskwiniowa,WarszawaBemowo,WarszawaUrsus,PoznańŁazarz,LublinWrotków,poznańskiSwarzędz,GdańskSiedlce,BiałystokBojary,wrocławskiSiechnice,dzierżoniowskiDzierżoniów,SosnowiecCentrum,LublinCentrum,BydgoszczSzwederowo,GliwiceStare Gliwice,BiałystokAntoniuk,KrakówWola Duchacka,SzczecinŚródmieście,GdańskZaspa,KrakówPodgórze,GdyniaRedłowo,PoznańJeżyce,SopotGórny,WrocławStabłowice,lubińskiLubin,KrakówNowa Huta,BydgoszczGlinki,WarszawaBiałołęka,BiałystokNowe Miasto,KatowiceŚródmieście,ToruńKoniuchy,GdańskKowale,wielickiWieliczka,WarszawaŚródmieście,KatowiceDolina Trzech Stawów,ełckiEłk,LublinDziesiąta,poznańskiZalasewo,GdańskOliwa,KatowicePiotrowice,KatowiceOsiedle Paderewskiego,GdańskŁostowice,LublinLSM,BydgoszczGórzyskowo,GdyniaOksywie,PoznańWilczak,WarszawaPowiśle,pruszkowskiPruszków,KrakówCzyżyny,wołomińskiZąbki,BydgoszczCentrum,CzęstochowaParkitka,ŁódźWidzew,WrocławFabryczna,KielceCentrum,KrakówPrądnik Biały,PoznańNowe Miasto,WrocławSwojczyce,BytomCentrum,SosnowiecPogoń,ToruńRubinkowo,WarszawaPraga-Południe,PoznańCentrum,kołobrzeskiKołobrzeg,ToruńBydgoskie Przedmieście,lęborskiLębork,WarszawaBielany,ZabrzeCentrum,ToruńJakubskie Przedmieście,poznańskiLuboń,PoznańWinogrady,KrakówKazimierz,KrakówDębniki,TychyŻwaków,piaseczyńskiPiaseczno,KrakówBieżanów-Prokocim,GliwiceCentrum,GdyniaMały Kack,SzczecinNiebuszewo,tarnogórskiTarnowskie Góry,KrakówPrądnik Czerwony,WarszawaSaska Kępa,PoznańPiątkowo,OpoleŚródmieście,PoznańNaramowice,KielceŚlichowice,puckiJastarnia,SzczecinPogodno,KrakówBronowice Małe,GdańskUjeścisko,KrakówSalwator,LublinŚródmieście,KatowiceJózefowiec,KrakówKrowodrza,wejherowskiReda,KrakówOlsza,WrocławHuby,KrakówKliny-Zacisze,SosnowiecZagórze,stargardzkiStargard,KrakówKurdwanów,BydgoszczOsiedle Leśne,ToruńChełmińskie Przedmieście,wrocławskiKiełczów,SzczecinGumieńce,WrocławZłotniki,RzeszówSłocina,KrakówStare Miasto,KrakówRuczaj,KrakówBronowice,WarszawaUrsynów,GdyniaOrłowo,KrakówŚródmieście,WarszawaWawer,BydgoszczŚródmieście,namysłowskiNamysłów,CzęstochowaTysiąclecie,BydgoszczKapuściska,WarszawaŻoliborz,WrocławMuchobór Wielki,Zielona GóraCentrum,LublinSzerokie,PoznańRataje,WrocławMaślice,kamieńskiMiędzyzdroje,PoznańGórczyn,WrocławPsie Pole,GdańskWrzeszcz,floor_num,rent_num,floors_in_building_num,build_year,build_year_norm,build_material_cat,build_material_cat_beton,build_material_cat_beton komórkowy,build_material_cat_cegła,build_material_cat_drewno,build_material_cat_inne,build_material_cat_keramzyt,build_material_cat_pustak,build_material_cat_silikat,build_material_cat_wielka płyta,build_material_cat_żelbet,window_cat,window_cat_aluminiowe,window_cat_drewniane,window_cat_plastikowe,property_completion_cat,property_completion_cat_do remontu,property_completion_cat_do wykończenia,property_completion_cat_do zamieszkania,property_type_cat,property_type_cat_apartamentowiec,property_type_cat_blok,property_type_cat_dom wolnostojący,property_type_cat_kamienica,property_type_cat_loft,property_type_cat_plomba,property_type_cat_szeregowiec,property_heating_cat,property_heating_cat_elektryczne,property_heating_cat_gazowe,property_heating_cat_inne,property_heating_cat_kotłownia,property_heating_cat_miejskie,property_heating_cat_piece kaflowe,own_property_cat,own_property_cat_pełna własność,own_property_cat_spółdzielcze wł. z kw,own_property_cat_spółdzielcze własnościowe,own_property_cat_udział,security,mean_city_price,median_city_price,mean_county_price,median_county_price,is_primary_market_rooms,is_primary_market_rooms_cat,is_primary_market_city,is_primary_market_city_cat,is_primary_market_rodzaj zabudowy,is_primary_market_rodzaj zabudowy_cat,mean_is_primary_market_rooms_price_m2,median_is_primary_market_rooms_price_m2,mean_is_primary_market_rodzaj zabudowy_price_m2,median_is_primary_market_rodzaj zabudowy_price_m2,ID,created_at,updated_at,visit_ads,created_at_cat,visit_ads_num
13188,26292,"[małopolskie, Kraków, Stare Miasto, ul. Starow...",0,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,340000.0,"28,40 m²",1,poddasze,(z 2),31 października 2018,2018.0,cegła,drewniane,,,,do wykończenia,,kamienica,gazowe,pełna własność,"{'ID': 112378192, 'created_at': 'ponad 14 dni ...",Opis Na sprzedaż mieszkanie 1-pokojowe o pow. ...,False,28.4,28.4,3.346389,11971.830986,28.4,małopolskie,Kraków,True,Kraków[a],326.85,767348.0,2348.0,3372618,1636707,1735911,małopolskie,Kraków,Stare Miasto,ul. Starowiślna,,małopolskieKraków,małopolskieKrakówStare Miasto,KrakówStare Miasto,5,95,26,2498,0,97,933,933,162,100,5,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-2,,2.0,2018,2018.0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,False,529502.5,416305.0,529502.5,416305.0,False_1,7,False_Kraków,222,False_kamienica,5,7201.175855,6666.666667,6034.568709,4906.693992,112378192,ponad 14 dni temu,24.10.2018,,18,-1.0
11021,21979,"[małopolskie, Kraków, Wincentego Weryhy-Darows...",0,False,False,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,420000.0,54 m²,2,2,(z 3),,2010.0,,plastikowe,,,,,210 zł,blok,,pełna własność,"{'ID': 112943816, 'visit_ads': '104', 'created...",Opis | 2 pok 54 m2 Weryhy Darowskiego/Bronowic...,False,54.0,54.0,3.988984,7777.777778,27.0,małopolskie,Kraków,True,Kraków[a],326.85,767348.0,2348.0,3372618,1636707,1735911,małopolskie,Kraków,Wincentego Weryhy-Darowskiego,,,małopolskieKraków,małopolskieKrakówWincentego Weryhy-Darowskiego,KrakówWincentego Weryhy-Darowskiego,5,95,1003,1,0,97,1083,1083,162,100,5,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2,210.0,3.0,2010,2010.0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-1,0,0,0,1,0,1,0,0,0,0,0,-1,0,0,0,0,0,0,0,1,0,0,0,False,529502.5,416305.0,529502.5,416305.0,False_2,3,False_Kraków,222,False_blok,3,6437.640518,5813.953488,5587.835882,5201.773009,112943816,02.11.2018,04.11.2018,104.0,7,4.736198
31705,63211,"[śląskie, Katowice, Wełnowiec, Bytkowska]",0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,347097.0,"62,54 m²",2,2,(z 4),,,,plastikowe,,,,,,apartamentowiec,,pełna własność,"{'ID': 112691128, 'visit_ads': '67', 'created_...",Opis Do sprzedaży nowe mieszkania w zielonej ...,False,62.54,62.54,4.135806,5550.0,31.27,śląskie,Katowice,True,Katowice[a],164.64,296262.0,1799.0,4570849,2204972,2365877,śląskie,Katowice,Wełnowiec,Bytkowska,,śląskieKatowice,śląskieKatowiceWełnowiec,KatowiceWełnowiec,11,205,2341,5880,0,211,2611,2611,358,210,11,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2,,4.0,-1,1900.0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-1,0,0,0,4,1,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,1,0,0,0,False,327488.5625,289184.0,327488.5625,289184.0,False_2,3,False_Katowice,523,False_apartamentowiec,9,6437.640518,5813.953488,9576.944357,8492.838129,112691128,ponad 14 dni temu,ponad 14 dni temu,67.0,18,4.343805


# Dummy Model - Score: 182576.93612

In [None]:
feats = ['id']
model = DummyRegressor()
model.fit(df_train[feats].values, df_train['price'])
y_pred = model.predict(df_test[feats].values)

df_test['price'] = y_pred
df_test[ ['id', 'price'] ].to_csv('../output/DummyRegressor.csv', index=False)

In [None]:
#msno.bar(df_train.iloc[:,40:50])

## MLflow

In [17]:
!mlflow ui --backend-store-uri file:///mlruns

^C


## [MLflow Dashboard](http://localhost:5000)

## Testing Models

In [54]:
start_experiment(df_fe,
                 'catboost_light', scoring=mean_absolute_error,
                 filename='catboost_light',
                 results=True,
                 export=True,
                 mlflow_save=True)

Working on "catboost_light_14" experiment.


5it [00:41,  8.22s/it]


catboost_light_14.csv save successfully in "output" folder.
Experiment catboost_light_14 recorded
MAE SCORE: mean 62720.58822222085, std 1994.9434438704461


Weight,Feature
0.2461,mean_city_price
0.1606,area_norm
0.0941,area_num
0.0847,median_county_price
0.0617,mean_county_price
0.0605,area_num_log
0.0408,build_year
0.0395,median_is_primary_market_rodzaj zabudowy_price_m2
0.0204,is_primary_market
0.0135,property_completion_cat


In [35]:
start_experiment(df_fe, 'catboost_hyper', scoring=mean_absolute_error,
                 filename='catboost_hyper',
                 results=True,
                 export=True,
                 mlflow_save=True)

Working on "catboost_hyper_7" experiment.


5it [04:46, 57.34s/it]


catboost_hyper_7.csv save successfully in "output" folder.
Experiment catboost_hyper_7 recorded
MAE SCORE: mean 52937.314563540844, std 2038.8693217414093


Weight,Feature
0.1167,area_num
0.1094,area_norm
0.0806,median_county_price
0.0713,median_city_price
0.0568,mean_city_price
0.0443,build_year
0.0360,mean_county_price
0.0257,visit_ads_num
0.0254,area_num_log
0.0206,median_is_primary_market_rodzaj zabudowy_price_m2
