In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import json
from os.path import exists
import joblib
import catboost

## Defining function used for training model.

In [131]:
def get_model_filename(basename):
    """Creates unique filename to prevent overwriting
    file when exporting trained model.
    """
    counter=0
    name = '{}_{}'.format(basename, counter)  # filename in format <basename>_<number>
    # Adds +1 to <number> until it gets unique filename
    while exists('models/{}.model'.format(name)):
        counter +=1
        name = '{}_{}'.format(basename, counter)
        
    return name


def get_X_y_log(df, feats):
    """Creates feature matrix, target and logarihtmic vector for model training."""
    X = df[feats].values
    y = df['price'].values
    y_log = np.log(y)    
    return X, y_log
  

def get_feats(df):
    """Creates training features by taking all number and boolean columns 
    from provided dataframe, reduced by those in blacklist.
    """       
    blacklist = [
        'price',
        'id', 
        'price_m2',
        'floors_in_building'
        ]    
                         
    num_bool_feats = df.select_dtypes(['number', 'bool'])
    return [x for x in num_bool_feats if x not in blacklist]
    
    
def default_model():   
    """Returns CatBoostRegressor model with specific hyper parameters"""
    model_params = dict(
        max_depth=8,
        n_estimators=1000,
        learning_rate=0.3,
        random_state=0,
        silent=True,
        )
    return catboost.CatBoostRegressor(**model_params)
    
    
def train_model(df, info=True):
    """Trains and returns Catboost Regressor model"""
    
    # Removes from training data possible target variable leaks.
    df_train = df[ df['price'].notnull() ].fillna(-1)
        
    # Gets training features.
    feats = get_feats(df_train)
    
    # Gets X and y (feature matrix and target vector). 
    X, y_log = get_X_y_log(df_train, feats)
    
    # Model training.
    model = default_model()
    model.fit(X, y_log)
    
    if info:
        print(f' Number of feats: {len(feats)}')        
        print()
        print(f'FEATS: \n {feats}\n')
    
    return model

    
def export_model(model, basename):  
    """Exports trained model to "models" folder"""    
    filename = get_model_filename(basename)
    joblib.dump(model, "models/{}.model".format(filename))
    return 'Done'

## Load data

In [132]:
df_org = pd.read_hdf('data/train_property.h5')

# External city and province data (Wikipedia)
city_stats = pd.read_csv('external_data/city_stats_wiki.csv')
city_stats.drop('Województwo', axis=1, inplace=True)
city_stats.columns = ['city', 'county', 'city_area', 'city_population', 'city_density']

province_stats = pd.read_csv('external_data/province_stats_wiki.csv')
province_stats.drop('Lp.', axis=1, inplace=True)
province_stats.columns = ['province', 'province_population', 'province_men_population', 'province_women_population']

print(df_org.shape)

(46489, 53)


### Preprocess Data

In [1]:
def preprocess_data(df):
    """Preprocess features to get desirable, better intuitive format when inputing data
    for predictions"""    

    # Defins features that will be required for predictions.
    required_feats = [    
        "area",
        "rooms",
        "location",
        'floor',
        'floors_in_building',
        'rok budowy',
        'czynsz',        
        'is_primary_market',
        'is_private',
        'rodzaj zabudowy',
        'materiał budynku',
        'okna',
        'stan wykończenia',
        'ogrzewanie',
        'forma własności',
        'system alarmowy',
        'rolety antywłamaniowe',
        'drzwi / okna antywłamaniowe',
    ]

    required_training_feats = ['price'] + required_feats

    def parse_area(val):
        """Parses "area" feature from object/string to numeric value."""    
        if isinstance(val, int): return val
        if isinstance(val, float): return val
        return val.split('m')[0].replace(',','.').replace(' ','')

    def parse_czynsz(val):
        """Parses "czynsz" feature object/string to numeric value."""
        if isinstance(val, int): return val
        if isinstance(val, float): return val

        if val[-1] == 'ł':
            return float(val.split('zł')[0].replace(' ', '').replace(',','.'))
        if val[-1] == 'r':
            return float(val.split('eur')[0].replace(' ', '').replace(',','.'))*4.5

    def parse_floors_in_building(val):
        """Parses "floors_in_building" feature from object/string to numeric value."""
        if isinstance(val, int): return -1
        if isinstance(val, float): return -1    
        return int(val.replace(')','').split()[1])

    def parse_is_private(val):
        if val in [0, 1]:
            return bool(val)
        return -1
    

    df = df[ required_training_feats ].copy()

    df['area'] = df['area'].map(parse_area)
    df['czynsz'] = df['czynsz'].map(parse_czynsz)
    df['floors_in_building'] = df['floors_in_building'].map(parse_floors_in_building)
    df['is_private'] = df['is_private'].map(parse_is_private)
    
    
    return df

In [134]:
df_pre = preprocess_data(df_org)
df_pre.shape

(46489, 19)

In [135]:
def feature_engineering(df):
    """Main feature engineering function. Returned dataframe is ready to model training."""
    
    def parse_location_city(val):
        """Using external data from wikipedia checks if value parsed from location feature is city."""
        all_city = city_stats['city'].to_list()
        for city_ in reversed(val):
        # "Józefów" apears more then one time on on all cities list and "Dobra" appears also as a street name.
        # I decided to exclude them, but it can be improved.
            if city_ in ['Dobra', 'Józefów']:
                continue
            if city_ in all_city:
                    return city_
        return 'other' 
    
    
    def normalize_build_year(year):
        """Normalize 'build_year' feature"""
        years = [1970, 1980, 1990, 2000, 2005, 2010, 2012, 2014, 2016, 2017]
        if year < 1970: return 1900
        if year > 2017: return 2018
    
        for idx in range(len(years) - 1):
            if years[idx+1] > year >= years[idx]:
                return years[idx]
    

    def normalize_floors_in_building(val):
        """Cap max floors in building number to control outliers"""
        floor = float(val)
        return floor if floor < 20 else 25
    

    def categorize_and_export_as_json(df, featname):
        """Creates and export as json, dictionary with category labels for selected feature"""
        cat_dict = dict(zip(df['{}'.format(featname)].drop_duplicates().values, df['{}'.format(featname)].drop_duplicates().factorize()[0]))
        cat_dict = {key:int(value) for key, value in cat_dict.items()}        
        with open('model_predict_data/cat_dict_{}.txt'.format(featname), 'w') as file:
            file.write(json.dumps(cat_dict))
        print(f'{file.name} saved.')
            
        return df[featname].map(cat_dict)
    
    
    def df_groupby_feat_and_export_to_csv(df, groupby_feats, feat):
        """Retuns aggregated feature by selected one or more features and export it to csv"""
        agg_params={
            'mean_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'mean'),
            'median_{}_{}'.format('_'.join(groupby_feats), feat): (feat, 'median'),
         }
        groupby_df = df[groupby_feats + [feat]].groupby(groupby_feats).agg(
            **agg_params
        ).reset_index()
        filename = 'groupby_{}_{}.csv'.format('_'.join(groupby_feats), feat)

        groupby_df.to_csv('model_predict_data/{}'.format(filename), index=False)
        print(f'{filename} saved.')

        return groupby_df
    
    
    def is_primary_market_conc(df, feat):
        """Concatenate "is_primary_market" with other feature and categorize the results.
        Dictionary with category labels is exported for prediction part"""
        df['is_primary_market_{}'.format(feat)] = df[ ['is_primary_market', feat] ].apply(
            lambda x: '{}_{}'.format(x['is_primary_market'], x[feat]), axis=1
        )

        df['is_primary_market_{}_cat'.format(feat)] = categorize_and_export_as_json(df, 'is_primary_market_{}'.format(feat))

        return df

    # Area
    df['area_num'] = df.area.astype(float)
    area_num_99 = np.percentile(df['area_num'], 99)
    df['area_norm'] = df['area_num'].map(lambda x: x if x <= area_num_99 else area_num_99)
    df['area_num_log'] = np.log(df['area_num'])
    df['price_m2'] = df['price'] / df['area_num'] 
    
    # Rooms
    df['area_per_room'] = df['area_norm'] / df["rooms"]    

    # Location 
    province_cities = ['Białystok', 'Bydgoszcz', 'Gdańsk', 'Gorzów Wielkopolski', 'Katowice', 'Kielce', 'Kraków', 'Lublin',
    'Łódź', 'Olsztyn', 'Opole', 'Poznań', 'Rzeszów', 'Szczecin', 'Toruń', 'Warszawa', 'Wrocław', 'Zielona Góra']
    
    df['province'] = df['location'].map(lambda x: x[0])
    df['city'] = df['location'].map(parse_location_city)
    df['province_city'] = df['city'].isin(province_cities)
    
    # Merging main dataframe with external data about cities.
    if 'city_area' not in df.columns:
        df = pd.merge(df, city_stats, on='city', how='left')
    # Merging main dataframe with external data about provinces.    
    if 'province_population' not in df.columns:
        df = pd.merge(df, province_stats, on='province', how='left')
        
    """'Location' feature is list cointaining elements describing property location in order
    from general to specyfic, which could be [<province>, <county>, <city>, <district> and <street>].
    """    
    for i in range(5):        
        # We can assume that "loc1" is likely province, "loc2" is likely county and so on.
        df["loc{}".format(i)] = df["location"].map(lambda x: x[i] if len(x) > i else "")      
    
    df['loc01'] = df['loc0'] + df['loc1']
    df['loc012'] = df['loc0'] + df['loc1'] + df['loc2']
    df['loc12'] = df['loc1'] + df['loc2']
    
    # Categorize location features
    for i in range(5):
        df["loc{}_cat".format(i)] = categorize_and_export_as_json(df, 'loc{}'.format(i))
    df["loc01_cat"] = categorize_and_export_as_json(df, 'loc01')
    df["loc012_cat"] = categorize_and_export_as_json(df, 'loc012')
    df["loc12_cat"] = categorize_and_export_as_json(df, 'loc12')
    
    df['city_cat'] = categorize_and_export_as_json(df, 'city')
    df['county_cat'] = categorize_and_export_as_json(df, 'county')
    df['province_cat'] = categorize_and_export_as_json(df, 'province')
  
    big_cities = {'Poznań', 'Sopot', 'Wrocław', 'Kraków', 'Gdańsk', 'Gdynia', 'Opole', 'Katowice',  'Częstochowa', 'Szczecin', 'Kalisz', 'Łódź', 'Olsztyn', 'Warszawa'}
    for city in big_cities:
        df[city] = df['city'] == city
        df['big_city'] = df['city'].map(lambda x: x in big_cities)
        
    
    # loc1 is likely to be "city", and loc2 is likely to be "district", so with combining this two
    # we could get for example: WrocławKrzyki, WarszawaŚródmieście, SopotGórny and so on.
    df_val_cnts = df['loc12'].value_counts()
    
    # We takes combinations only if they occur more then 100 times in dataset.
    loc12_vals = set(df_val_cnts[ df_val_cnts > 100].index.values)
    for item in loc12_vals:
        df[item] = df['loc12'] == item 
    
    # Floor
    floors_dict = {'parter': 0, '> 10': 11, 'poddasze': -2, 'suterena': -1}
    df['floor_num'] = df['floor'].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype('int')
    
    # Floors_in_building
    df['floors_in_building_num'] = df['floors_in_building'].map(normalize_floors_in_building)
   

    # "price" aggregations    
    groupby_city_price = df_groupby_feat_and_export_to_csv(df, ['city'], 'price')       
    if 'median_city_price' not in df:
        df = pd.merge(df, groupby_city_price, on='city', how='left')
        
    groupby_county_price = df_groupby_feat_and_export_to_csv(df, ['county'], 'price')   
    if 'median_county_price' not in df:
        df = pd.merge(df, groupby_county_price, on='county', how='left')
  
   
    # is_primary_market
    df = is_primary_market_conc(df, 'rooms')
    df = is_primary_market_conc(df, 'city')
    df = is_primary_market_conc(df, 'rodzaj zabudowy') 
    
    # "price_m2" aggregations for concateneted is_primary_market with other features.   
    groupby_price_m2 = df_groupby_feat_and_export_to_csv(df, ['is_primary_market_rooms'], 'price_m2')
    if 'median_is_primary_market_rooms_price_m2' not in df:
        df = pd.merge(df, groupby_price_m2, on='is_primary_market_rooms', how='left')
        
    groupby_price_m2 = df_groupby_feat_and_export_to_csv(df, ['is_primary_market_rodzaj zabudowy'], 'price_m2')
    if 'median_is_primary_market_rodzaj zabudowy_price_m2' not in df:
        df = pd.merge(df, groupby_price_m2, on='is_primary_market_rodzaj zabudowy', how='left')
   
    # rok budowy            
    df['build_year'] = df['rok budowy'].fillna(-1).astype('int')   
    df["build_year_norm"] = df["build_year"].map(normalize_build_year)
    
    df['security'] = df['system alarmowy'] | df['rolety antywłamaniowe'] | df['drzwi / okna antywłamaniowe']
    
    cat_feats = {         
        "materiał budynku": "build_material_cat",
        "okna": "window_cat",
        "stan wykończenia": "property_completion_cat",
        "rodzaj zabudowy": "property_type_cat",
        "ogrzewanie": "property_heating_cat",
        "forma własności": "own_property_cat"
         }    
    
    for feat_name, feat_new_name in cat_feats.items():    
        df[feat_new_name] = categorize_and_export_as_json(df, feat_name)
      
        #OHE
        df_dummies = pd.get_dummies(df[feat_name])
        df_dummies.columns = ['{0}_{1}'.format(feat_new_name, x) for x in df_dummies.columns]
        df = pd.concat([df, df_dummies], axis=1)     
    

    print('Done')    
    return df

In [136]:
df_fe = feature_engineering(df_pre)

model_predict_data/cat_dict_loc0.txt saved.
model_predict_data/cat_dict_loc1.txt saved.
model_predict_data/cat_dict_loc2.txt saved.
model_predict_data/cat_dict_loc3.txt saved.
model_predict_data/cat_dict_loc4.txt saved.
model_predict_data/cat_dict_loc01.txt saved.
model_predict_data/cat_dict_loc012.txt saved.
model_predict_data/cat_dict_loc12.txt saved.
model_predict_data/cat_dict_city.txt saved.
model_predict_data/cat_dict_county.txt saved.
model_predict_data/cat_dict_province.txt saved.


  df[item] = df['loc12'] == item
  df['floor_num'] = df['floor'].map(lambda x: floors_dict.get(x, x)).fillna(-10).astype('int')
  df['floors_in_building_num'] = df['floors_in_building'].map(normalize_floors_in_building)


groupby_city_price.csv saved.
groupby_county_price.csv saved.
model_predict_data/cat_dict_is_primary_market_rooms.txt saved.
model_predict_data/cat_dict_is_primary_market_city.txt saved.
model_predict_data/cat_dict_is_primary_market_rodzaj zabudowy.txt saved.
groupby_is_primary_market_rooms_price_m2.csv saved.
groupby_is_primary_market_rodzaj zabudowy_price_m2.csv saved.
model_predict_data/cat_dict_materiał budynku.txt saved.
model_predict_data/cat_dict_okna.txt saved.
model_predict_data/cat_dict_stan wykończenia.txt saved.
model_predict_data/cat_dict_rodzaj zabudowy.txt saved.
model_predict_data/cat_dict_ogrzewanie.txt saved.
model_predict_data/cat_dict_forma własności.txt saved.
Done


In [137]:
model = train_model(df_fe)
export_model(model, 'test_model')

 Number of feats: 190

FEATS: 
 ['rooms', 'czynsz', 'is_primary_market', 'system alarmowy', 'rolety antywłamaniowe', 'drzwi / okna antywłamaniowe', 'area_num', 'area_norm', 'area_num_log', 'area_per_room', 'province_city', 'city_area', 'city_population', 'city_density', 'province_population', 'province_men_population', 'province_women_population', 'loc0_cat', 'loc1_cat', 'loc2_cat', 'loc3_cat', 'loc4_cat', 'loc01_cat', 'loc012_cat', 'loc12_cat', 'city_cat', 'county_cat', 'province_cat', 'Poznań', 'big_city', 'Wrocław', 'Kraków', 'Kalisz', 'Gdynia', 'Opole', 'Szczecin', 'Gdańsk', 'Sopot', 'Częstochowa', 'Olsztyn', 'Łódź', 'Warszawa', 'Katowice', 'GdańskJasień', 'kołobrzeskiKołobrzeg', 'WarszawaMokotów', 'WarszawaBielany', 'ŁódźŚródmieście', 'WarszawaOchota', 'GdańskStare Miasto', 'KrakówNowa Huta', 'BydgoszczFordon', 'świdnickiŚwidnica', 'WarszawaBiałołęka', 'ToruńChełmińskie Przedmieście', 'GdańskMorena', 'BydgoszczBartodzieje', 'Zielona Góra', 'WarszawaWilanów', 'WrocławKrzyki', 'Krak

'Done'