In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2
import warnings
from itertools import combinations
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [2]:

def add_stats(df, distr_stat_dr, distr_stat_r, mean_price):
    """Attach the two mean-price features to ``df`` through left joins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``DistrictId`` and ``Rooms``.
    distr_stat_dr : pandas.DataFrame
        Lookup keyed by ``DistrictId`` and ``Rooms``; supplies ``mean_price_dr``.
    distr_stat_r : pandas.DataFrame
        Lookup keyed by ``Rooms``; supplies ``mean_price_r``.
    mean_price : float
        Fallback used when a row's ``Rooms`` value is missing from
        ``distr_stat_r``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``mean_price_dr`` and ``mean_price_r`` appended and
        their gaps filled.
    """
    enriched = df.merge(distr_stat_dr, how='left', on=['DistrictId', 'Rooms'])
    enriched = enriched.merge(distr_stat_r, how='left', on='Rooms')

    # Fallback chain: (district, rooms) mean -> rooms mean -> global mean.
    rooms_mean = enriched['mean_price_r'].fillna(mean_price)
    enriched['mean_price_r'] = rooms_mean
    enriched['mean_price_dr'] = enriched['mean_price_dr'].fillna(rooms_mean)
    return enriched


def add_cat_fts(df, cat_fts=('Ecology_2', 'Ecology_3', 'Shops_2')):
    """Encode the given categorical columns as 0/1 integers.

    A cell becomes ``1`` when its value equals the string ``'B'`` and ``0``
    for anything else.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``cat_fts``.
    cat_fts : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns replaced by their integer
        encoding. The input frame is left untouched.
    """
    # Work on a copy: writing into the caller's frame would leave it encoded,
    # and a second call on that same frame would then turn every value into 0.
    encoded = df.copy()
    for col in cat_fts:
        encoded[col] = (encoded[col] == 'B').astype(int)
    return encoded


def drop_columns(df, col):
    """Return ``df`` without the column label(s) given in ``col``.

    ``col`` may be a single label or a list of labels; the input frame is not
    modified.
    """
    return df.drop(columns=col)


def add_normsl(df, arr_norm):
    """Left-join each lookup table in ``arr_norm`` onto ``df``.

    Every lookup table is expected to have its join key as the first column
    and to contain a value column named ``qty_<key>``. Rows of ``df`` whose
    key is absent from the lookup receive the smallest ``qty_<key>`` value
    found in that lookup.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every key column used by the lookup tables.
    arr_norm : iterable of pandas.DataFrame
        Lookup tables, each shaped ``[<key>, qty_<key>, ...]``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one ``qty_<key>`` column added per lookup table. When
        ``arr_norm`` is empty the input frame itself is returned.
    """
    for lookup in arr_norm:
        # Read the key name from the column index rather than from the first
        # row, so an empty lookup table does not raise IndexError.
        key = lookup.columns[0]
        qty_col = f'qty_{key}'

        df = pd.merge(df, lookup, on=[key], how='left')
        # Unseen keys get the rarest observed value instead of NaN.
        df[qty_col] = df[qty_col].fillna(lookup[qty_col].min())
    return df


def prepare_data(df, distr_stat_dr, distr_stat_r, mean_price, arr_norm):
    """Run the full feature-preparation pipeline on ``df``.

    Steps, in order: drop ``Healthcare_1``, encode the categorical columns
    with ``add_cat_fts``, join the ``qty_*`` lookups from ``arr_norm`` with
    ``add_normsl``, then join the mean-price features with ``add_stats``.

    Returns the prepared frame.
    """
    return (
        df
        .pipe(drop_columns, ['Healthcare_1'])
        .pipe(add_cat_fts)
        .pipe(add_normsl, arr_norm)
        .pipe(add_stats, distr_stat_dr, distr_stat_r, mean_price)
    )


def model_tr(df_train, df_valid, feats):
    """Fit a random-forest regressor on ``feats`` and predict both splits.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain ``feats`` and the ``Price`` target.
    df_valid : pandas.DataFrame
        Frame to predict on; must contain ``feats``.
    feats : sequence of str
        Column labels used as model inputs.

    Returns
    -------
    tuple
        ``(fitted_model, train_predictions, valid_predictions)``.
    """
    forest = RF(
        n_estimators=1500,
        max_depth=20,
        max_leaf_nodes=800,
        max_features=2,
        random_state=42,  # fixed seed keeps repeated runs comparable
    )
    x_train = df_train.loc[:, feats]
    forest.fit(x_train, df_train['Price'])

    train_pred = forest.predict(x_train)
    valid_pred = forest.predict(df_valid.loc[:, feats])
    return forest, train_pred, valid_pred


def gen_arr_tr(feats, excl_col):
    """Build every feature subset of ``feats`` once ``excl_col`` is removed.

    Parameters
    ----------
    feats : iterable of str
        Candidate feature names.
    excl_col : iterable of str
        Names that must never appear in a subset. Names not present in
        ``feats`` are ignored.

    Returns
    -------
    list of list of str
        All combinations of the remaining names, ordered by size from the
        empty list up to the full set, so ``2 ** n`` entries for ``n``
        distinct remaining names.
    """
    excluded = tuple(excl_col)
    # Filter instead of list.remove(): remove() drops only the first match,
    # which would leave duplicates of an excluded name in the candidate list.
    kept = [name for name in feats if name not in excluded]

    # One flat comprehension; sum(list_of_lists, []) re-copies the growing
    # accumulator on every step and is quadratic in the number of subsets.
    return [
        list(combo)
        for size in range(len(kept) + 1)
        for combo in combinations(kept, size)
    ]


def find_optim_trates(train, valid, cols, excl_col, t_par):
    """Train a model either on all of ``cols`` or on the best subset of them.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Prepared training and validation frames; both must contain ``Price``.
    cols : sequence of str
        Candidate feature names.
    excl_col : iterable of str
        Names removed from ``cols`` before subsets are generated (used in
        ``'f'`` mode only).
    t_par : str
        ``'s'`` fits a single model on ``cols`` as given.
        ``'f'`` fits one model per non-empty subset of ``cols`` and keeps the
        one with the highest validation R^2.

    Returns
    -------
    tuple
        ``(model, features, failed_features, r2_train, r2_valid)``.
        ``failed_features`` lists every feature name that was part of a
        subset whose fit or scoring raised. In ``'f'`` mode, if no subset
        could be fitted, ``model`` is ``None``, ``features`` is
        ``list(cols)`` and both scores are ``0``.

    Raises
    ------
    ValueError
        If ``t_par`` is neither ``'f'`` nor ``'s'``.
    """
    # Fail fast on an unknown mode; otherwise the caller would get a None
    # model and only crash later when trying to predict with it.
    if t_par not in ('f', 's'):
        raise ValueError(f"t_par must be 'f' or 's', got {t_par!r}")

    err_arr = []

    if t_par == 's':
        model, pred_train, pred_valid = model_tr(train, valid, cols)
        return (
            model,
            cols,
            err_arr,
            r2(train['Price'], pred_train),
            r2(valid['Price'], pred_valid),
        )

    opt_model, opt_feats = None, list(cols)
    r2_t_max, r2_v_max = 0, 0

    # The power set is only needed for the exhaustive search, so build it here
    # rather than unconditionally.
    for f in tqdm(gen_arr_tr(cols, excl_col)):
        if not f:
            # The empty subset cannot be fitted; skip it instead of relying on
            # the exception handler below.
            continue
        try:
            model, pred_train, pred_valid = model_tr(train, valid, f)
            r2_train = r2(train['Price'], pred_train)
            r2_valid = r2(valid['Price'], pred_valid)
        except Exception:
            # Best-effort search: remember which features were involved in the
            # failing subset and carry on with the next candidate.
            for name in f:
                if name not in err_arr:
                    err_arr.append(name)
            continue

        # Accept the first successful fit unconditionally so that a model is
        # still returned when every validation R^2 is zero or negative.
        if opt_model is None or r2_valid > r2_v_max:
            opt_model, opt_feats, r2_t_max, r2_v_max = model, f, r2_train, r2_valid

    return opt_model, opt_feats, err_arr, r2_t_max, r2_v_max


In [3]:
# Raw string: '\K' is not a valid escape sequence in a normal string literal.
DATA_DIR = r'D:\Kurs_project_task'

data = pd.read_csv(f'{DATA_DIR}/train.csv')
train, valid = train_test_split(data, test_size=0.3, random_state=42)

# All lookup tables below are computed from the training split only and then
# applied unchanged to the validation and test frames.
distr_stat_dr = (
    train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean()
    .rename(columns={'Price': 'mean_price_dr'})
)
distr_stat_r = (
    train.groupby(['Rooms'], as_index=False)[['Price']].mean()
    .rename(columns={'Price': 'mean_price_r'})
)

# Relative frequency of each value, stored as [<col>, qty_<col>].
# rename_axis + reset_index(name=...) yields these column names regardless of
# how the installed pandas version labels the value_counts() result.
col_norm = ['DistrictId', 'Social_1', 'Social_3']
norm_info = [
    train[col].value_counts(normalize=True).rename_axis(col).reset_index(name=f'qty_{col}')
    for col in col_norm
]

mean_price = train['Price'].mean()

train = prepare_data(train, distr_stat_dr, distr_stat_r, mean_price, norm_info)
valid = prepare_data(valid, distr_stat_dr, distr_stat_r, mean_price, norm_info)

# NOTE(review): 'Helthcare_2' is spelled exactly as in the run that produced
# the recorded output below; confirm against the CSV header before changing it.
colmns = ('DistrictId', 'Square', 'Helthcare_2', 'HouseYear', 'Ecology_1', 'qty_Social_3', 'qty_DistrictId')

# t_par: 'f' searches every feature subset of colmns, 's' fits once on colmns as given.
model, feats, err_feats, r2_tr_max, r2_val_max = find_optim_trates(train, valid, colmns, ['Price', 'Id'], 's')
print(feats, '\n', err_feats)
print(r2_tr_max, r2_val_max)

test = pd.read_csv(f'{DATA_DIR}/test.csv')
test = prepare_data(test, distr_stat_dr, distr_stat_r, mean_price, norm_info)
test['Price'] = model.predict(test.loc[:, feats])
test.loc[:, ['Id', 'Price']].to_csv('PSubbotin_predictions.csv', index=False)

('DistrictId', 'Square', 'Helthcare_2', 'HouseYear', 'Ecology_1', 'qty_Social_3', 'qty_DistrictId') 
 []
0.9208640904909332 0.7255182673114586
