In [102]:
import pandas as pd
import numpy as np
import random

In [103]:
def process_rooms_number(x):
    '''Функция заполнения стобца количество комнат целыми числами'''
    if pd.isna(x):
        return 1
    
    if isinstance(x, int):
        return x
    
    if x.isdigit():
        return int(x)
   
    if x == 'Студия':
        return 1
    
    if x == 'Своб. планировка':
        return 1
    
    if x == '> 9':
        return 10

    return 1

def prediction(data, type_of_house_dict, district_dict, floor_dict, floors_in_house_dict, rooms_number_dict, \
               w = [0.2, 0.2, 0.2, 0.2, 0.2]):
    '''
    Функция получения предсказания цены по переданному датафрейму,
    полученным словарям для разных характеристик и весам 
    '''
    prediction = []
    
    for i in range(data.shape[0]):
        
        price_type = type_of_house_dict[data.type_of_house.iloc[i]] \
        if data.type_of_house.iloc[i] in type_of_house_dict.keys() else mean_sq_meter_price
        
        price_district = district_dict[data.district.iloc[i]] \
        if data.district.iloc[i] in district_dict.keys() else mean_sq_meter_price
        
        price_floor = floor_dict[data.floor.iloc[i]] \
        if data.floor.iloc[i] in floor_dict.keys() else mean_sq_meter_price
        
        price_floors_in_house = floors_in_house_dict[data.floors_in_house.iloc[i]] \
        if data.floors_in_house.iloc[i] in floors_in_house_dict.keys() else mean_sq_meter_price
        
        price_room = rooms_number_dict[data.rooms_number.iloc[i]] \
        if data.rooms_number.iloc[i] in rooms_number_dict.keys() else mean_sq_meter_price
        
        prediction.append(round((price_type * w[0] + price_district * w[1] + price_floor * w[2] + \
                         price_room * w[3] + price_floors_in_house * w[4]) * data.area.iloc[i]))
        
    return prediction

In [96]:
# Считаем данные
data_train = pd.read_csv('real_estate_novosibirsk.csv')
data_test = pd.read_table('dataset_521000_13.txt', delimiter=';')

# Удалим дубликаты и записи с пропусками
data_train = data_train.drop_duplicates(subset=['item_id'], keep='last')
data_train = data_train.dropna(subset=['area'])
data_train = data_train.dropna(subset=['type_of_house'])

# Заполним кол-во комнат целыми числами
data_train['rooms_number'] = data_train['rooms_number'].apply(process_rooms_number).copy()

# Удаляем 2% самых высоких и самых низких цен
data_train = data_train[(data_train.price.quantile(0.02) < data_train.price) & (data_train.price < data_train.price.quantile(0.99))]

# Ограничим все первыми 30 этажами
# data_train = data_train[data_train.floor <= 30]

# Посчитаем среднюю цену среди всех квартир
mean_sq_meter_price = (data_train.price / data_train.area).mean()

# Создадим столбец с ценой за квадратный метр в каждой квартире
data_train['price_m2'] = data_train['price'] / data_train['area']

#Словари для учета специфики конкретной квартиры
# Создадим словарь тип дома: средняя цена
type_of_house_dict = data_train.groupby('type_of_house')['price_m2'].median().round().to_dict()

# Создадим словарь район: медианная цена
district_dict = data_train.groupby('district')['price_m2'].median().to_dict()

# Создадим словарь этаж: средняя цена
floor_dict = data_train.groupby('floor')['price_m2'].median().round().to_dict()

# Создадим словарь этаж: средняя цена
floors_in_house_dict = data_train.groupby('floors_in_house')['price_m2'].median().round().to_dict()

# Создадим словарь количество комнат: средняя цена
rooms_number_dict = data_train.groupby('rooms_number')['price_m2'].median().round().to_dict()

In [50]:
data_train

Unnamed: 0,area,area_raw,item_id,type_of_house,floor,floors_in_house,rooms_number,price,district,price_order_id,price_m2
2,91.0,91.0,198518500076,Кирпичный,3.0,16.0,3,5100000.0,Кировский,3,56043.956044
3,18.7,18.7,257626750244,Кирпичный,4.0,5.0,1,1300000.0,Ленинский,1,69518.716578
6,86.0,86.0,367907500565,Кирпичный,4.0,4.0,4,3500000.0,Кировский,3,40697.674419
15,56.5,56.5,456658500306,Кирпичный,4.0,5.0,2,2025000.0,Дзержинский,3,35840.707965
18,29.0,29.0,483174000997,Панельный,3.0,5.0,1,1450000.0,Советский,3,50000.000000
...,...,...,...,...,...,...,...,...,...,...,...
124566,39.0,39.0,831037250887,Кирпичный,11.0,25.0,1,3580000.0,Дзержинский,3,91794.871795
124569,33.0,33.0,831536250075,Кирпичный,13.0,24.0,1,3900000.0,Центральный,3,118181.818182
124572,33.0,33.0,832255750867,Кирпичный,10.0,10.0,1,2950000.0,Заельцовский,3,89393.939394
124574,51.0,51.0,833043000375,Панельный,4.0,10.0,2,3300000.0,Октябрьский,2,64705.882353


In [99]:
# Получим предсказания на трейне
pred = prediction(data_train, type_of_house_dict, district_dict, floor_dict, floors_in_house_dict, rooms_number_dict)

# Метрика на train
MAPE = round(((data_train.price - pred) / data_train.price).abs().mean(), 3)
MAPE

0.205

In [105]:
def random_five_digit():
    randomlist = random.sample(range(1, 100), 5)
    return [round(i / sum(randomlist), 4) for i in randomlist ]

In [85]:
random_five_digit()

[0.26666666666666666,
 0.2,
 0.06666666666666667,
 0.13333333333333333,
 0.3333333333333333]

In [106]:
min_mape = 1
w_optimum = [0.2, 0.2, 0.2, 0.2, 0.2]
for _ in range(100):
    w = random_five_digit()
    pred = prediction(data_train, type_of_house_dict, district_dict, floor_dict, floors_in_house_dict, rooms_number_dict, w)
    MAPE = round(((data_train.price - pred) / data_train.price).abs().mean(), 4)
    print(f'MAPE = {MAPE}, w = {w}')
    if MAPE < min_mape:
        min_mape = MAPE
        w_optimum = w
        #print(f'MAPE = {MAPE}, w = {w}')

MAPE = 0.2062, w = [0.119, 0.3849, 0.2738, 0.2063, 0.0159]
MAPE = 0.2058, w = [0.1946, 0.3297, 0.2486, 0.1892, 0.0378]
MAPE = 0.209, w = [0.2416, 0.1468, 0.2141, 0.2966, 0.1009]
MAPE = 0.2049, w = [0.2719, 0.1287, 0.0673, 0.2632, 0.269]
MAPE = 0.201, w = [0.3281, 0.1858, 0.1028, 0.0672, 0.3162]
MAPE = 0.2024, w = [0.1013, 0.212, 0.2595, 0.1203, 0.307]
MAPE = 0.1995, w = [0.0167, 0.2222, 0.0278, 0.2333, 0.5]
MAPE = 0.2027, w = [0.229, 0.2761, 0.0168, 0.2694, 0.2088]
MAPE = 0.2021, w = [0.2834, 0.2713, 0.0891, 0.1579, 0.1984]
MAPE = 0.2033, w = [0.2865, 0.2661, 0.1871, 0.117, 0.1433]
MAPE = 0.2052, w = [0.3349, 0.0377, 0.0519, 0.217, 0.3585]
MAPE = 0.2043, w = [0.3481, 0.1, 0.2296, 0.0481, 0.2741]
MAPE = 0.2044, w = [0.1858, 0.2699, 0.0354, 0.3319, 0.177]
MAPE = 0.2022, w = [0.1617, 0.2561, 0.1941, 0.1456, 0.2426]
MAPE = 0.2048, w = [0.2532, 0.1039, 0.3247, 0.039, 0.2792]
MAPE = 0.2018, w = [0.2852, 0.2969, 0.1016, 0.1367, 0.1797]
MAPE = 0.2059, w = [0.2554, 0.0087, 0.0606, 0.2641, 0.411

In [109]:
# Получим предсказания на тесте
pred_test = prediction(data_test, type_of_house_dict, district_dict, floor_dict, floors_in_house_dict, rooms_number_dict, w_optimum)

# Запишем предсказния в файл
answer = pd.Series(pred_test)
answer.to_csv('solution.csv', header=False, index=False)