In [352]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump, load

In [353]:
flats_data = pd.read_excel('data/flats.xlsx')
euro_data = pd.read_excel('data/euro.xlsx')
flats_data['renovation'] = 0
euro_data['renovation'] = 1
merged_data = pd.merge(flats_data, euro_data, how='outer')

In [354]:
merged_data['is_apartment'] = merged_data['is_apartment'].fillna(0)
merged_data['all_data.object_info.living_area'] = merged_data['all_data.object_info.living_area'].fillna(merged_data["all_data.object_info.living_area"].mean())
merged_data['build_year'] = merged_data['build_year'].fillna(merged_data['build_year'].mean())

merged_data.dropna(subset=['price_sq', 'area'], inplace=True)

merged_data['city'], cities = pd.factorize(merged_data['city'])
merged_data['house_wall_type'], wall_types = pd.factorize(merged_data['house_wall_type'])

In [355]:
# Создание модели
X = merged_data[['city', 'lat', 'lon', 'area', 'rooms', 'floor', 'house_floors',
                 'renovation', 'build_year', 'house_wall_type', 'kitchen_area',
                 'all_data.object_info.living_area', 'is_apartment', ]]  # Признаки 
y = merged_data['price_sq']  # Целевая переменная

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()  
model.fit(X_train, y_train)
dump(model, 'model.pkl')

['model.pkl']

In [356]:
# Оценка производительности модели
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
mae = mean_absolute_error(y_pred, y_test)
r2 = r2_score(y_pred, y_test)
print("Среднеквадратичная ошибка:", mse)
print("Средняя абсолютная ошибка:", mae)
print("Коэффициент детерминации:", r2)

Среднеквадратичная ошибка: 1969877248.0805466
Средняя абсолютная ошибка: 31268.451106806755
Коэффициент детерминации: 0.6133949214790724


In [341]:
# Оценка коэффициентов модели
coefficients = pd.DataFrame({'Признак': X.columns, 'Коэффициент': model.coef_})
print(coefficients)

                             Признак   Коэффициент
0                               city  56193.953541
1                                lat -11810.311169
2                                lon   -942.917030
3                               area  -1320.243553
4                              rooms  -2316.741584
5                              floor    412.618133
6                       house_floors   1440.727546
7                         renovation  12158.343340
8                         build_year     35.941128
9                    house_wall_type  -1118.559531
10                      kitchen_area   1863.505239
11  all_data.object_info.living_area   -378.483762
12                      is_apartment  17593.542928


In [357]:
print(list(cities))
print(list(wall_types))

['Новосибирск', 'Казань', 'Санкт-Петербург', 'Москва', 'поселение Мосрентген', 'поселение Сосенское', 'поселение Московский']
['Монолитный', 'Кирпичный', 'Панельный', 'Иные', 'Кирпично-монолитный', 'Блочный', 'Монолитно-кирпичный', 'Железобетон', 'Смешанные', 'Деревянный']


In [358]:
model = load('model.pkl')
my_data = pd.DataFrame({'city': [cities.get_loc('Казань')],
                        'lat': [55.789472],
                        'lon': [49.115801],
                        'area': [75],
                        'rooms': [3],
                        'floor': [3],
                        'house_floors': [12],
                        'renovation': [_],
                        'build_year': [2000],
                        'house_wall_type': [wall_types.get_loc('Кирпичный')],
                        'kitchen_area': [15.0],
                        'all_data.object_info.living_area': [20.0],
                        'is_apartment': [0]
                        })

my_data['renovation'] = 0
y_pred = model.predict(my_data)  
no_renovation_value = round(y_pred[0] * my_data['area'][0])

my_data['renovation'] = 1
y_pred = model.predict(my_data)
renovation_value = round(y_pred[0] * my_data['area'][0])

difference = renovation_value - no_renovation_value

no_renovation_value = '{:,}'.format(no_renovation_value).replace(',', ' ')
renovation_value = '{:,}'.format(renovation_value).replace(',', ' ')
difference = '{:,}'.format(difference).replace(',', ' ')

print("Цена без евроремонта: ", no_renovation_value)
print("Цена c евроремонтом: ", renovation_value)
print("Разница в цене: ", difference)

Цена без евроремонта:  9 597 706
Цена c евроремонтом:  10 509 582
Разница в цене:  911 876
