In [158]:
# Импорт необходимых библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Read the training and test sets from CSV files in the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Show the first rows of the training data
train_df.head()



Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [None]:
# Show the first rows of the test set
test_df.head()

In [159]:
# Keep the target column (Price) in its own variable for model training
train_labels = train_df.loc[:, 'Price']



In [160]:
# Impute missing LifeSquare and Healthcare_1 values into NEW columns (the
# original columns are left untouched, so this cell can be re-run safely).
#
# Every fill statistic is computed on the training set only and then applied to
# both frames. Filling the test set with its own median/mean would feed the
# model features that were prepared differently from the ones it was trained on.
life_square_median = train_df['LifeSquare'].median()
life_square_mean = train_df['LifeSquare'].mean()
# Ratio of the train mean living area to the train mean total area; used to
# estimate a missing LifeSquare from the row's own Square
life_to_total_ratio = life_square_mean / train_df['Square'].mean()

healthcare_median = train_df['Healthcare_1'].median()
healthcare_mean = train_df['Healthcare_1'].mean()

for frame in (train_df, test_df):
    # LifeSquare: median fill
    frame['LifeSquare_median'] = frame['LifeSquare'].fillna(life_square_median)
    # LifeSquare: mean fill
    frame['LifeSquare_mean'] = frame['LifeSquare'].fillna(life_square_mean)
    # LifeSquare: proportional to the total area (Square)
    frame['LifeSquare_proportional'] = frame['LifeSquare'].fillna(frame['Square'] * life_to_total_ratio)

    # Healthcare_1: median fill
    frame['Healthcare_1_median'] = frame['Healthcare_1'].fillna(healthcare_median)
    # Healthcare_1: mean fill
    frame['Healthcare_1_mean'] = frame['Healthcare_1'].fillna(healthcare_mean)

# Per-column missing counts after imputation; the original LifeSquare and
# Healthcare_1 columns still contain their NaNs at this point
print(train_df.isnull().sum())
print(test_df.isnull().sum())
train_df.head()

Id                            0
DistrictId                    0
Rooms                         0
Square                        0
LifeSquare                 2113
KitchenSquare                 0
Floor                         0
HouseFloor                    0
HouseYear                     0
Ecology_1                     0
Ecology_2                     0
Ecology_3                     0
Social_1                      0
Social_2                      0
Social_3                      0
Healthcare_1               4798
Helthcare_2                   0
Shops_1                       0
Shops_2                       0
Price                         0
LifeSquare_median             0
LifeSquare_mean               0
LifeSquare_proportional       0
Healthcare_1_median           0
Healthcare_1_mean             0
dtype: int64
Id                            0
DistrictId                    0
Rooms                         0
Square                        0
LifeSquare                 1041
KitchenSquare              

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,LifeSquare_median,LifeSquare_mean,LifeSquare_proportional,Healthcare_1_median,Healthcare_1_mean
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,,0,11,B,184966.93073,29.442751,29.442751,29.442751,900.0,1142.90446
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,240.0,1,16,B,300009.450063,40.049543,40.049543,40.049543,240.0,240.0
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,229.0,1,3,B,220925.908524,29.197612,29.197612,29.197612,229.0,229.0
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,1084.0,0,5,B,175616.227217,52.731512,52.731512,52.731512,1084.0,1084.0
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,2078.0,2,4,B,150226.531644,23.776169,23.776169,23.776169,2078.0,2078.0


In [161]:
# Build the imputed Healthcare_1 columns (median and mean variants).
# NOTE(review): this cell repeats the Healthcare_1 step that also appears in the
# preceding cell; consider deleting one of the two.
#
# The fill values come from the training set only and are applied to both
# frames, so train and test are imputed with the same numbers.
healthcare_median = train_df['Healthcare_1'].median()
healthcare_mean = train_df['Healthcare_1'].mean()

for frame in (train_df, test_df):
    # Median fill
    frame['Healthcare_1_median'] = frame['Healthcare_1'].fillna(healthcare_median)
    # Mean fill
    frame['Healthcare_1_mean'] = frame['Healthcare_1'].fillna(healthcare_mean)

train_df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,LifeSquare_median,LifeSquare_mean,LifeSquare_proportional,Healthcare_1_median,Healthcare_1_mean
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,,0,11,B,184966.93073,29.442751,29.442751,29.442751,900.0,1142.90446
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,240.0,1,16,B,300009.450063,40.049543,40.049543,40.049543,240.0,240.0
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,229.0,1,3,B,220925.908524,29.197612,29.197612,29.197612,229.0,229.0
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,1084.0,0,5,B,175616.227217,52.731512,52.731512,52.731512,1084.0,1084.0
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,2078.0,2,4,B,150226.531644,23.776169,23.776169,23.776169,2078.0,2078.0


In [162]:
# Remove the target column from the feature frame; errors='ignore' makes this a
# no-op when Price has already been removed
train_df = train_df.drop(columns=['Price'], errors='ignore')

# Print per-column missing-value counts, first for train_df and then for test_df
for frame in (train_df, test_df):
    print(frame.isnull().sum())

Id                            0
DistrictId                    0
Rooms                         0
Square                        0
LifeSquare                 2113
KitchenSquare                 0
Floor                         0
HouseFloor                    0
HouseYear                     0
Ecology_1                     0
Ecology_2                     0
Ecology_3                     0
Social_1                      0
Social_2                      0
Social_3                      0
Healthcare_1               4798
Helthcare_2                   0
Shops_1                       0
Shops_2                       0
LifeSquare_median             0
LifeSquare_mean               0
LifeSquare_proportional       0
Healthcare_1_median           0
Healthcare_1_mean             0
dtype: int64
Id                            0
DistrictId                    0
Rooms                         0
Square                        0
LifeSquare                 1041
KitchenSquare                 0
Floor                      

In [170]:
# Count missing values per column in train_df and print them under a header line
train_missing_by_column = train_df.isnull().sum()
print("Проверка на наличие пропущенных значений в train_df:")
print(train_missing_by_column)

Проверка на наличие пропущенных значений в train_df:
Id                            0
DistrictId                    0
Rooms                         0
Square                        0
LifeSquare                 2113
KitchenSquare                 0
Floor                         0
HouseFloor                    0
HouseYear                     0
Ecology_1                     0
Social_1                      0
Social_2                      0
Social_3                      0
Healthcare_1               4798
Helthcare_2                   0
Shops_1                       0
LifeSquare_median             0
LifeSquare_mean               0
LifeSquare_proportional       0
Healthcare_1_median           0
Healthcare_1_mean             0
Ecology_2_A                   0
Ecology_2_B                   0
Ecology_3_A                   0
Ecology_3_B                   0
Shops_2_A                     0
Shops_2_B                     0
dtype: int64


In [171]:
# One-hot encode the categorical columns of both frames with pd.get_dummies
train_df, test_df = (pd.get_dummies(frame) for frame in (train_df, test_df))
train_df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,LifeSquare_mean,LifeSquare_proportional,Healthcare_1_median,Healthcare_1_mean,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,...,29.442751,29.442751,900.0,1142.90446,False,True,False,True,False,True
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,...,40.049543,40.049543,240.0,240.0,False,True,False,True,False,True
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,...,29.197612,29.197612,229.0,229.0,False,True,False,True,False,True
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,...,52.731512,52.731512,1084.0,1084.0,False,True,False,True,False,True
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,...,23.776169,23.776169,2078.0,2078.0,False,True,False,True,False,True


In [172]:
# Keep only the columns that exist in both frames (inner join along the column axis)
aligned_train, aligned_test = train_df.align(test_df, axis=1, join='inner')
train_df, test_df = aligned_train, aligned_test

In [173]:
# Fill the remaining missing values with per-column medians.
# The medians are taken from the training frame (before it is filled) and used
# for BOTH frames, so the test set is imputed with the same values the model
# sees during training rather than with its own statistics.
train_medians = train_df.median()
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

In [174]:
# Total number of missing cells left in each frame
train_missing_total = train_df.isnull().sum().sum()
test_missing_total = test_df.isnull().sum().sum()

print("Проверка на наличие пропущенных значений в train_df после преобразования категориальных признаков:")
print(train_missing_total)
print("Проверка на наличие пропущенных значений в test_df после преобразования категориальных признаков:")
print(test_missing_total)

Проверка на наличие пропущенных значений в train_df после преобразования категориальных признаков:
0
Проверка на наличие пропущенных значений в test_df после преобразования категориальных признаков:
0


In [175]:
# Standardize the features: the scaler is fitted on the training frame and the
# same fitted transform is then applied to both train and test
scaler = StandardScaler()
scaler.fit(train_df)
train_df_scaled = scaler.transform(train_df)
test_df_scaled = scaler.transform(test_df)

In [None]:
# Show the first rows of train_df (the scaled values live in train_df_scaled,
# so this frame itself is not scaled)
train_df.head()

In [176]:


# Hold out 20% of the scaled training rows for validation; the fixed
# random_state makes the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df_scaled,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Show the first rows of train_df
train_df.head()

In [177]:
# Train a random forest regressor on the training split (seeded for reproducibility)
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)




In [178]:
# Predict on the validation split and report the R2 score against the true prices
y_valid_pred = model.predict(X=X_valid)
r2 = r2_score(y_true=y_valid, y_pred=y_valid_pred)
print(f'R2 на валидационной выборке: {r2}')

R2 на валидационной выборке: 0.7033581398024924


In [179]:
# Run the trained model on the scaled test features
test_predictions = model.predict(X=test_df_scaled)

In [180]:
# Build the submission table: one row per test record with its Id and predicted Price.
# Id is taken from test_df (the column is still present in the frame) instead of
# re-reading test.csv, so the ids always belong to the rows that were scored.
# NOTE(review): nothing in this cell writes a file; add
# submission.to_csv('submission.csv', index=False) if that is not done elsewhere.
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Price': test_predictions
})

In [181]:
# Print the first rows of the in-memory submission frame as a sanity check
submission_preview = submission.head()
print(submission_preview)

      Id          Price
0    725  157192.011793
1  15856  224723.945114
2   5480  181157.662410
3  15664  349214.354082
4  14275  146671.581299
