In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('data/train.csv')

Проверяем данные

In [3]:
data.info

<bound method DataFrame.info of      LandContour  GarageYrBlt  FullBath  1stFlrSF  BsmtFinSF1  TotRmsAbvGrd  \
0            Lvl       2003.0         2       856         706             8   
1            Lvl       1976.0         2      1262         978             6   
2            Lvl       2001.0         2       920         486             6   
3            Lvl       1998.0         1       961         216             7   
4            Lvl       2000.0         2      1145         655             9   
...          ...          ...       ...       ...         ...           ...   
1455         Lvl       1999.0         2       953           0             7   
1456         Lvl       1978.0         2      2073         790             7   
1457         Lvl       1941.0         2      1188         275             9   
1458         Lvl       1950.0         1      1078          49             5   
1459         Lvl       1965.0         1      1256         830             6   

     ExterQual  Ope

In [4]:
data.head()

Unnamed: 0,LandContour,GarageYrBlt,FullBath,1stFlrSF,BsmtFinSF1,TotRmsAbvGrd,ExterQual,OpenPorchSF,Heating,Condition2,...,GarageCars,KitchenQual,KitchenAbvGr,GrLivArea,2ndFlrSF,CentralAir,BsmtQual,TotalBsmtSF,Fireplaces,SalePrice
0,Lvl,2003.0,2,856,706,8,Gd,61,GasA,Norm,...,2,Gd,1,1710,854,Y,Gd,856,0,208500
1,Lvl,1976.0,2,1262,978,6,TA,0,GasA,Norm,...,2,TA,1,1262,0,Y,Gd,1262,1,181500
2,Lvl,2001.0,2,920,486,6,Gd,42,GasA,Norm,...,2,Gd,1,1786,866,Y,Gd,920,1,223500
3,Lvl,1998.0,1,961,216,7,TA,35,GasA,Norm,...,3,Gd,1,1717,756,Y,TA,756,1,140000
4,Lvl,2000.0,2,1145,655,9,Gd,84,GasA,Norm,...,3,Gd,1,2198,1053,Y,Gd,1145,1,250000


In [5]:
data.isnull().sum()

LandContour      0
GarageYrBlt     81
FullBath         0
1stFlrSF         0
BsmtFinSF1       0
TotRmsAbvGrd     0
ExterQual        0
OpenPorchSF      0
Heating          0
Condition2       0
OverallQual      0
GarageCars       0
KitchenQual      0
KitchenAbvGr     0
GrLivArea        0
2ndFlrSF         0
CentralAir       0
BsmtQual        37
TotalBsmtSF      0
Fireplaces       0
SalePrice        0
dtype: int64

Заполняем пустые ячейки информацией

In [6]:
data['GarageYrBlt'] = data['GarageYrBlt'].fillna(-1)

In [7]:
data['BsmtQual'] = data['BsmtQual'].fillna('NA')

Переводим величины из некатегориальных в категориальные (исчисляемые)

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data['GarageYrBlt'] = label_encoder.fit_transform(data['GarageYrBlt'])

In [9]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [10]:
onehot_encoded = onehot_encoder.fit_transform(data[['LandContour', 'Heating', 'Condition2']])
new_columns = onehot_encoder.get_feature_names_out(['LandContour', 'Heating', 'Condition2'])
data = pd.concat([data, pd.DataFrame(onehot_encoded, columns=new_columns)], axis=1)
data = data.drop(['LandContour', 'Heating', 'Condition2'], axis=1)

In [11]:
kitchen_qual_mapping = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1
}

In [12]:
exter_qual_mapping = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1
}

In [13]:
central_air_mapping = {
    "Y": 1,
    "N": 0
}

In [14]:
bsmt_qual_mapping = {
    "Ex": 6,
    "Gd": 5,
    "TA": 4,
    "Fa": 3,
    "Po": 2,
    "NA": 1
}

In [15]:
data['KitchenQual'] = data['KitchenQual'].map(kitchen_qual_mapping)
data['ExterQual'] = data['ExterQual'].map(exter_qual_mapping)
data['CentralAir'] = data['CentralAir'].map(central_air_mapping)
data['BsmtQual'] = data['BsmtQual'].map(bsmt_qual_mapping)

Создаем выборки для тренировки и тестирования

In [16]:
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [17]:
X_train

Unnamed: 0,GarageYrBlt,FullBath,1stFlrSF,BsmtFinSF1,TotRmsAbvGrd,ExterQual,OpenPorchSF,OverallQual,GarageCars,KitchenQual,...,Heating_OthW,Heating_Wall,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn
1137,0,1,780,0,6,3,0,5,0,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1335,64,2,1334,686,6,3,16,6,2,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
459,37,1,979,185,5,3,0,5,1,4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
115,86,2,729,419,5,3,32,6,2,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
908,70,1,902,301,5,3,0,5,2,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924,67,2,1686,625,7,3,131,6,2,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1247,63,1,1034,539,6,3,0,6,3,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
271,41,1,1363,226,5,3,0,7,2,3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
474,87,2,1652,1196,6,4,48,8,2,5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Обучаем модель

In [18]:
model = LinearRegression()

model.fit(X_train, y_train)

Делаем предсказание

In [19]:
y_pred = model.predict(X_test)

In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Средняя абсолютная ошибка:", mae)
print("Средняя абсолютная процентная ошибка:", mape*100, "%")

Средняя абсолютная ошибка: 21870.88161942671
Средняя абсолютная процентная ошибка: 12.697327890166427 %


In [21]:
y_test

954     127500
326     324000
308      82500
1058    335000
416     149500
         ...  
297     239000
1340    123000
316     260000
318     260000
689     194700
Name: SalePrice, Length: 292, dtype: int64

In [22]:
y_pred

array([155264.74260226, 270659.6313074 ,  95925.12429005, 343043.30829067,
       148180.40480273, 282184.05988635, 198547.88350702, 213638.30329086,
       139833.17814524, 245339.02548135, 283659.76950322, 128916.12021051,
       210912.93019371, 165690.84902092,  96506.4536442 ,  98147.60471198,
       107426.77084669,  71386.45447118, 219849.07064158, 142879.93929504,
       153331.72903435, 199933.7447421 , 243284.32285333, 131489.87606685,
       210986.88008265, 218072.84534911, 106341.33228927, 111757.21015337,
        81119.57729756,  67727.97062946, 115447.77877028, 280463.35426812,
       198861.36665491, 112761.42626142, 104326.22606361, 172966.40401428,
       138837.27150071, 218526.60498757, 165924.74873504, 267357.64847181,
       141997.79512879,  98350.78387531, 259139.93721015, 104746.63522964,
       174304.70094069, 153180.97905773, 158747.58661379, 395931.75162663,
       224541.33726512, 111093.21920071,  87270.82774308, 238034.80158136,
       177531.78842464, 2

Редактируем данные на основе которых нужно получить предсказание

In [23]:
test_data = pd.read_csv('data/test.csv')

test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(-1)
test_data['BsmtQual'] = test_data['BsmtQual'].fillna('NA')

test_data['GarageYrBlt'] = label_encoder.fit_transform(test_data['GarageYrBlt'])

test_data = pd.concat([test_data, pd.DataFrame(onehot_encoded, columns=new_columns)], axis=1)
test_data = test_data.drop(['LandContour', 'Heating', 'Condition2'], axis=1)

test_data['KitchenQual'] = test_data['KitchenQual'].map(kitchen_qual_mapping)
test_data['ExterQual'] = test_data['ExterQual'].map(exter_qual_mapping)
test_data['CentralAir'] = test_data['CentralAir'].map(central_air_mapping)
test_data['BsmtQual'] = test_data['BsmtQual'].map(bsmt_qual_mapping)

# По какой-то причине программа добавляет дополнительные пустые строки в конце,
# чтобы в data и test_data их было одинаковое количество
test_data = test_data.iloc[:-2]  # Выбирает все строки, кроме последних

# В строке 662 (или 660) файла test.csv пропущено ряд переменных. Разумнее всего ее удалить, дабы не мешала
# Lvl,1946.0,1,896,,4,TA,0,GasA,Norm,4,1.0,TA,1,896,0,Y,,,0
# Это она

test_data['KitchenQual'] = test_data['KitchenQual'].fillna(0)
test_data['GarageCars'] = test_data['KitchenQual'].fillna(0)

Предсказываем цены

In [26]:
test_pred = model.predict(test_data)

Редактируем и публикуем результаты в отдельный файл

In [41]:
test_pred = pd.Series(test_pred)

test_pred_df = pd.DataFrame(test_pred, columns=['SalePrice'])

test_pred_df['Id'] = range(1, len(test_pred_df) + 1)
test_pred_df.set_index('Id', inplace=True)

test_pred_df.to_csv('data/answer/preds.csv')