In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data.info

<bound method DataFrame.info of      LandContour  GarageYrBlt  FullBath  1stFlrSF  BsmtFinSF1  TotRmsAbvGrd  \
0            Lvl       2003.0         2       856         706             8   
1            Lvl       1976.0         2      1262         978             6   
2            Lvl       2001.0         2       920         486             6   
3            Lvl       1998.0         1       961         216             7   
4            Lvl       2000.0         2      1145         655             9   
...          ...          ...       ...       ...         ...           ...   
1455         Lvl       1999.0         2       953           0             7   
1456         Lvl       1978.0         2      2073         790             7   
1457         Lvl       1941.0         2      1188         275             9   
1458         Lvl       1950.0         1      1078          49             5   
1459         Lvl       1965.0         1      1256         830             6   

     ExterQual  Ope

In [4]:
data.head()

Unnamed: 0,LandContour,GarageYrBlt,FullBath,1stFlrSF,BsmtFinSF1,TotRmsAbvGrd,ExterQual,OpenPorchSF,Heating,Condition2,...,GarageCars,KitchenQual,KitchenAbvGr,GrLivArea,2ndFlrSF,CentralAir,BsmtQual,TotalBsmtSF,Fireplaces,SalePrice
0,Lvl,2003.0,2,856,706,8,Gd,61,GasA,Norm,...,2,Gd,1,1710,854,Y,Gd,856,0,208500
1,Lvl,1976.0,2,1262,978,6,TA,0,GasA,Norm,...,2,TA,1,1262,0,Y,Gd,1262,1,181500
2,Lvl,2001.0,2,920,486,6,Gd,42,GasA,Norm,...,2,Gd,1,1786,866,Y,Gd,920,1,223500
3,Lvl,1998.0,1,961,216,7,TA,35,GasA,Norm,...,3,Gd,1,1717,756,Y,TA,756,1,140000
4,Lvl,2000.0,2,1145,655,9,Gd,84,GasA,Norm,...,3,Gd,1,2198,1053,Y,Gd,1145,1,250000


In [5]:
data.isnull().sum()

LandContour      0
GarageYrBlt     81
FullBath         0
1stFlrSF         0
BsmtFinSF1       0
TotRmsAbvGrd     0
ExterQual        0
OpenPorchSF      0
Heating          0
Condition2       0
OverallQual      0
GarageCars       0
KitchenQual      0
KitchenAbvGr     0
GrLivArea        0
2ndFlrSF         0
CentralAir       0
BsmtQual        37
TotalBsmtSF      0
Fireplaces       0
SalePrice        0
dtype: int64

In [6]:
data['GarageYrBlt'] = data['GarageYrBlt'].fillna(-1)

In [7]:
data['BsmtQual'] = data['BsmtQual'].fillna('NA')

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data['GarageYrBlt'] = label_encoder.fit_transform(data['GarageYrBlt'])

In [9]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [10]:
onehot_encoded = onehot_encoder.fit_transform(data[['LandContour', 'Heating', 'Condition2']])
new_columns = onehot_encoder.get_feature_names_out(['LandContour', 'Heating', 'Condition2'])
data = pd.concat([data, pd.DataFrame(onehot_encoded, columns=new_columns)], axis=1)
data = data.drop(['LandContour', 'Heating', 'Condition2'], axis=1)

In [11]:
kitchen_qual_mapping = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1
}

In [12]:
exter_qual_mapping = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1
}

In [13]:
central_air_mapping = {
    "Y": 1,
    "N": 0
}

In [14]:
bsmt_qual_mapping = {
    "Ex": 6,
    "Gd": 5,
    "TA": 4,
    "Fa": 3,
    "Po": 2,
    "NA": 1
}

In [15]:
data['KitchenQual'] = data['KitchenQual'].map(kitchen_qual_mapping)
data['ExterQual'] = data['ExterQual'].map(exter_qual_mapping)
data['CentralAir'] = data['CentralAir'].map(central_air_mapping)
data['BsmtQual'] = data['BsmtQual'].map(bsmt_qual_mapping)

In [16]:
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [17]:
model = LinearRegression()

model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

In [19]:
y_test

954     127500
326     324000
308      82500
1058    335000
416     149500
         ...  
297     239000
1340    123000
316     260000
318     260000
689     194700
Name: SalePrice, Length: 292, dtype: int64

In [20]:
type(y_pred)

numpy.ndarray

In [21]:
type(y_test)

pandas.core.series.Series

In [22]:
y_pred = pd.Series(y_pred)

In [23]:
y_pred

0      155264.742602
1      270659.631307
2       95925.124290
3      343043.308291
4      148180.404803
           ...      
287    236137.099135
288    123487.922242
289    282334.132376
290    314139.517727
291    189957.301426
Length: 292, dtype: float64

In [30]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Средняя абсолютная ошибка:", mae)
print("Средняя квадратичная ошибка:", mse)
print("Средняя абсолютная процентная ошибка:", mape*100, "%")

Средняя абсолютная ошибка: 21870.88161942671
Средняя квадратичная ошибка: 1247658296.214129
Средняя абсолютная процентная ошибка: 12.697327890166427 %


In [26]:
y_test

954     127500
326     324000
308      82500
1058    335000
416     149500
         ...  
297     239000
1340    123000
316     260000
318     260000
689     194700
Name: SalePrice, Length: 292, dtype: int64

In [27]:
y_pred

0      155264.742602
1      270659.631307
2       95925.124290
3      343043.308291
4      148180.404803
           ...      
287    236137.099135
288    123487.922242
289    282334.132376
290    314139.517727
291    189957.301426
Length: 292, dtype: float64

In [None]:
x_test =