In [1]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('housing.csv')
df.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,sub_area,area_m,...,metro_min_walk,metro_km_walk,mkad_km,kremlin_km,green_part_1000,prom_part_1000,office_count_1000,trc_count_1000,leisure_count_1000,price_doc
0,38,18.0,1.0,14.0,panel,1971.0,1.0,9.0,Dmitrovskoe,7126815.0,...,27.084184,2.257015,2.442781,14.856442,17.16,10.33,2,4,0,5150000
1,41,17.0,14.0,16.0,panel,1978.0,1.0,10.0,Savelovskoe,2641243.0,...,13.719174,1.143264,11.528984,5.323699,0.9,39.48,17,3,0,5980000
2,41,20.0,11.0,14.0,panel,1989.0,1.0,8.0,Krjukovo,10842310.0,...,276.453594,23.0378,20.828299,38.988909,7.54,6.64,0,2,0,4600000
3,38,19.0,6.0,17.0,panel,1986.0,1.0,8.0,Brateevo,7587523.0,...,14.08566,1.173805,2.885041,14.921056,24.97,0.0,0,5,0,5650000
4,58,37.0,3.0,9.0,panel,1968.0,3.0,6.0,Novogireevo,4395333.0,...,5.455795,0.45465,1.920884,11.812614,3.46,5.41,0,4,4,9300000


видим, что признаков 36, и понадобится отобрать среди них нужные, т.к. признаки могут иметь корреляцию, а нам нужно избежать мультиколлинеарности

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8894 entries, 0 to 8893
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   full_sq             8894 non-null   int64  
 1   life_sq             8894 non-null   float64
 2   floor               8894 non-null   float64
 3   max_floor           8894 non-null   float64
 4   material            8894 non-null   object 
 5   build_year          8859 non-null   float64
 6   num_room            8894 non-null   float64
 7   kitch_sq            8894 non-null   float64
 8   sub_area            8894 non-null   object 
 9   area_m              8894 non-null   float64
 10  green_zone_part     8894 non-null   float64
 11  indust_part         8894 non-null   float64
 12  preschool           8894 non-null   int64  
 13  school              8894 non-null   int64  
 14  healthcare          8894 non-null   int64  
 15  shopping            8894 non-null   int64  
 16  office

видим, что кол-во пропущенных значений незначительно, поэтому можем убрать их и  продолжать работать с датасетом

In [4]:
df = df.dropna(how='any').reset_index(drop=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8848 entries, 0 to 8847
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               8848 non-null   int64  
 1   full_sq             8848 non-null   int64  
 2   life_sq             8848 non-null   float64
 3   floor               8848 non-null   float64
 4   max_floor           8848 non-null   float64
 5   material            8848 non-null   object 
 6   build_year          8848 non-null   float64
 7   num_room            8848 non-null   float64
 8   kitch_sq            8848 non-null   float64
 9   sub_area            8848 non-null   object 
 10  area_m              8848 non-null   float64
 11  green_zone_part     8848 non-null   float64
 12  indust_part         8848 non-null   float64
 13  preschool           8848 non-null   int64  
 14  school              8848 non-null   int64  
 15  healthcare          8848 non-null   int64  
 16  shoppi

In [5]:
for column in df.columns: # посмотрим на кол-во уникальных знач-й, чтобы для нормализации убрать категор признаки
    print(column, df[column].nunique())

index 8848
full_sq 159
life_sq 114
floor 36
max_floor 43
material 5
build_year 105
num_room 11
kitch_sq 40
sub_area 144
area_m 144
green_zone_part 144
indust_part 130
preschool 13
school 14
healthcare 7
shopping 16
office 30
radiation 2
detention 2
young 144
work 143
elder 144
0_6_age 144
7_14_age 144
metro_min_avto 6457
metro_km_avto 6457
metro_min_walk 6457
metro_km_walk 6457
mkad_km 6459
kremlin_km 6459
green_part_1000 2948
prom_part_1000 2347
office_count_1000 75
trc_count_1000 18
leisure_count_1000 23
price_doc 1005


In [6]:
for column in df.columns:
    if df[column].nunique() < 15:
        print(column, df[column].unique())

material ['panel' 'breezeblock' 'brick' 'mass concrete plus brick' 'mass concrete']
num_room [ 1.  3.  4.  2.  5.  6. 10.  0.  8.  7. 17.]
preschool [ 4  2  6  1  5  0  7  3  8 13 11 10  9]
school [ 4  2  6  5  7  0  8  1  9  3 14 13 11 10]
healthcare [1 2 0 3 4 6 5]
radiation ['no' 'yes']
detention ['no' 'yes']


In [7]:
df.select_dtypes('object') # посмотрим на тип object

Unnamed: 0,material,sub_area,radiation,detention
0,panel,Dmitrovskoe,no,no
1,panel,Savelovskoe,no,no
2,panel,Krjukovo,no,no
3,panel,Brateevo,yes,no
4,panel,Novogireevo,yes,no
...,...,...,...,...
8843,breezeblock,Pechatniki,yes,yes
8844,panel,Mar'ino,no,yes
8845,panel,Severnoe Butovo,no,no
8846,breezeblock,Ivanovskoe,yes,no


In [8]:
cat_cols = ['radiation', 'detention', 'sub_area', 'material']

In [9]:
one_hot_encoder = OneHotEncoder(sparse_output=False)

In [10]:
pd.DataFrame(one_hot_encoder.fit_transform(df[cat_cols]), columns=one_hot_encoder.get_feature_names_out())

Unnamed: 0,radiation_no,radiation_yes,detention_no,detention_yes,sub_area_Ajeroport,sub_area_Akademicheskoe,sub_area_Alekseevskoe,sub_area_Altuf'evskoe,sub_area_Arbat,sub_area_Babushkinskoe,...,sub_area_Vyhino-Zhulebino,sub_area_Zamoskvorech'e,sub_area_Zapadnoe Degunino,sub_area_Zjablikovo,sub_area_Zjuzino,material_breezeblock,material_brick,material_mass concrete,material_mass concrete plus brick,material_panel
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8843,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8844,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8845,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8846,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
df_oh = pd.concat([df.drop(cat_cols, axis=1), pd.DataFrame(one_hot_encoder.fit_transform(df[cat_cols]), columns=one_hot_encoder.get_feature_names_out())], axis=1)

In [12]:
X = df_oh.drop('price_doc', axis=1)
y = df_oh.price_doc

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [14]:
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression().fit(X_train, y_train)

In [15]:
from sklearn.metrics import r2_score
lin_model.score(X_test, y_test)

0.592999899995169

In [16]:
r2_score(y_test, lin_model.predict(X_test))

0.592999899995169

In [17]:
poly = PolynomialFeatures() # попробую сделать признаки полиномиальными, мб они лучше опишут данные

In [18]:
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

In [None]:
poly_model = LinearRegression().fit(X_poly_train, y_train)

In [None]:
poly_model.score(X_poly_test, y_test)

In [None]:
lin_model.score(X_test, y_test)

In [None]:
X.shape

In [None]:
X_poly_train.shape

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
f_statistic, p_value = f_regression(X_poly_train, y_train)
selector = SelectKBest(f_regression, k=125)
best_k_train_X = pd.DataFrame(selector.fit_transform(X_poly_train, y_train), columns=selector.get_feature_names_out())
best_k_test_X = pd.DataFrame(selector.transform(X_poly_test), columns=selector.get_feature_names_out())

In [None]:
best_features_model = LinearRegression().fit(best_k_train_X, y_train)

In [None]:
best_features_model.score(best_k_test_X, y_test)