# Вебинар 2. Предобработка данных.

**Подключение библиотек и скриптов**

In [1]:
import numpy as np
import pandas as pd

**Пути к директориям и файлам**

In [2]:
DATASET_PATH = '../housing.csv'
PREPARED_DAT  ASET_PATH = '../housing_prepared.csv'

## 1. Загрузка данных

**Описание задачи**

Цель - предсказать стоимость дома 

Зачем?  

_В банках, страховых компаниях:_
- Узнать истинную стоимость имущества (залога)
- Принять решение о выдаче ипотеки/страховки
- Принять рещшение о % по ипотеке/страховке
  
_На площадках объявлений (Авито, Циан, ...):_
- Найти недооцененные квартиры (~ выгодные предложения), показать их пользователям
- Показывать рыночную стоимость квартиры пользователям
- Для тех, кто продает квартиру, рекомендовать цену продажи
- Поиск фрода

_Для инвесторов в недвижимость:_
- Определять рыночную стоимость квартир
- Поиск недооцененных активов
- Торговля на рынке недвижимости

**Описание датасета**

Статистические данные о ряде домов в Калифорнии, основанные на переписи 1990 года.

* **longitude** - долгота
* **latitude** - широта
* **housing_median_age** - средний возраст дома
* **total_rooms** - общее количество комнат
* **total_bedrooms** - общее количество спален
* **population** - количество проживающих
* **households** - домохозяйства (семья)
* **ocean_proximity** - близость океана
* **median_income** - средний доход
* **median_house_value** - средняя стоимость дома

In [3]:
? pd.read_csv

In [7]:
df = pd.read_csv(DATASET_PATH )
df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,4
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,5
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,6
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY,7
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,8
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,9


In [8]:
df.shape

(20640, 11)

In [102]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'id'],
      dtype='object')

In [103]:
df.index

RangeIndex(start=0, stop=20640, step=1)

In [107]:
df[['longitude', 'latitude']].head(2)

Unnamed: 0,longitude,latitude
0,-122.23,37.88
1,-122.22,37.86


In [9]:
df[df['longitude'] > 10].head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
3479,118.51,34.29,29.0,1287.0,194.0,525.0,187.0,6.4171,319300.0,<1H OCEAN,3479
5904,118.43,34.29,39.0,1769.0,410.0,1499.0,390.0,3.1212,153500.0,<1H OCEAN,5904


In [10]:
df[(df['longitude'] > 10) | (df['latitude'] < 100)].head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1


In [14]:
df[(df['longitude'] > 10) & (df['latitude'] < 100)].head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
3479,118.51,34.29,29.0,1287.0,194.0,525.0,187.0,6.4171,319300.0,<1H OCEAN,3479
5904,118.43,34.29,39.0,1769.0,410.0,1499.0,390.0,3.1212,153500.0,<1H OCEAN,5904


In [15]:
df.loc[df['longitude'] > 10, 'median_income'].head(2)

3479    6.4171
5904    3.1212
Name: median_income, dtype: float64

In [16]:
df.loc[df['longitude'] > 10, ['median_income', 'median_house_value']].head(2)

Unnamed: 0,median_income,median_house_value
3479,6.4171,319300.0
5904,3.1212,153500.0


## 2. Приведение типов данных

In [17]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
id                      int64
dtype: object

In [18]:
type(df['id'])

pandas.core.series.Series

In [19]:
type(df.id)

pandas.core.series.Series

In [20]:
type(df['id'].values)

numpy.ndarray

In [21]:
df['id'].dtype

dtype('int64')

In [22]:
df['id'] = df['id'].astype(str)
df['id'].dtype

dtype('O')

### Обзор количественных переменных

In [24]:
df_num_features = df.select_dtypes(include=['float64', 'int64'])
df_num_features.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [25]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,19918.0,20640.0,20433.0,20041.0,20640.0,20640.0,20640.0
mean,-119.471242,35.036934,28.65363,2635.763081,537.870553,1425.418243,499.53968,3.870671,206855.816909
std,5.041408,94.903955,12.576796,2181.615252,421.38507,1135.185798,382.329753,1.899822,115395.615874
min,-124.35,-13534.03,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,786.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1165.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1726.0,605.0,4.74325,264725.0
max,122.03,1327.13,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


_Пример расчета статистик_

In [26]:
x = np.array([1,2,3,4,5])
x = np.sort(x)
x

array([1, 2, 3, 4, 5])

In [27]:
np.quantile(x, q=0.5)

3.0

In [30]:
np.quantile(df['total_rooms'], q=0.5)

2127.0

### Обзор категориальных переменных

In [29]:
df_obj_features = df.select_dtypes(include='object')
df_obj_features.head()

Unnamed: 0,ocean_proximity,id
0,NEAR BAY,0
1,NEAR BAY,1
2,NEAR BAY,2
3,NEAR BAY,3
4,NEAR BAY,4


In [31]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9127
INLAND        6542
NEAR OCEAN    2655
NEAR BAY      2288
-               23
ISLAND           5
Name: ocean_proximity, dtype: int64

In [32]:
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', '-', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [33]:
df['ocean_proximity'].nunique()

6

## 3. Обработка пропусков

In [34]:
df.shape[0]

20640

In [35]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age    722
total_rooms             0
total_bedrooms        207
population            599
households              0
median_income           0
median_house_value      0
ocean_proximity         0
id                      0
dtype: int64

**housing_median_age**

In [144]:
median = df['housing_median_age'].median()

df['housing_median_age'] = df['housing_median_age'].fillna(median)

In [38]:
median

29.0

**total_bedrooms**

In [39]:
median = df['total_bedrooms'].median()

df['total_bedrooms'] = df['total_bedrooms'].fillna(median)

**population**

In [40]:
median = df['population'].median()

df['population'] = df['population'].fillna(median)

**Все и сразу**

In [41]:
median = df[['population', 'housing_median_age', 'total_bedrooms']].median()
median

population            1165.0
housing_median_age      29.0
total_bedrooms         435.0
dtype: float64

In [42]:
df[['population', 'housing_median_age', 'total_bedrooms']] =\
    df[['population', 'housing_median_age', 'total_bedrooms']].fillna(median)

In [43]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
id                    0
dtype: int64

**ocean_proximity**

In [44]:
df['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

In [45]:
df['ocean_proximity'].mode()[0]

'<1H OCEAN'

In [47]:
df.replace({'ocean_proximity': 
                {'-': df['ocean_proximity'].mode()[0]}
           }, 
           inplace=True)

In [48]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9150
INLAND        6542
NEAR OCEAN    2655
NEAR BAY      2288
ISLAND           5
Name: ocean_proximity, dtype: int64

## 4. Обработка выбросов

In [52]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.471242,35.036934,28.665746,2635.763081,536.838857,1417.860562,499.53968,3.870671,206855.816909
std,5.041408,94.903955,12.355019,2181.615252,419.391878,1119.445348,382.329753,1.899822,115395.615874
min,-124.35,-13534.03,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,19.0,1447.75,297.0,797.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1165.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,643.25,1701.0,605.0,4.74325,264725.0
max,122.03,1327.13,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


**longitude**

Возможные значения longtitude (долгота) и latitude (широта) можно найти [здесь](https://dateandtime.info/ru/citycoordinates.php?id=5332748)

_Широта принимает значения от −90° до 90°. 0° – широта экватора; −90° – широта Южного полюса; 90° – широта Северного полюса. Положительные значения соответствуют северной широте (точки севернее экватора, сокращённо с.ш. или N); отрицательные – южной широте (точки южнее экватора, сокращённо ю.ш. или S).  
Долгота отсчитывается от нулевого меридиана (IERS Reference Meridian в системе WGS 84) и принимает значения от −180° до 180°. Положительные значения соответствуют восточной долготе (сокращённо в.д. или E); отрицательные – западной долготе (сокращённо з.д. или W)._

In [62]:
df[df['longitude'] >= 0]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,bedroom_share,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
3479,118.51,34.29,29.0,1287.0,194.0,525.0,187.0,6.4171,319300.0,<1H OCEAN,15.073815,0.407925,1,0,0,0,0
5904,118.43,34.29,39.0,1769.0,410.0,1499.0,390.0,3.1212,153500.0,<1H OCEAN,23.176936,0.847371,1,0,0,0,0
8405,118.36,33.93,40.0,1625.0,500.0,2036.0,476.0,2.6298,156500.0,<1H OCEAN,30.769231,1.252923,1,0,0,0,0
8636,118.41,33.88,43.0,2492.0,449.0,1033.0,437.0,7.9614,500001.0,<1H OCEAN,18.017657,0.414526,1,0,0,0,0
13051,121.29,38.61,17.0,13553.0,2474.0,6544.0,2359.0,3.9727,132700.0,INLAND,18.254261,0.482845,0,1,0,0,0
15263,117.27,33.02,21.0,2144.0,340.0,928.0,344.0,5.798,286100.0,NEAR OCEAN,15.858209,0.432836,0,0,0,0,1
17085,0.0,37.47,33.0,1266.0,415.0,1991.0,334.0,2.92,202800.0,NEAR OCEAN,32.780411,1.57267,0,0,0,0,1
17359,0.0,34.88,4.0,3680.0,559.0,1678.0,569.0,5.0639,201700.0,<1H OCEAN,15.190217,0.455978,1,0,0,0,0
18551,122.03,36.96,28.0,1607.0,421.0,926.0,385.0,2.425,216100.0,NEAR OCEAN,26.197884,0.576229,0,0,0,0,1
19423,0.0,37.69,5.0,9601.0,1639.0,4449.0,1575.0,4.5332,195500.0,INLAND,17.071138,0.463389,0,1,0,0,0


In [63]:
df.loc[df['longitude'] > 0, 'longitude'] * -1

3479    -118.51
5904    -118.43
8405    -118.36
8636    -118.41
13051   -121.29
15263   -117.27
18551   -122.03
Name: longitude, dtype: float64

In [97]:
df.loc[df['longitude'] > 0, 'longitude'] = df.loc[df['longitude'] > 0, 'longitude'] * -1

In [98]:
df.loc[df['longitude'] == 0, 'longitude'] = df['longitude'].median()

**latitude**

In [99]:
df[(df['latitude'] <= 0) | (df['latitude'] > 50)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms_x,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id,bedroom_share,population_per_room,-,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,total_rooms_y


Калифорния вытянута вдоль берега Тихого Океана между 32 и 42 гр. северной широты 114 и 124 западной долготы. 

In [100]:
df.loc[(df['latitude'] <= 0) | (df['latitude'] > 50), 'latitude'] = df['latitude'].median()

## 5. Отбор и построение новых признаков (фичей)

**Исключаем признак "id"**

In [69]:
df = df[df.columns[:-1]]

### 5.1 Количественные переменные

In [70]:
# Доля спален в общем кол-ве комнат
df['bedroom_share'] = df['total_bedrooms'] / df['total_rooms'] * 100

# Сколько человек в среднем живут в одной комнате
df['population_per_room'] = df['population'] / df['total_rooms']

### 5.2 Категориальные переменные

Неплохой обзор по работе с категориальными признаками можно посмотреть [здесь](https://dyakonov.org/2016/08/03/python-%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D0%B0%D0%BB%D1%8C%D0%BD%D1%8B%D0%B5-%D0%BF%D1%80%D0%B8%D0%B7%D0%BD%D0%B0%D0%BA%D0%B8/)

In [71]:
for i in df.columns: # перебираем все столбцы
    if str(df[i].dtype) == 'object': # если тип столбца - object
        print('='*10)
        print(i) # выводим название столбца
        print(set(df[i])) # выводим все его значения (но делаем set - чтоб значения не повторялись)
        print('\n') # выводим пустую строку

ocean_proximity
{'NEAR BAY', 'NEAR OCEAN', 'ISLAND', 'INLAND', '<1H OCEAN'}




**A) Бинарные (дамми) переменные**

In [72]:
df = pd.concat([df, pd.get_dummies(df['ocean_proximity'])], axis=1)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,<1H OCEAN.1,INLAND.1,ISLAND.1,NEAR BAY.1,NEAR OCEAN
0,122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,...,0.365909,0,0,0,1,0,0,0,1,0
1,122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,0.338217,0,0,0,1,0,0,0,1,0
2,122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,0.338105,0,0,0,1,0,0,0,1,0
3,122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,...,0.437991,0,0,0,1,0,0,0,1,0
4,122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,...,0.347265,0,0,0,1,0,0,0,1,0


**Б) Feature encoding / Target encoding**

In [73]:
df_cat = df.groupby('ocean_proximity')['total_rooms'].median()
df_cat = pd.DataFrame(df_cat)

df_cat

Unnamed: 0_level_0,total_rooms
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,2108.0
INLAND,2130.0
ISLAND,1675.0
NEAR BAY,2083.0
NEAR OCEAN,2197.0


In [58]:
df_cat.reset_index(inplace=True)

df_cat

Unnamed: 0,ocean_proximity,total_rooms
0,<1H OCEAN,2108.0
1,INLAND,2130.0
2,ISLAND,1675.0
3,NEAR BAY,2083.0
4,NEAR OCEAN,2197.0


In [60]:
df_cat.rename(columns={'total_rooms': 'median_rooms'},
             inplace=True)

df_cat.sort_values(by='median_rooms')

Unnamed: 0,ocean_proximity,median_rooms
2,ISLAND,1675.0
3,NEAR BAY,2083.0
0,<1H OCEAN,2108.0
1,INLAND,2130.0
4,NEAR OCEAN,2197.0


In [61]:
df['total_rooms'].median()

2127.0

In [101]:
df = df.merge(df_cat, on='ocean_proximity')

df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms_x,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,bedroom_share,population_per_room,-,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,total_rooms_y,total_rooms
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,...,14.659091,0.365909,0,0,0,0,1,0,2083.0,2083.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,15.579659,0.338217,0,0,0,0,1,0,2083.0,2083.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,12.951602,0.338105,0,0,0,0,1,0,2083.0,2083.0


## 5.3* А что дальше?

### latitude, longitude:

_Идея №1_

[Источник](https://medium.com/open-machine-learning-course/open-machine-learning-course-topic-6-feature-engineering-and-feature-selection-8b94f870706a)

If you have a small amount of data, enough time, and no desire to extract fancy features, you can use _reverse_geocoder_ from OpenStreetMap (OSM):
                   

In [86]:
#Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas
import plotly.express as px

In [103]:
dn=df.head(100)

In [104]:
fig = px.scatter_geo(data_frame=dn, scope='north america',lat='latitude',lon='longitude',
                     size='median_house_value', color='median_house_value', projection='hammer')
fig.update_layout(
        title_text = 'Дома калифорнии')
fig.show()

_Идея №2_

- Найти координаты центров городов, достопримечательностей, станций метро, ..
- Считать расстояние до <...>
- Количество <...> в радиусе 3 км
- ...

### Работа с категориальными признаками

Описание методов можно посмотреть [здесь](https://towardsdatascience.com/encoding-categorical-features-21a2651a065c)

Мало категориальных признаков? Можно **создать их!**
- [Feature discretization](https://towardsdatascience.com/an-introduction-to-discretization-in-data-science-55ef8c9775a2)
- [Feature binarization](https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781789808452/1/ch01lvl1sec17/binarization)

## 6. Сохранение результатов

In [77]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms_x,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,INLAND,ISLAND,NEAR BAY,<1H OCEAN,INLAND.1,ISLAND.1,NEAR BAY.1,NEAR OCEAN,total_rooms_y,total_rooms
0,122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,...,0,0,1,0,0,0,1,0,2083.0,2083.0
1,122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,0,0,1,0,0,0,1,0,2083.0,2083.0
2,122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,0,0,1,0,0,0,1,0,2083.0,2083.0
3,122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,...,0,0,1,0,0,0,1,0,2083.0,2083.0
4,122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,...,0,0,1,0,0,0,1,0,2083.0,2083.0


In [78]:
df.to_csv(PREPARED_DATASET_PATH, index=False, encoding='utf-8')

## 7**. Подготовка данных в реальном проекте

In [79]:
import numpy as np
import pandas as pd

In [80]:
class DataPipeline:
    """Подготовка исходных данных"""
    
    def __init__(self):
        """Параметры класса"""
        self.medians = None
        
    def fit(self, df):
        """Сохранение статистик"""
        
        # Расчетет медиан
        self.medians = df[['population', 'housing_median_age', 'total_bedrooms']].median()
        self.longitude_median = df['longitude'].median()
        self.latitude_median = df['latitude'].median()
        
    def transform(self, df):
        """Трансформация данных"""
        
        # !. Пропуски
        df[['population', 'housing_median_age', 'total_bedrooms']] =\
            df[['population', 'housing_median_age', 'total_bedrooms']].fillna(self.medians)
        
        
        # 2. Выбросы (outliers)
        df.loc[df['longitude'] > 0, 'longitude'] = df.loc[df['longitude'] > 0, 'longitude'] * -1
        df.loc[df['longitude'] == 0, 'longitude'] = self.longitude_median
        df.loc[(df['latitude'] <= 0) | (df['latitude'] > 50), 'latitude'] = self.latitude_median
        
        
        # 3. Новые фичи (features)
        
        # Доля спален в общем кол-ве комнат
        df['bedroom_share'] = df['total_bedrooms'] / df['total_rooms'] * 100

        # Сколько человек в среднем живут в одной комнате
        df['population_per_room'] = df['population'] / df['total_rooms']
        
        # Обработка категорий
        df = pd.concat([df, pd.get_dummies(df['ocean_proximity'])], axis=1)
        
        return df


In [82]:
DATASET_PATH = '../housing.csv'
PREPARED_DATASET_PATH = '../housing_prepared.csv'

In [83]:
df = pd.read_csv(DATASET_PATH)

pipe = DataPipeline()
pipe.fit(df) # расчет статисстик
df = pipe.transform(df)

df.to_csv(PREPARED_DATASET_PATH, index=False, encoding='utf-8')

In [91]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id,bedroom_share,population_per_room,-,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0,14.659091,0.365909,0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1,15.579659,0.338217,0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2,12.951602,0.338105,0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3,18.44584,0.437991,0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,4,17.209588,0.347265,0,0,0,0,1,0


In [182]:
df.isnull().sum()

longitude              0
latitude               0
housing_median_age     0
total_rooms            0
total_bedrooms         0
population             0
households             0
median_income          0
median_house_value     0
ocean_proximity        0
id                     0
bedroom_share          0
population_per_room    0
-                      0
<1H OCEAN              0
INLAND                 0
ISLAND                 0
NEAR BAY               0
NEAR OCEAN             0
dtype: int64