# Вебинар 2. Предобработка данных.

**Подключение библиотек и скриптов**

In [8]:
import numpy as np
import pandas as pd

**Пути к директориям и файлам**

In [9]:
# Input CSV with the raw housing data (read with pd.read_csv in the loading section).
DATASET_PATH = 'housing.csv'
# Output CSV that receives the cleaned and encoded frame at the end of the notebook.
PREPARED_DATASET_PATH = 'housing_prepared.csv'

### Загрузка данных

**Описание датасета**

Статистические данные о ряде домов в Калифорнии, основанные на переписи 1990 года.

* **longitude** - долгота
* **latitude** - широта
* **housing_median_age** - средний возраст дома
* **total_rooms** - общее количество комнат
* **total_bedrooms** - общее количество спален
* **population** - количество проживающих
* **households** - домохозяйства
* **ocean_proximity** - близость океана
* **median_income** - средний доход
* **median_house_value** - средняя стоимость дома

In [51]:
# Load the raw dataset into the notebook-wide frame `df` and preview the first rows.
df=pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,4


In [11]:
# (rows, columns) of the loaded frame.
df.shape

(20640, 11)

### Приведение типов данных

In [50]:
# List the dtype of every column.
# NOTE(review): the stored output of this cell already shows the one-hot
# `ocean_proximity_*` columns and no `id`, so it was captured after the encoding
# step near the end of the notebook; re-run top to bottom to refresh it.
df.dtypes

longitude                     float64
latitude                      float64
housing_median_age            float64
total_rooms                   float64
total_bedrooms                float64
population                    float64
households                    float64
median_income                 float64
median_house_value            float64
ocean_proximity_<1H OCEAN       uint8
ocean_proximity_INLAND          uint8
ocean_proximity_ISLAND          uint8
ocean_proximity_NEAR BAY        uint8
ocean_proximity_NEAR OCEAN      uint8
dtype: object

In [13]:
# Check how the text column `ocean_proximity` is typed.
df['ocean_proximity'].dtype

dtype('O')

In [14]:
# Demonstrate an explicit cast: make sure `id` is stored as an integer column.
df = df.astype({'id': int})

In [15]:
# Confirm the dtype of `id` after the cast.
df['id'].dtype

dtype('int64')

In [16]:
# Store `id` as text; as an object column it is left out of the numeric
# summaries produced by df.describe() later on.
df = df.astype({'id': str})
df['id'].dtype

dtype('O')

### Обзор количественных переменных

In [17]:
# Quantitative features: keep only the float64 columns and preview them.
df_num = df.select_dtypes(include='float64')
df_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [18]:
# Summary statistics of the numeric columns; `count` is the number of non-missing
# values, and min/max help to spot implausible entries.
df_num.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,19918.0,20640.0,20433.0,20041.0,20640.0,20640.0,20640.0
mean,-119.471242,35.036934,28.65363,2635.763081,537.870553,1425.418243,499.53968,3.870671,206855.816909
std,5.041408,94.903955,12.576796,2181.615252,421.38507,1135.185798,382.329753,1.899822,115395.615874
min,-124.35,-13534.03,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,786.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1165.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1726.0,605.0,4.74325,264725.0
max,122.03,1327.13,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Обзор номинативных переменных

In [19]:
# Nominal (object-typed) columns. They get their own variable so that the numeric
# subset `df_num` is not overwritten with non-numeric data under a misleading name.
df_cat = df.select_dtypes(include=['object'])
df_cat.head()

Unnamed: 0,ocean_proximity,id
0,NEAR BAY,0
1,NEAR BAY,1
2,NEAR BAY,2
3,NEAR BAY,3
4,NEAR BAY,4


In [20]:
# Frequency of every category; unexpected labels (placeholders, typos) show up here.
df['ocean_proximity'].value_counts()

<1H OCEAN     9127
INLAND        6542
NEAR OCEAN    2655
NEAR BAY      2288
-               23
ISLAND           5
Name: ocean_proximity, dtype: int64

### Обработка пропусков

In [21]:
# Median of the non-missing values (NaN is skipped by default); used as the fill value.
df['total_bedrooms'].median()

435.0

In [22]:
# Entries of total_bedrooms that are missing; the index labels identify the rows
# to look at again after imputation.
missing_total_bedroom = df['total_bedrooms'].loc[lambda bedrooms: bedrooms.isna()]
missing_total_bedroom.head(5)

290   NaN
341   NaN
538   NaN
563   NaN
696   NaN
Name: total_bedrooms, dtype: float64

In [23]:
# Impute missing total_bedrooms with the column median.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

**total_bedrooms**

In [24]:
# Label-based slice (both ends inclusive) around row 290, one of the rows that was
# missing before imputation, to verify that it now holds the median.
df.loc[280:295,'total_bedrooms']

280     195.0
281     336.0
282     317.0
283    2048.0
284     260.0
285     746.0
286     440.0
287     322.0
288     177.0
289     128.0
290     435.0
291     194.0
292     397.0
293     349.0
294     342.0
295     533.0
Name: total_bedrooms, dtype: float64

In [25]:
# Re-check the median after imputation.
df['total_bedrooms'].median()

435.0

**housing_median_age**

In [26]:
# Impute missing housing_median_age with the column median.
df['housing_median_age'] = df['housing_median_age'].fillna(df['housing_median_age'].median())

**population**

In [27]:
# Impute missing population with the column median.
df['population'] = df['population'].fillna(df['population'].median())

**ocean_proximity**

In [28]:
# Most frequent category (first element of the mode); used as the replacement
# for the '-' placeholder.
df['ocean_proximity'].mode()[0]

'<1H OCEAN'

In [29]:
# First ten rows whose category is the '-' placeholder.
df[df['ocean_proximity'].eq('-')].head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
1153,-121.46,39.54,14.0,5549.0,1000.0,1822.0,919.0,2.9562,142300.0,-,1153
2435,-119.59,36.57,19.0,1733.0,303.0,911.0,281.0,3.5987,131700.0,-,2435
2636,-124.15,40.59,39.0,1186.0,238.0,539.0,212.0,2.0938,79600.0,-,2636
5980,-117.74,34.1,26.0,2723.0,604.0,1847.0,498.0,2.6779,136000.0,-,5980
6373,-118.02,34.15,44.0,2419.0,437.0,1045.0,432.0,3.875,280800.0,-,6373
7677,-118.09,33.92,33.0,879.0,181.0,547.0,169.0,5.3146,168600.0,-,7677
9664,-120.08,41.79,34.0,1355.0,262.0,434.0,178.0,2.0903,56100.0,-,9664
9941,-122.25,38.17,34.0,778.0,137.0,406.0,136.0,4.2955,121300.0,-,9941
10358,-117.67,33.6,29.0,1213.0,171.0,565.0,170.0,7.2592,314800.0,-,10358
11098,-117.88,33.83,25.0,1785.0,248.0,750.0,251.0,6.8407,266700.0,-,11098


In [30]:
# Replace the '-' placeholder with the most frequent category.
df['ocean_proximity'] = df['ocean_proximity'].replace('-', df['ocean_proximity'].mode()[0])

In [31]:
# Category frequencies after the replacement; '-' should no longer be listed.
df['ocean_proximity'].value_counts()

<1H OCEAN     9150
INLAND        6542
NEAR OCEAN    2655
NEAR BAY      2288
ISLAND           5
Name: ocean_proximity, dtype: int64

In [32]:
# Sanity check: select the remaining missing population values (expected to be empty
# once the column has been imputed).
df.loc[df['population'].isnull(),'population']

Series([], Name: population, dtype: float64)

### Обработка выбросов

In [33]:
# Summary statistics after imputation; inspect min/max for implausible values.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.471242,35.036934,28.665746,2635.763081,536.838857,1417.860562,499.53968,3.870671,206855.816909
std,5.041408,94.903955,12.355019,2181.615252,419.391878,1119.445348,382.329753,1.899822,115395.615874
min,-124.35,-13534.03,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,19.0,1447.75,297.0,797.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1165.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,643.25,1701.0,605.0,4.74325,264725.0
max,122.03,1327.13,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


**longitude**

In [34]:
# The dataset covers California, where longitude is negative; list rows that violate this.
df[df['longitude']>=0]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
3479,118.51,34.29,29.0,1287.0,194.0,525.0,187.0,6.4171,319300.0,<1H OCEAN,3479
5904,118.43,34.29,39.0,1769.0,410.0,1499.0,390.0,3.1212,153500.0,<1H OCEAN,5904
8405,118.36,33.93,40.0,1625.0,500.0,2036.0,476.0,2.6298,156500.0,<1H OCEAN,8405
8636,118.41,33.88,43.0,2492.0,449.0,1033.0,437.0,7.9614,500001.0,<1H OCEAN,8636
13051,121.29,38.61,17.0,13553.0,2474.0,6544.0,2359.0,3.9727,132700.0,INLAND,13051
15263,117.27,33.02,21.0,2144.0,340.0,928.0,344.0,5.798,286100.0,NEAR OCEAN,15263
17085,0.0,37.47,33.0,1266.0,415.0,1991.0,334.0,2.92,202800.0,NEAR OCEAN,17085
17359,0.0,34.88,4.0,3680.0,559.0,1678.0,569.0,5.0639,201700.0,<1H OCEAN,17359
18551,122.03,36.96,28.0,1607.0,421.0,926.0,385.0,2.425,216100.0,NEAR OCEAN,18551
19423,0.0,37.69,5.0,9601.0,1639.0,4449.0,1575.0,4.5332,195500.0,INLAND,19423


In [35]:
# Positive longitudes are treated as values with a lost minus sign: flip them.
df.loc[df['longitude'] > 0, 'longitude'] *= -1

In [36]:
# Repeat the check: after the sign flip only exact zeros can still match.
df[df['longitude']>=0]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
17085,0.0,37.47,33.0,1266.0,415.0,1991.0,334.0,2.92,202800.0,NEAR OCEAN,17085
17359,0.0,34.88,4.0,3680.0,559.0,1678.0,569.0,5.0639,201700.0,<1H OCEAN,17359
19423,0.0,37.69,5.0,9601.0,1639.0,4449.0,1575.0,4.5332,195500.0,INLAND,19423


In [37]:
# A longitude of exactly 0 is treated as a missing value. The replacement median is
# computed from the remaining (valid) rows only, so the zeros being replaced do not
# influence the fill value.
df.loc[df['longitude'] == 0, 'longitude'] = df.loc[df['longitude'] != 0, 'longitude'].median()

In [38]:
# Final check: no row should have a non-negative longitude any more.
df[df['longitude']>=0]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id


**latitude**

In [39]:
# Rows whose latitude falls outside the [0, 60] sanity range.
df[df['latitude'].lt(0) | df['latitude'].gt(60)]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
8283,-118.13,-13534.03,45.0,1016.0,172.0,361.0,163.0,7.5,434500.0,NEAR OCEAN,8283
12772,-121.42,1327.13,29.0,2217.0,536.0,1203.0,507.0,1.9412,73100.0,INLAND,12772


In [40]:
# Replace latitudes outside the [0, 60] sanity range with the median of the rows
# that are inside it, so the outliers themselves do not influence the fill value.
df.loc[(df['latitude'] < 0) | (df['latitude'] > 60), 'latitude'] = (
    df.loc[(df['latitude'] >= 0) & (df['latitude'] <= 60), 'latitude'].median()
)

In [41]:
# Summary statistics after the coordinate fixes; min/max of longitude and latitude
# should now be plausible.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569308,35.631673,28.665746,2635.763081,536.838857,1417.860562,499.53968,3.870671,206855.816909
std,2.003455,2.135854,12.355019,2181.615252,419.391878,1119.445348,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,19.0,1447.75,297.0,797.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1165.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,643.25,1701.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Отбор и построение новых признаков

**Исключаем признак "id"**

In [42]:
# Drop the identifier by name instead of by position: slicing off "the last column"
# only works while `id` happens to be last and would silently remove a different
# column if this cell were run a second time. Dropping by name fails loudly instead.
df = df.drop(columns=['id'])

**Преобразуем категориальный признак "ocean_proximity" в несколько бинарных**

Для представления качественных признаков в модели можно вводить бинарные (фиктивные, dummy) переменные, которые принимают значение 1, если данный качественный признак присутствует в наблюдении, и значение 0 при его отсутствии.
https://studopedia.ru/9_148075_fiktivnie-peremennie-i-osobennosti-ih-ispolzovaniya-v-modelyah.html

In [43]:
# One-hot encode only the categorical feature, named explicitly so that any other
# text column that happens to be present is not expanded by accident.
# The dtype is pinned to uint8 so the indicators are 0/1 regardless of the pandas
# default (newer pandas versions produce booleans).
df = pd.get_dummies(df, columns=['ocean_proximity'], dtype=np.uint8)

In [44]:
# Preview the frame with the new indicator columns.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [45]:
# Summary statistics of the final frame; the mean of an indicator column is the
# share of rows in that category.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569308,35.631673,28.665746,2635.763081,536.838857,1417.860562,499.53968,3.870671,206855.816909,0.443314,0.316957,0.000242,0.110853,0.128634
std,2.003455,2.135854,12.355019,2181.615252,419.391878,1119.445348,382.329753,1.899822,115395.615874,0.496788,0.465302,0.015563,0.313957,0.334802
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,0.0,0.0,0.0,0.0,0.0
25%,-121.8,33.93,19.0,1447.75,297.0,797.0,280.0,2.5634,119600.0,0.0,0.0,0.0,0.0,0.0
50%,-118.49,34.26,29.0,2127.0,435.0,1165.0,409.0,3.5348,179700.0,0.0,0.0,0.0,0.0,0.0
75%,-118.01,37.71,37.0,3148.0,643.25,1701.0,605.0,4.74325,264725.0,1.0,1.0,0.0,0.0,0.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0,1.0,1.0,1.0,1.0,1.0


### Сохранение результатов

In [46]:
# Persist the prepared frame; the row index is not written to the file.
df.to_csv(path_or_buf=PREPARED_DATASET_PATH, index=False)