## Линейные модели для регрессии

<p>Файл CSV содержит почасовую/ежедневную сводку погоды для города Сегед (Szeged), Венгрия, за период с 2006 по 2016 год.</p>

<p>Поля, доступные в почасовых записях:</p>

- форматированная дата
- облачно/ясно/...
- осадки
- температура
- ощущаемая температура
- влажность
- скорость ветра
- направление ветра (градусы)
- видимость (км)
- облачность (в данных столбец назван «Loud Cover»)
- давление

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score

In [24]:
# Load the weather observations; each row is labelled by its
# 'Formatted Date' timestamp string, which becomes the DataFrame index.
filename = 'weatherHistory.csv'
df = pd.read_csv(
    filename,
    index_col='Formatted Date',
)
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0_level_0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
Formatted Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


Некоторые данные представлены в строковом формате, но их можно закодировать числами:

Рассмотрим данные второго столбца (Summary) и соберём все различные значения

In [25]:
# Collect every distinct 'Summary' label as a plain Python list.
# Series.unique() returns the values in order of first appearance, which is
# the order the previous manual "append if not seen yet" loop produced, but
# without rescanning the result list for every row.
summary_values = df['Summary'].unique().tolist()

In [26]:
# Order the distinct labels alphabetically and display them.
summary_values = sorted(summary_values)
summary_values

['Breezy',
 'Breezy and Dry',
 'Breezy and Foggy',
 'Breezy and Mostly Cloudy',
 'Breezy and Overcast',
 'Breezy and Partly Cloudy',
 'Clear',
 'Dangerously Windy and Partly Cloudy',
 'Drizzle',
 'Dry',
 'Dry and Mostly Cloudy',
 'Dry and Partly Cloudy',
 'Foggy',
 'Humid and Mostly Cloudy',
 'Humid and Overcast',
 'Humid and Partly Cloudy',
 'Light Rain',
 'Mostly Cloudy',
 'Overcast',
 'Partly Cloudy',
 'Rain',
 'Windy',
 'Windy and Dry',
 'Windy and Foggy',
 'Windy and Mostly Cloudy',
 'Windy and Overcast',
 'Windy and Partly Cloudy']

Каждому наименованию поставим в соответствие числовое значение

In [27]:
# Hand-assigned numeric code for every distinct 'Summary' label.
# Codes increase with the alphabetical order of the labels; labels that share
# a leading word ('Breezy ...', 'Dry ...', 'Humid ...', 'Windy ...') are packed
# into fractional steps next to each other.
# NOTE(review): the labels are nominal categories, so a linear model will read
# this numbering as an ordering/distance between them — confirm this is intended.
s_v = {
    # Breezy family
    'Breezy': 1.0,
    'Breezy and Dry': 1.2,
    'Breezy and Foggy': 1.4,
    'Breezy and Mostly Cloudy': 1.6,
    'Breezy and Overcast': 1.8,
    'Breezy and Partly Cloudy': 2,

    'Clear': 3,
    'Dangerously Windy and Partly Cloudy': 4,
    'Drizzle': 5,

    # Dry family
    'Dry': 6.3,
    'Dry and Mostly Cloudy': 6.6,
    'Dry and Partly Cloudy': 6.9,

    'Foggy': 8,

    # Humid family
    'Humid and Mostly Cloudy': 9.3,
    'Humid and Overcast': 9.6,
    'Humid and Partly Cloudy': 9.9,

    'Light Rain': 11,
    'Mostly Cloudy': 12,
    'Overcast': 13,
    'Partly Cloudy': 14,
    'Rain': 15,

    # Windy family
    'Windy': 16,
    'Windy and Dry': 16.2,
    'Windy and Foggy': 16.4,
    'Windy and Mostly Cloudy': 16.6,
    'Windy and Overcast': 16.8,
    'Windy and Partly Cloudy': 17,
}

Создаем новый столбец с числовыми значениями

In [28]:
# Translate each row's 'Summary' label into its numeric code from s_v.
# A label missing from s_v raises KeyError instead of being silently skipped.
df['Summary_Nums'] = [
    s_v[label]
    for label in df['Summary']
]

In [29]:
# Preview the first rows to check that the Summary_Nums column was added.
df.head()

Unnamed: 0_level_0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Summary_Nums
Formatted Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,14.0
2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,14.0
2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,12.0
2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,14.0
2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,12.0


Аналогичная процедура со столбцом Осадки

In [30]:
# Collect every distinct 'Precip Type' value, missing values included.
# Series.unique() keeps first-appearance order and reports all missing entries
# as a single NaN, regardless of whether they are the same Python object.
precip_type = df['Precip Type'].unique().tolist()
precip_type

['rain', 'snow', nan]

In [31]:
# Numeric code for each precipitation type.
# The np.nan key stands for rows with no recorded precipitation type.
p_t = {'rain': 1, 'snow': 2, np.nan: 0}

In [32]:
# Encode 'Precip Type' with the codes from p_t.
# Missing values are detected with notna() rather than looked up as a dict key:
# NaN != NaN, so p_t[value] only succeeds when the missing value happens to be
# the very same object as np.nan.
# Rows with a label absent from p_t stay NaN after map(), so the final cast to
# int64 fails loudly instead of producing a silent wrong code.
df['Precip Type_Nums'] = (
    df['Precip Type']
    .map(p_t)
    .where(df['Precip Type'].notna(), p_t[np.nan])
    .astype('int64')
)

In [33]:
# Preview the first rows to check that the Precip Type_Nums column was added.
df.head()

Unnamed: 0_level_0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Summary_Nums,Precip Type_Nums
Formatted Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,14.0,1
2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,14.0,1
2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,12.0,1
2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,14.0,1
2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,12.0,1


Удаляем ненужные столбцы

In [34]:
# Build the modelling table without the raw text columns: 'Summary' and
# 'Precip Type' now have numeric counterparts, and the free-text
# 'Daily Summary' is left out altogether. df itself is not modified.
final_data = df.drop(columns=['Daily Summary', 'Summary', 'Precip Type'])

In [35]:
# Preview the first rows of the table that will be fed to the models.
final_data.head()

Unnamed: 0_level_0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Summary_Nums,Precip Type_Nums
Formatted Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-04-01 00:00:00.000 +0200,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,14.0,1
2006-04-01 01:00:00.000 +0200,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,14.0,1
2006-04-01 02:00:00.000 +0200,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,12.0,1
2006-04-01 03:00:00.000 +0200,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,14.0,1
2006-04-01 04:00:00.000 +0200,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,12.0,1


# Построение классической регрессии (линейная регрессия)

In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
# Target is the apparent temperature; every remaining column is a feature.
# Both are converted to plain NumPy arrays for scikit-learn.
target_column = 'Apparent Temperature (C)'
y = final_data[target_column].values
X = final_data.drop(columns=[target_column]).values

In [41]:
# Model under evaluation: a LinearRegression instance with default settings.
model = LinearRegression()
# Shuffled 10-fold splitter; the fixed random_state keeps the folds
# reproducible, so every model below is scored on the same splits.
kFold = KFold(n_splits=10, shuffle=True, random_state=7)

In [42]:
# Metric used to score each fold: mean squared error.
# scikit-learn exposes it negated ('neg_mean_squared_error') so that a larger
# score always means a better model; the mean printed below is therefore
# negative, and values closer to zero are better.
scoring = 'neg_mean_squared_error'

# Cross-validate the model on the kFold splits and report the mean fold score.
results = cross_val_score(
    model,
    X,
    y,
    cv=kFold,
    scoring=scoring,
)
print('Среднеквадратичная ошибка: ', results.mean())

Среднеквадратичная ошибка:  -1.1482364925487931


# Регрессия с регуляризацией (регуляризация — способ борьбы с переобучением модели)

## Ridge Regression

In [43]:
from sklearn.linear_model import Ridge

In [46]:
# Ridge regression with default hyperparameters.
model = Ridge()

# Cross-validate on the same kFold splits and metric as the previous model.
# The score is the negated MSE, so the printed mean is negative and values
# closer to zero are better.
results = cross_val_score(
    model,
    X,
    y,
    cv=kFold,
    scoring=scoring,
)
print('Среднеквадратичная ошибка: ', results.mean())

Среднеквадратичная ошибка:  -1.1482364775066454


## Lasso Regression (L1-регуляризация: удаление незначимых признаков)

In [47]:
from sklearn.linear_model import Lasso

In [48]:
# Lasso regression with default hyperparameters.
model = Lasso()

# Cross-validate on the same kFold splits and metric as the previous models.
# The score is the negated MSE, so the printed mean is negative and values
# closer to zero are better.
results = cross_val_score(
    model,
    X,
    y,
    cv=kFold,
    scoring=scoring,
)
print('Среднеквадратичная ошибка: ', results.mean())

Среднеквадратичная ошибка:  -1.217223198739984


## Elastic Net Regression

In [49]:
from sklearn.linear_model import ElasticNet

In [50]:
# Elastic Net regression with default hyperparameters.
model = ElasticNet()

# Cross-validate on the same kFold splits and metric as the previous models.
# The score is the negated MSE, so the printed mean is negative and values
# closer to zero are better.
results = cross_val_score(
    model,
    X,
    y,
    cv=kFold,
    scoring=scoring,
)
print('Среднеквадратичная ошибка: ', results.mean())

Среднеквадратичная ошибка:  -1.2031668573399943
