1. Обучите модель линейной регрессии на полиномиальных признаках. Чему равно значение метрик?
2. Выведите значения коэффициентов полученной модели. Обратите внимание на порядок величины (степени десяти) коэффициентов.
3. Постройте линейную регрессию с L1-регуляризацией (Lasso) на полиномиальных признаках. В качестве параметра alpha используйте значение по умолчанию, параметр max_iter установите в значение 2000. Чему равно значение метрик?
4. Постройте линейную регрессию с L2-регуляризацией на полиномиальных признаках. В качестве параметра alpha используйте значение по умолчанию. Чему равно значение метрик?
5. По желанию: подберите лучшее значение alpha (если оно есть).

In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Show the current working directory for reference.
cwd = os.getcwd()
print(cwd)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content


In [None]:
# Load the insurance dataset from the mounted Google Drive.
# The path is absolute (the drive is mounted at /content/drive), so the read
# does not depend on what the current working directory happens to be.
df = pd.read_csv('/content/drive/MyDrive/insurance.csv')
# Preview the first rows of the table.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [None]:
# Show the data type of each column.
df.dtypes

Unnamed: 0,0
age,int64
sex,object
bmi,float64
children,int64
smoker,object
region,object
charges,float64


In [None]:
# Encode the categorical features.
# sex: 'female' -> 0, any other value -> 1.
df['sex'] = df['sex'].ne('female').astype('int64')
# smoker: 'yes' -> 1, any other value -> 0.
df['smoker'] = df['smoker'].eq('yes').astype('int64')
# One-hot encode the categorical columns that are still non-numeric.
df = pd.get_dummies(df)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,False,False,False,True
1,18,1,33.77,1,0,1725.5523,False,False,True,False
2,28,1,33.0,3,0,4449.462,False,False,True,False
3,33,1,22.705,0,0,21984.47061,False,True,False,False
4,32,1,28.88,0,0,3866.8552,False,True,False,False


In [None]:
# Split the data into training and test parts (test_size=0.2, fixed seed).
y = df['charges']
X = df.drop(columns='charges')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each feature matrix.
print('Train:', X_train.shape)
print('Test:', X_test.shape)

Train: (1070, 9)
Test: (268, 9)


In [None]:
# Min-max scale the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build degree-2 polynomial features (without the bias column), again fitting
# on the training split only.
poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
X_train_scaled_poly = poly.fit_transform(X_train_scaled)
X_test_scaled_poly = poly.transform(X_test_scaled)
print(X_train_scaled_poly.shape)

(1070, 54)


In [None]:
# Helper that prints the quality metrics
def print_metrics(y_train, y_train_predict, y_test, y_test_predict):
    """Print R^2, MAE and MAPE for the training and the test predictions.

    Args:
        y_train: True target values of the training set.
        y_train_predict: Predicted values for the training set.
        y_test: True target values of the test set.
        y_test_predict: Predicted values for the test set.

    MAPE is shown as a percentage (the metric value is multiplied by 100).
    Nothing is returned; the report is written to standard output.
    """
    splits = (
        ('Train', y_train, y_train_predict),
        ('Test', y_test, y_test_predict),
    )
    for position, (label, y_true, y_pred) in enumerate(splits):
        if position:
            # Visual gap between the train group and the test group.
            print('\n')
        print('{} R^2: {:.5f}'.format(label, metrics.r2_score(y_true, y_pred)))
        print('{} MAE: {:.0f}'.format(label, metrics.mean_absolute_error(y_true, y_pred)))
        print('{} MAPE: {:.0f}'.format(label, metrics.mean_absolute_percentage_error(y_true, y_pred) * 100))

In [None]:
# Train plain linear regression on the polynomial features and predict
# for both splits.
lr = linear_model.LinearRegression().fit(X_train_scaled_poly, y_train)

y_train_predict, y_test_predict = map(
    lr.predict, (X_train_scaled_poly, X_test_scaled_poly)
)

print_metrics(y_train, y_train_predict, y_test, y_test_predict)


Train R^2: 0.84180
Train MAE: 2915
Train MAPE: 30


Test R^2: 0.86663
Test MAE: 2748
Test MAPE: 31


In [None]:
# Coefficient values of the linear regression: intercept (w0) and feature weights (wn)
print(f'w0:{lr.intercept_}')
print(f'wn:{lr.coef_}')

w0:-1552928108735083.2
wn:[-6.13187729e+16  8.41033718e+14  5.83238304e+15 -1.13904139e+16
 -6.00962542e+13  9.58667470e+14  7.01129182e+14  4.04850498e+14
  1.29733639e+15  8.36800000e+03  8.56000000e+02  1.06800000e+03
 -1.65600000e+03  1.64000000e+02  6.13187729e+16  6.13187729e+16
  6.13187729e+16  6.13187729e+16 -9.22451557e+13  6.59500000e+02
 -1.23675000e+03  2.44250000e+02 -7.48788562e+14 -7.48788562e+14
 -7.48788562e+14 -7.48788562e+14 -9.70562500e+03  1.55137500e+03
  5.48453125e+04 -5.83238304e+15 -5.83238304e+15 -5.83238304e+15
 -5.83238304e+15 -2.13100000e+03 -2.23400000e+03  1.13904139e+16
  1.13904139e+16  1.13904139e+16  1.13904139e+16  1.31600009e+14
 -7.15037545e+13 -7.15037545e+13 -7.15037545e+13 -7.15037545e+13
  5.94260639e+14  0.00000000e+00  0.00000000e+00  0.00000000e+00
  8.51798927e+14  0.00000000e+00  0.00000000e+00  1.14807761e+15
  0.00000000e+00  2.55591716e+14]


In [None]:
# Linear regression with L1 regularization (Lasso) on the polynomial features;
# alpha is left at its default and max_iter is set to 2000.
lr_lasso = linear_model.Lasso(max_iter=2000).fit(X_train_scaled_poly, y_train)

y_train_predict, y_test_predict = map(
    lr_lasso.predict, (X_train_scaled_poly, X_test_scaled_poly)
)

print_metrics(y_train, y_train_predict, y_test, y_test_predict)

Train R^2: 0.84178
Train MAE: 2890
Train MAPE: 29


Test R^2: 0.86681
Test MAE: 2719
Test MAPE: 30


In [None]:
# Intercept (w0) and feature weights (wn) of the Lasso model.
print(f'w0:{lr_lasso.intercept_}') # The coefficient values are considerably lower
print(f'wn:{lr_lasso.coef_}')

w0:2249.713524103523
wn:[ 3.24221653e+03 -4.92313816e+02  5.07612317e+03  4.58362056e+03
  1.97928569e+03  1.88025707e+02 -4.70775218e+02  5.83657101e+02
 -2.43543456e+02  8.29826751e+03  7.74581954e+02  8.44448593e+02
 -1.20775688e+03  1.34809915e+02 -1.04309726e+03 -0.00000000e+00
  1.16964229e+03  1.61765564e+03 -0.00000000e+00  5.05847740e+02
 -1.13425152e+03  2.65796922e+02 -1.22240479e+03  0.00000000e+00
  2.36213242e+02 -5.80031340e+02 -8.17963257e+03  1.29487768e+03
  5.47148553e+04  3.58531873e+03  2.40690210e+03 -9.63549427e+02
  0.00000000e+00 -1.67464337e+03 -2.05436459e+03  1.94204864e+03
  1.95394442e+03 -0.00000000e+00 -1.06364416e+03  8.00606774e+00
  5.93688978e+02 -0.00000000e+00 -2.30643423e+02  1.97933477e+03
  6.31615519e+02  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -6.05849873e-01]


In [None]:
# Linear regression with L2 regularization (Ridge) on the polynomial features;
# alpha is left at its default.
lr_ridge = linear_model.Ridge().fit(X_train_scaled_poly, y_train)

y_train_predict, y_test_predict = map(
    lr_ridge.predict, (X_train_scaled_poly, X_test_scaled_poly)
)

print_metrics(y_train, y_train_predict, y_test, y_test_predict)

Train R^2: 0.83864
Train MAE: 2949
Train MAPE: 30


Test R^2: 0.86334
Test MAE: 2861
Test MAPE: 31


In [None]:
# Intercept (w0) and feature weights (wn) of the Ridge model.
print(f'w0:{lr_ridge.intercept_}')
print(f'wn:{lr_ridge.coef_}')

w0:2450.9757632246
wn:[ 3369.38386245  -457.59604061  3655.82413935  3198.89989857
  2682.62197133   199.05715111  -228.717235     191.94660928
  -162.28652539  7548.51142058   581.66782381   935.50427948
  -681.52086519   663.45387027  -450.92174702   438.63623056
  1406.72601433  1974.94336458  -457.59604061   947.62267378
  -902.74408745   739.02044241  -822.63208356   302.52583364
   351.60830856  -289.09809926 -4274.11806366  2734.55631801
 44541.21377369  3667.33801864  1719.64618126 -1249.79159857
  -481.36846198 -1314.96604775 -2172.72264458  2171.5603185
  1997.18187938   112.64828731 -1082.49058663  2682.62197133
    94.23317954  -208.05994804   535.71682279  2260.73191704
   199.05715111     0.             0.             0.
  -228.717235       0.             0.           191.94660928
     0.          -162.28652539]
