In [2]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df['sex'] = df['sex'].apply(lambda x: 1*(x=='male'))
df['smoker'] = df['smoker'].apply(lambda x: 1*(x=='yes'))
df = pd.get_dummies(df)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,False,False,False,True
1,18,1,33.77,1,0,1725.5523,False,False,True,False
2,28,1,33.0,3,0,4449.462,False,False,True,False
3,33,1,22.705,0,0,21984.47061,False,True,False,False
4,32,1,28.88,0,0,3866.8552,False,True,False,False


In [4]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import *
def print_metrics(y_train, y_train_predict, y_test, y_test_predict):
    print('Train R^2: {:.3f}'.format(metrics.r2_score(y_train, y_train_predict)))
    print('Train MAE: {:.0f}'.format(metrics.mean_absolute_error(y_train, y_train_predict)))
    print('Train MAPE: {:.0f}'.format(metrics.mean_absolute_percentage_error(y_train, y_train_predict)*100))
    print('\n')
    print('Test R^2: {:.3f}'.format(metrics.r2_score(y_test, y_test_predict)))
    print('Test MAE: {:.0f}'.format(metrics.mean_absolute_error(y_test, y_test_predict)))
    print('Test MAPE: {:.0f}'.format(metrics.mean_absolute_percentage_error(y_test, y_test_predict)*100))

features = df.drop('charges', axis=1).columns
X, y = df[features], df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print('Train:', X_train.shape)
print('Test:', X_test.shape)

scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_scaled)
X_train_scaled_poly = poly.transform(X_train_scaled)
X_test_scaled_poly = poly.transform(X_test_scaled)
print(X_train_scaled_poly.shape)

Train: (1070, 9)
Test: (268, 9)
(1070, 54)


In [5]:
#Задание 1
lr_model = linear_model.LinearRegression()
lr_model.fit(X_train_scaled_poly,y_train)
y_train_predict = lr_model.predict(X_train_scaled_poly)
y_test_predict = lr_model.predict(X_test_scaled_poly)

print_metrics(y_train, y_train_predict, y_test, y_test_predict)
#Нет переобучения, модель работает хорошо

Train R^2: 0.841
Train MAE: 2892
Train MAPE: 29


Test R^2: 0.865
Test MAE: 2750
Test MAPE: 30


In [6]:
#Задание 2
print(lr_model.coef_)
print("Свободный член",lr_model.intercept_)
#Некоторые коэффициенты занулены и не вносят вклад в предсказание, степени коэффициентов так же сильно разнятся откуда следует что не все вносят значимый вклад,
# только те, у которых большая степень

[-9.91313536e+15  1.39768961e+16  4.41625289e+16  1.47526021e+16
 -5.22184762e+16  7.52469545e+16 -5.31445010e+16 -7.14539120e+16
  1.63452895e+17  8.20800000e+03  9.92000000e+02  1.03200000e+03
 -1.77600000e+03  1.52000000e+02  9.91313536e+15  9.91313536e+15
  9.91313536e+15  9.91313536e+15  4.80247972e+16  7.98000000e+02
 -1.46400000e+03  3.44000000e+02 -6.20016933e+16 -6.20016933e+16
 -6.20016933e+16 -6.20016933e+16 -9.25600000e+03  1.43200000e+03
  5.46360000e+04 -4.41625289e+16 -4.41625289e+16 -4.41625289e+16
 -4.41625289e+16 -1.87200000e+03 -2.19600000e+03 -1.47526021e+16
 -1.47526021e+16 -1.47526021e+16 -1.47526021e+16 -4.86554529e+16
  1.00873929e+17  1.00873929e+17  1.00873929e+17  1.00873929e+17
 -1.56740163e+17  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -2.83487077e+16  0.00000000e+00  0.00000000e+00 -1.00392967e+16
  0.00000000e+00 -2.44946103e+17]
Свободный член 8.149320868507432e+16


In [7]:
#Задание 3

lasso_lr_poly = linear_model.Lasso(max_iter=2000)

lasso_lr_poly.fit(X_train_scaled_poly, y_train)

y_train_predict_poly = lasso_lr_poly.predict(X_train_scaled_poly)

y_test_predict_poly = lasso_lr_poly.predict(X_test_scaled_poly)

print_metrics(y_train, y_train_predict, y_test, y_test_predict)
#Метод Lasso никак не поменял значение метрик

Train R^2: 0.841
Train MAE: 2892
Train MAPE: 29


Test R^2: 0.865
Test MAE: 2750
Test MAPE: 30


In [8]:
#Задание 4

ridge_lr_poly = linear_model.Ridge(max_iter=2000)

ridge_lr_poly.fit(X_train_scaled_poly, y_train)

y_train_predict_poly = ridge_lr_poly.predict(X_train_scaled_poly)

y_test_predict_poly = ridge_lr_poly.predict(X_test_scaled_poly)

print_metrics(y_train, y_train_predict, y_test, y_test_predict)
print(ridge_lr_poly.coef_)
print(ridge_lr_poly.intercept_)
#Коэффициенты как и свободный член другие, но метрики те же

Train R^2: 0.841
Train MAE: 2892
Train MAPE: 29


Test R^2: 0.865
Test MAE: 2750
Test MAPE: 30
[ 3369.38386245  -457.59604062  3655.82413935  3198.89989857
  2682.62197132   199.05715111  -228.717235     191.94660928
  -162.28652539  7548.51142058   581.66782381   935.50427947
  -681.52086519   663.45387027  -450.92174702   438.63623056
  1406.72601433  1974.94336458  -457.59604062   947.62267378
  -902.74408745   739.02044241  -822.63208356   302.52583365
   351.60830856  -289.09809925 -4274.11806366  2734.55631801
 44541.21377369  3667.33801864  1719.64618126 -1249.79159858
  -481.36846198 -1314.96604775 -2172.72264458  2171.5603185
  1997.18187938   112.64828731 -1082.49058663  2682.62197132
    94.23317955  -208.05994802   535.7168228   2260.73191705
   199.05715111     0.             0.             0.
  -228.717235       0.             0.           191.94660928
     0.          -162.28652539]
2450.9757632248748


In [9]:
#Задание 5
names = ['r2','alpha']
i =0
alpha_list = np.linspace(1, 1000)
r2 = pd.DataFrame(columns=names)
for a in alpha_list:
    lasso_lr_poly = linear_model.Lasso(alpha=a)
    lasso_lr_poly.fit(X_train_scaled_poly, y_train)
    y_test_predict_poly = lasso_lr_poly.predict(X_test_scaled_poly)
    r2.loc[i] = [metrics.r2_score(y_test, y_test_predict_poly), a]
    i+=1
r2.max()
    

  model = cd_fast.enet_coordinate_descent(


r2          0.866834
alpha    1000.000000
dtype: float64

In [10]:
names = ['r2','alpha']
i =0
alpha_list = np.linspace(1, 1000,1000)

r2 = pd.DataFrame(columns=names)
for a in alpha_list:
    ridge_lr_poly = linear_model.Ridge(max_iter=2000)
    ridge_lr_poly.fit(X_train_scaled_poly, y_train)
    y_test_predict_poly = ridge_lr_poly.predict(X_test_scaled_poly)
    r2.loc[i] = [metrics.r2_score(y_test, y_test_predict_poly), a]
    i+=1
r2.head()

Unnamed: 0,r2,alpha
0,0.863338,1.0
1,0.863338,2.0
2,0.863338,3.0
3,0.863338,4.0
4,0.863338,5.0
