In [52]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [53]:
# Build the synthetic sample.
# A seeded local generator makes every draw repeatable, so the weights and
# errors printed by the cells below are identical after "Restart & Run All".
SEED = 42
rng = np.random.default_rng(SEED)
n_samples = 1000

# choice(k, n) draws n integers from [0, k); the offset shifts the range.
age_owner = rng.choice(90, n_samples) + 21   # 21..110
length = rng.choice(120, n_samples) + 15     # 15..134
width = rng.choice(80, n_samples) + 10       # 10..89

# The target is built from the product length * width only;
# age_owner does not enter the formula at all.
price = length * width * 100 + 126

data = pd.DataFrame({'age_owner': age_owner, 'length': length, 'width': width, 'price': price})
data.head(5)

Unnamed: 0,age_owner,length,width,price
0,48,31,72,223326
1,92,61,48,292926
2,53,63,81,510426
3,73,46,10,46126
4,25,62,75,465126


In [54]:
from sklearn.metrics import mean_absolute_error

# Baseline: linear regression on all three raw columns of the sample.
X = data[['age_owner', 'length', 'width']]
y = data['price']

reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# In-sample error: predict on the same feature frame the model was fitted on.
pred_values = reg.predict(X)
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the printed
# value does not change, but the documented order stays correct if the metric
# is later replaced by an asymmetric one.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [  42.69766938 4861.60306924 7528.71383597]
Bias: -369978.6263091202
Error: 57096.75521899475


In [55]:
# Median of the target `y` assigned in the previous cell — presumably shown to
# give the printed MAE a sense of scale (TODO confirm intent).
y.median()

295926.0

In [56]:
# Same model without age_owner: only the two dimensions remain as features.
X = data[['length', 'width']]
y = data['price']

reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# In-sample error, reusing X rather than selecting the columns a second time.
pred_values = reg.predict(X)
# Pass arguments in the documented (y_true, y_pred) order; the MAE value is
# unaffected because the metric is symmetric.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [4859.97286225 7530.17450283]
Bias: -367155.3574811759
Error: 57129.67129544484


In [57]:
# Создаем новый признак
data['mult'] = data['length'] * data['width']
data.head(5)

Unnamed: 0,age_owner,length,width,price,mult
0,48,31,72,223326,2232
1,92,61,48,292926,2928
2,53,63,81,510426,5103
3,73,46,10,46126,460
4,25,62,75,465126,4650


In [58]:
# Fit on the single engineered feature `mult` (length * width).
X = data[['mult']]
y = data['price']

reg = LinearRegression().fit(X, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# In-sample error, reusing X rather than selecting the column a second time.
pred_values = reg.predict(X)
# Pass arguments in the documented (y_true, y_pred) order; the MAE value is
# unaffected because the metric is symmetric.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [100.]
Bias: 126.00000000011642
Error: 5.814581527374685e-11


### Домашнее задание

Будем описывать зависимость сумм на счетах вкладчика в банке от его зарплаты, возраста и количества членов семьи.

In [59]:
# Build the synthetic sample for the homework.
# A seeded local generator makes every draw repeatable, so the weights and
# errors printed by the cells below are identical after "Restart & Run All".
SEED = 42
rng = np.random.default_rng(SEED)
n_samples = 1000

# choice(k, n) draws n integers from [0, k); the offset shifts the range.
age = rng.choice(75, n_samples) + 18             # 18..92
salary = rng.choice(80000, n_samples) + 20000    # 20000..99999
relatives = rng.choice(5, n_samples) + 1         # 1..5, so never zero

# Algebraically this is 36 * salary / relatives; age does not enter the formula.
deposit = salary*18/(relatives*0.5)

data = pd.DataFrame({'age': age, 'salary': salary, 'relatives': relatives, 'deposit': deposit})
data.head()

Unnamed: 0,age,salary,relatives,deposit
0,72,78434,4,705906.0
1,81,47224,3,566688.0
2,71,47160,2,848880.0
3,80,22722,4,204498.0
4,30,47770,2,859860.0


Создаём модель линейной регрессии, основанную на всех исходных признаках.

In [60]:
from sklearn.metrics import mean_absolute_error

# Baseline: linear regression on all three raw columns of the homework sample.
X = data[['age', 'salary', 'relatives']]
y = data['deposit']

# Standardize the features; fit_transform learns the scaling parameters and
# applies them to the same data in one call.
sc = StandardScaler()
X_std = sc.fit_transform(X)

reg = LinearRegression().fit(X_std, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# In-sample error on the standardized training features.
pred_values = reg.predict(X_std)
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the printed
# value does not change, but the documented order stays correct if the metric
# is later replaced by an asymmetric one.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [ -11090.22852565  379640.89873739 -547861.42761105]
Bias: 968568.8028
Error: 290808.79578929476


Пробуем создать модель, исключив из признаков возраст вкладчика

In [61]:
# Same model with the age column dropped from the features.
X = data[['salary', 'relatives']]
y = data['deposit']

# Standardize the features; fit_transform learns the scaling parameters and
# applies them to the same data in one call.
sc = StandardScaler()
X_std = sc.fit_transform(X)

reg = LinearRegression().fit(X_std, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# In-sample error on the standardized training features.
pred_values = reg.predict(X_std)
# Pass arguments in the documented (y_true, y_pred) order; the MAE value is
# unaffected because the metric is symmetric.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [ 380013.09158191 -547296.43527178]
Bias: 968568.8028
Error: 290872.0008949115


Создаем новый признак

In [62]:
# Create a new feature: salary divided by the number of family members.
data['dev'] = data['salary'].div(data['relatives'])
data.head()

Unnamed: 0,age,salary,relatives,deposit,dev
0,72,78434,4,705906.0,19608.5
1,81,47224,3,566688.0,15741.333333
2,71,47160,2,848880.0,23580.0
3,80,22722,4,204498.0,5680.5
4,30,47770,2,859860.0,23885.0


In [63]:
# Fit on the single engineered feature `dev` (salary / relatives).
X = data[['dev']]
y = data['deposit']

# Standardize the feature; fit_transform learns the scaling parameters and
# applies them to the same data in one call.
sc = StandardScaler()
X_std = sc.fit_transform(X)

reg = LinearRegression().fit(X_std, y)
print('Weights: {}'.format(reg.coef_))
print('Bias: {}'.format(reg.intercept_))

# In-sample error on the standardized training feature.
pred_values = reg.predict(X_std)
# Pass arguments in the documented (y_true, y_pred) order; the MAE value is
# unaffected because the metric is symmetric.
print('Error: {}'.format(mean_absolute_error(y, pred_values)))

Weights: [754189.76643481]
Bias: 968568.8028000001
Error: 2.5087501853704452e-11


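Ошибка модели практически равна нулю: по построению deposit = salary*18/(relatives*0.5) = 36 * salary / relatives, то есть целевая переменная точно линейно зависит от нового признака dev, и остаётся только погрешность вычислений с плавающей точкой.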