# Домашнее задание "Введение в Feature Selection"

1.Разделить дата сет на трейн и тест в отношение 50:50 70:30 80:20 (с перемешиванием)

2.Обучать наши модели на трейне. Предсказывать и замерять метрику R^2 и на трейне и на тесте

3.Проверить следующие модели, для каждого разделения: а) sales ~ log_tv + radio б) sales ~ TV + radio в) sales ~ TV + radio + newspaper

In [38]:
# Common imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
import math

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10, 5)

In [39]:
import os
import urllib
import shutil

def download_file(url, dir_path="data"):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        
    file_name = os.path.split(url)[-1]
    file_path = os.path.join(dir_path, file_name)
    
    with urllib.request.urlopen(url) as response, open(file_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
        
    return file_path

In [40]:
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler

In [41]:
download_file("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv")
adv_df = pd.read_csv('data/Advertising.csv', usecols=[1,2,3,4])
adv_df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [42]:
adv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
TV           200 non-null float64
radio        200 non-null float64
newspaper    200 non-null float64
sales        200 non-null float64
dtypes: float64(4)
memory usage: 6.3 KB


In [43]:
adv_df['log_tv'] = adv_df.TV.apply(lambda x: math.pow(x, 0.4))

In [44]:
#50/50
from sklearn.model_selection import train_test_split

adv_train_50, adv_test_50 = train_test_split(adv_df, test_size=0.5, random_state=42, shuffle=True)

print("Total transactions in train dataset: ", len(adv_train_50))
print("Total transactions in test dataset: ", len(adv_test_50))

Total transactions in train dataset:  100
Total transactions in test dataset:  100


In [87]:
#70/30

adv_train_70, adv_test_30 = train_test_split(adv_df, test_size=0.3, random_state=42, shuffle=True)

print("Total transactions in train dataset: ", len(adv_train_70))
print("Total transactions in test dataset: ", len(adv_test_30))

Total transactions in train dataset:  140
Total transactions in test dataset:  60


In [88]:
#80/20

adv_train_80, adv_test_20 = train_test_split(adv_df, test_size=0.2, random_state=42, shuffle=True)

print("Total transactions in train dataset: ", len(adv_train_80))
print("Total transactions in test dataset: ", len(adv_test_20))

Total transactions in train dataset:  160
Total transactions in test dataset:  40


# sales ~ log_tv + radio

In [56]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_train_50).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_50.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 200.23499560383388
RSE: 0.14588084055549225
R^2: 0.9273049082016976


In [68]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_50[['log_tv', 'radio']])
y_test = adv_test_50['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_50.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)
print('for sales ~ log_tv + radio:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)


for sales ~ log_tv + radio:
RSS_test: 169.91612211411694
RSE_test: 1.3235227988646843
R^2_test: 0.9350266772150859


In [54]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_train_70).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_70.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 256.44355253955246
RSE: 0.11688945268659374
R^2: 0.9301954868019677


In [66]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_30[['log_tv', 'radio']])
y_test = adv_test_30['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_30.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)
print('for sales ~ log_tv + radio:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ log_tv + radio:
RSS_test: 114.84581950563864
RSE_test: 1.4194502240992513
R^2_test: 0.929898390469717


In [63]:
three_x_lm = smf.ols('sales ~ log_tv + radio', adv_train_80).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_80.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 300.130315845859
RSE: 0.11034566599798143
R^2: 0.9276759564687794


In [65]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_20[['log_tv', 'radio']])
y_test = adv_test_20['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_20.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('for sales ~ log_tv + radio:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ log_tv + radio:
RSS_test: 72.51557727371339
RSE_test: 1.3999573089409256
R^2_test: 0.942563909350695


# sales ~ TV + radio

In [71]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_train_50).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_50.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 269.79672077541716
RSE: 0.16933494815087197
R^2: 0.9020506014720118


In [72]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_50[['radio', 'TV']])
y_test = adv_test_50['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_50.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('for sales ~ TV + radio:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ TV + radio:
RSS_test: 306.90663871598946
RSE_test: 1.7787596707099644
R^2_test: 0.8826436017134698


In [73]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_train_70).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_70.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 349.6011203718844
RSE: 0.13647900283453715
R^2: 0.9048377867980043


In [74]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_30[['radio', 'TV']])
y_test = adv_test_30['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_30.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('for sales ~ TV + radio:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ TV + radio:
RSS_test: 220.14282327184094
RSE_test: 1.9652365746247507
R^2_test: 0.8656253548947074


In [75]:
three_x_lm = smf.ols('sales ~ TV + radio', adv_train_80).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_80.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 433.2465274979225
RSE: 0.13257691007330222
R^2: 0.8955982149747163


In [77]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_20[['radio', 'TV']])
y_test = adv_test_20['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_20.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('for sales ~ TV + radio:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ TV + radio:
RSS_test: 125.51792036273409
RSE_test: 1.841840445320886
R^2_test: 0.9005833101920356


# sales ~ TV + radio + newspaper

In [78]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train_50).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_50.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 263.7072834762901
RSE: 0.1674130575532674
R^2: 0.9042613648908893


In [86]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_50[['radio', 'TV', 'newspaper']])
y_test = adv_test_50['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_50.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('for sales ~ TV + radio + newspaper:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ TV + radio + newspaper:
RSS_test: 286.0152971741764
RSE_test: 1.7171520931687978
R^2_test: 0.8906321307624936


In [80]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train_70).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_70.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 347.1097250468102
RSE: 0.13599183176626933
R^2: 0.9055159502227753


In [85]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_30[['radio', 'TV', 'newspaper']])
y_test = adv_test_30['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_30.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('sales ~ TV + radio + newspaper:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

sales ~ TV + radio + newspaper:
RSS_test: 207.17073164947735
RSE_test: 1.9064559369323264
R^2_test: 0.8735434881416676


In [82]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', adv_train_80).fit()


#расчет метрик
print("RSS:", np.sum(three_x_lm.resid ** 2))
print("RSE:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (adv_train_80.shape[0] - 2 - 1))
print("R^2:", three_x_lm.rsquared)

RSS: 432.82070769302624
RSE: 0.13251174191989148
R^2: 0.8957008271017818


In [84]:
#получение предссказания на тесте
y_pred_test = three_x_lm.predict(adv_test_20[['radio', 'TV', 'newspaper']])
y_test = adv_test_20['sales']

#рассчет метрик
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (adv_test_20.shape[0] - 2 - 1))

R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print('for sales ~ TV + radio + newspaper:')
print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

for sales ~ TV + radio + newspaper:
RSS_test: 126.96389415904423
RSE_test: 1.852419120742681
R^2_test: 0.8994380241009119
