In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import random
from sklearn.linear_model import LinearRegression
import math
from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler
from sklearn.model_selection import train_test_split
import sklearn

In [4]:
import os
import urllib
import shutil

def download_file(url, dir_path="data"):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        
    file_name = os.path.split(url)[-1]
    file_path = os.path.join(dir_path, file_name)
    
    with urllib.request.urlopen(url) as response, open(file_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
        
    return file_path

In [5]:
download_file("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv")
df_rek = pd.read_csv('data/Advertising.csv', usecols=[1,2,3,4])
df_rek.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [6]:
df_rek['log_tv'] = df_rek.TV.apply(lambda x: math.pow(x, 0.4))
df_rek.head()

Unnamed: 0,TV,radio,newspaper,sales,log_tv
0,230.1,37.8,69.2,22.1,8.805756
1,44.5,39.3,45.1,10.4,4.563983
2,17.2,45.9,69.3,9.3,3.120408
3,151.5,41.3,58.5,18.5,7.450151
4,180.8,10.8,58.4,12.9,7.996121


# Разделение дата сета на трейн и тест, в соотношении 50:50

In [33]:
rek_train, rek_test = train_test_split(df_rek, test_size=0.5, random_state=42, shuffle=True)

print("Total transactions in train dataset: ", len(rek_train))
print("Total transactions in test dataset: ", len(rek_test))

Total transactions in train dataset:  100
Total transactions in test dataset:  100


### Модель: sales ~ log_tv + radio

In [56]:
three_x_lm = smf.ols('sales ~ log_tv + radio', rek_train).fit()
print("RSS_train:", np.sum(three_x_lm.resid ** 2))
print("RSE_train:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (rek_train.shape[0] - 2 - 1))
print("R^2_train:", three_x_lm.rsquared)

RSS_train: 200.2349956038338
RSE_train: 0.1458808405554922
R^2_train: 0.9273049082016976


In [39]:
y_pred_test = three_x_lm.predict(rek_test[['log_tv', 'radio']])
y_test = rek_test['sales']

In [40]:
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (rek_test.shape[0] - 2 - 1))
R_2_test = (1 - RSS_test/TSS_test)
R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

RSS_test: 180.2527057870789
RSE_test: 1.363185723734138
R^2_test: 0.9310741259261295


### Модель: sales ~ TV + radio

In [55]:
three_x_lm = smf.ols('sales ~ TV + radio', rek_train).fit()
print("RSS_train:", np.sum(three_x_lm.resid ** 2))
print("RSE_train:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (rek_train.shape[0] - 2 - 1))
print("R^2_train:", three_x_lm.rsquared)

RSS_train: 269.7967207754174
RSE_train: 0.16933494815087202
R^2_train: 0.9020506014720117


In [45]:
y_pred_test = three_x_lm.predict(rek_test[['TV', 'radio']])
y_test = rek_test['sales']

In [46]:
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (rek_test.shape[0] - 2 - 1))
R_2_test = (1 - RSS_test/TSS_test)
R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

RSS_test: 306.90663871598935
RSE_test: 1.7787596707099642
R^2_test: 0.8826436017134699


### Модель: sales ~ TV + radio + newspaper

In [49]:
three_x_lm = smf.ols('sales ~ TV + radio + newspaper', rek_train).fit()
print("RSS_train:", np.sum(three_x_lm.resid ** 2))
print("RSE_train:", np.sqrt(np.sum(three_x_lm.resid ** 2)) / (rek_train.shape[0] - 3 - 1))
print("R^2_train:", three_x_lm.rsquared)

RSS: 263.70728347629023
RSE: 0.16915694356944733
R^2: 0.9042613648908893


In [53]:
y_pred_test = three_x_lm.predict(rek_test[['TV', 'radio', 'newspaper']])
y_test = rek_test['sales']

In [54]:
TSS_test = np.sum((y_test - y_test.mean())**2)
RSS_test = np.sum((y_test - y_pred_test)**2)
RSE_test = np.sqrt(RSS_test / (rek_test.shape[0] - 3 - 1))
R_2_test = (1 - RSS_test/TSS_test)
R_2_test2 = sklearn.metrics.r2_score(y_test, y_pred_test)

print("RSS_test:", RSS_test)
print("RSE_test:", RSE_test)
print("R^2_test:", R_2_test2)

RSS_test: 334.4786637735296
RSE_test: 1.8665885319590212
R^2_test: 0.872100481604513


# Разделение дата сета на трейн и тест, в соотношении 70:30

In [7]:
rek_train1, rek_test1 = train_test_split(df_rek, test_size=0.3, random_state=42, shuffle=True)

print("Total transactions in train dataset: ", len(rek_train1))
print("Total transactions in test dataset: ", len(rek_test1))

Total transactions in train dataset:  140
Total transactions in test dataset:  60


### Модель: sales ~ log_tv + radio

In [8]:
y = df_rek['sales']
df_1 = df_rek.drop(['TV','sales','newspaper'], axis=1)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(df_1, y, test_size=0.3, shuffle=True, random_state=42)

lm = LinearRegression()
model = lm.fit(x_train, y_train)

In [11]:
print('R^2 score for train:', model.score(x_train, y_train))
print('R^2 score for test:', model.score(x_test, y_test))

R^2 score for train: 0.9301954868019677
R^2 score for test: 0.9271446043449548


### Модель: sales ~ TV + radio

In [12]:
y = df_rek['sales']
df_2 = df_rek.drop(['log_tv','sales','newspaper'], axis=1)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(df_2, y, test_size=0.3, shuffle=True, random_state=42)

lm = LinearRegression()
model = lm.fit(x_train, y_train)

In [14]:
print('R^2 score for train:', model.score(x_train, y_train))
print('R^2 score for test:', model.score(x_test, y_test))

R^2 score for train: 0.9048377867980043
R^2 score for test: 0.8656253548947075


### Модель: sales ~ TV + radio + newspaper

In [15]:
y = df_rek['sales']
df_3 = df_rek.drop(['log_tv','sales'], axis=1)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(df_3, y, test_size=0.3, shuffle=True, random_state=42)

lm = LinearRegression()
model = lm.fit(x_train, y_train)

In [17]:
print('R^2 score for train:', model.score(x_train, y_train))
print('R^2 score for test:', model.score(x_test, y_test))

R^2 score for train: 0.9055159502227753
R^2 score for test: 0.8609466508230366


# Разделение дата сета на трейн и тест, в соотношении 80:20

### Модель: sales ~ log_tv + radio

In [18]:
y = df_rek['sales']
df_1 = df_rek.drop(['TV','sales','newspaper'], axis=1)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(df_1, y, test_size=0.2, shuffle=True, random_state=42)
#и тут до меня дошло, что разделение заранее можно не делать, ибо предыдущая строка это делает
lm = LinearRegression()
model = lm.fit(x_train, y_train)

In [20]:
print('R^2 score for train:', model.score(x_train, y_train))
print('R^2 score for test:', model.score(x_test, y_test))

R^2 score for train: 0.9276759564687794
R^2 score for test: 0.9425639093506952


### Модель: sales ~ TV + radio

In [21]:
y = df_rek['sales']
df_2 = df_rek.drop(['log_tv','sales','newspaper'], axis=1)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(df_2, y, test_size=0.2, shuffle=True, random_state=42)

lm = LinearRegression()
model = lm.fit(x_train, y_train)

In [26]:
print('R^2 score for train:', model.score(x_train, y_train))
print('R^2 score for test:', model.score(x_test, y_test))

R^2 score for train: 0.8955982149747163
R^2 score for test: 0.9005833101920355


### Модель: sales ~ TV + radio + newspaper

In [27]:
y = df_rek['sales']
df_3 = df_rek.drop(['log_tv','sales'], axis=1)

In [28]:
x_train, x_test, y_train, y_test = train_test_split(df_3, y, test_size=0.2, shuffle=True, random_state=42)
 
lm = LinearRegression()
model = lm.fit(x_train, y_train)

In [29]:
print('R^2 score for train:', model.score(x_train, y_train))
print('R^2 score for test:', model.score(x_test, y_test))

R^2 score for train: 0.8957008271017818
R^2 score for test: 0.899438024100912
