In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the advertising dataset from a CSV file located relative to the
# notebook's working directory.
file_path = r'Advertising.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text preview of the leading rows to confirm the columns parsed.
print(df.head())

   sample index     TV  radio  newspaper  sales
0             1  230.1   37.8       69.2   22.1
1             2   44.5   39.3       45.1   10.4
2             3   17.2   45.9       69.3    9.3
3             4  151.5   41.3       58.5   18.5
4             5  180.8   10.8       58.4   12.9


In [2]:
# Column 0 is skipped, the last column is the regression target, and every
# column in between is used as a predictor.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1:]  # slicing (rather than indexing with -1) keeps y two-dimensional

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

for caption, frame in (
    ("Shape of X: ", X),
    ("Shape of y: ", y),
    ("Shape of training set: ", x_train),
):
    print(caption, frame.shape)

Shape of X:  (200, 3)
Shape of y:  (200, 1)
Shape of training set:  (160, 3)


In [14]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares linear model on the training split.
trained_data = LinearRegression()
trained_data.fit(x_train, y_train)

print(trained_data.coef_)
print(trained_data.intercept_)

# The target was passed as a single-column 2-D frame, so coef_ holds one row of
# per-feature weights and intercept_ holds one entry.
bias = trained_data.intercept_[0]
w_tv, w_radio, w_newspaper = trained_data.coef_[0]

# Spell out the fitted equation in the order the predictors were supplied.
print(
    "Y =",
    str(bias),
    "+",
    str(w_tv) + "*TV +",
    str(w_radio) + "*Radio +",
    str(w_newspaper) + "*Newspaper",
)

[[0.04472952 0.18919505 0.00276111]]
[2.97906734]
Y = 2.9790673381226256 + 0.044729517468716326*TV + 0.1891950542343766*Radio + 0.0027611143413672056*Newspaper


In [6]:
import sklearn.metrics

# Predictions of the fitted model on the held-out and the training splits.
y_predict_test = trained_data.predict(x_test)
y_predict_train = trained_data.predict(x_train)

# Residual sum of squares (RSS) per target column.
# The squared residuals form a DataFrame; reduce it with an explicit axis=0
# instead of np.sum(DataFrame), whose implicit axis=None reduction is
# deprecated in pandas and is slated to collapse both axes. The result is
# still a one-entry Series, so downstream arithmetic and np.squeeze behave as
# before.
RSS1 = np.square(y_predict_test - y_test).sum(axis=0)
print("RSS test data:", np.squeeze(RSS1))
RSS2 = np.square(y_predict_train - y_train).sum(axis=0)
print("RSS train data:", np.squeeze(RSS2))

RSS test data: 126.96389415904417
RSS train data: 432.82070769302624


In [7]:
# Residual standard error: sqrt(RSS / (n - p - 1)), where n is the number of
# observations in the split and p the number of predictors.
# The number of estimated parameters (p predictors + 1 intercept) is derived
# from the training design matrix rather than hard-coded, so the correction
# stays right if the feature set changes.
n_params = x_train.shape[1] + 1

RSE1 = np.sqrt(RSS1 / (y_test.shape[0] - n_params))
print("RSE test data: ", np.squeeze(RSE1))
RSE2 = np.sqrt(RSS2 / (y_train.shape[0] - n_params))
print("RSE train data: ", np.squeeze(RSE2))

RSE test data:  1.877970936343592
RSE train data:  1.6656805564035793


In [8]:
# Mean squared error of the fitted model, reported for the held-out split first
# and the training split second. MSE is rebound on each pass, so after this
# cell it holds the training value.
for caption, y_true, y_pred in (
    ("MSE test data: ", y_test, y_predict_test),
    ("MSE train data: ", y_train, y_predict_train),
):
    MSE = sklearn.metrics.mean_squared_error(y_true, y_pred)
    print(caption, MSE)

MSE test data:  3.174097353976104
MSE train data:  2.705129423081414


In [9]:
# Coefficient of determination for each split, test first and train second.
# R2 is rebound on each pass, so after this cell it holds the training value.
r2_inputs = {
    "R2 test data: ": (y_test, y_predict_test),
    "R2 train data: ": (y_train, y_predict_train),
}
for caption, (observed, fitted) in r2_inputs.items():
    R2 = sklearn.metrics.r2_score(observed, fitted)
    print(caption, R2)

R2 test data:  0.899438024100912
R2 train data:  0.8957008271017817


In [11]:
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own, so prepend a constant
# column to the training predictors. Despite its name, `intercept` is the full
# design matrix (constant + predictors).
intercept = sm.add_constant(x_train)

# Fit on the training split and print the regression summary table.
ols_train = sm.OLS(endog=y_train, exog=intercept)
Model = ols_train.fit()

print(Model.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.896
Model:                            OLS   Adj. R-squared:                  0.894
Method:                 Least Squares   F-statistic:                     446.6
Date:                Mon, 11 Sep 2023   Prob (F-statistic):           2.53e-76
Time:                        14:00:49   Log-Likelihood:                -306.64
No. Observations:                 160   AIC:                             621.3
Df Residuals:                     156   BIC:                             633.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9791      0.354      8.427      0.0

In [12]:
# Design matrix (constant + predictors) built from the held-out split.
# NOTE(review): this fits a second, independent OLS model on the test split and
# rebinds `intercept` and `Model`; it does not score the training-fit model on
# held-out data. Confirm that refitting here is intended.
intercept = sm.add_constant(x_test)

ols_test = sm.OLS(endog=y_test, exog=intercept)
Model = ols_test.fit()

print(Model.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.910
Model:                            OLS   Adj. R-squared:                  0.902
Method:                 Least Squares   F-statistic:                     120.8
Date:                Mon, 11 Sep 2023   Prob (F-statistic):           7.51e-19
Time:                        14:00:54   Log-Likelihood:                -77.715
No. Observations:                  40   AIC:                             163.4
Df Residuals:                      36   BIC:                             170.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.8305      0.692      4.090      0.0

In [40]:
def predict_sales(model, tv, radio, newspaper=0):
    """Predict sales for a single advertising budget.

    Parameters
    ----------
    model : fitted regressor exposing ``predict``
        Must have been fitted on the columns ``TV``, ``radio`` and
        ``newspaper`` (in that order) against a single-column 2-D target.
    tv, radio, newspaper : number
        Budget assigned to each channel; ``newspaper`` defaults to 0.

    Returns
    -------
    The scalar prediction for the one-row input.
    """
    # Build a one-row frame with the same column names/order used for fitting.
    budget = pd.DataFrame({'TV': [tv], 'radio': [radio], 'newspaper': [newspaper]})
    # predict() returns a (1, 1) array for a 2-D target; unwrap the single value.
    return model.predict(budget)[0][0]


# Compare three ways of spending a total budget of 50 across TV and radio.
print('Predicted sales for both TV and Radio = ', predict_sales(trained_data, tv=25, radio=25))
print('Predicted sales for only TV = ', predict_sales(trained_data, tv=50, radio=0))
print('Predicted sales for only Radio = ', predict_sales(trained_data, tv=0, radio=50))


Predicted sales for both TV and Radio =  8.827181630699949
Predicted sales for only TV =  5.215543211558442
Predicted sales for only Radio =  12.438820049841455
