In [1]:
# Import the required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# allow plots to appear directly in the notebook
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

### 1. Explain the dataset

In [2]:
# Read the advertising CSV (expected next to this notebook) into a DataFrame and display it.
csv_path = 'Advertising.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


In [3]:
# Summary of the dataset: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   Radio       200 non-null    float64
 3   Newspaper   200 non-null    float64
 4   Sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


This dataset contains the following data

There are 200 rows and 5 columns: TV, Radio, Newspaper, Sales, and an unnamed column (`Unnamed: 0`).

The `Unnamed: 0` column only repeats the row numbers (starting from 1), so it carries no information and we can remove it.

In [4]:
# Drop the redundant row-number column; the raw frame `df` is left untouched
# and the cleaned copy is kept as `df1`.
df1 = df.drop(columns=['Unnamed: 0'])
df1

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [5]:
# Count the missing values in each column (df1.isnull() is an alias of df1.isna()).
missing_per_column = df1.isna().sum()
missing_per_column

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

As you can see, there are no missing values in the dataset, so we can move on to the questions.

### 2. Check the summary statistics and discuss the max, min, avg, median, and percentiles.

In [6]:
# Statistical summary of the data, transposed so each variable is a row
# and each statistic (count, mean, std, min, quartiles, max) is a column.
df1.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TV,200.0,147.0425,85.854236,0.7,74.375,149.75,218.825,296.4
Radio,200.0,23.264,14.846809,0.0,9.975,22.9,36.525,49.6
Newspaper,200.0,30.554,21.778621,0.3,12.75,25.75,45.1,114.0
Sales,200.0,14.0225,5.217457,1.6,10.375,12.9,17.4,27.0


This is the complete statistical summary of the data. We can also compute each statistic separately.

In [7]:
# Maximum values present in the dataset
# Use DataFrame.max() to get one maximum per column. np.max(df1) forwards
# axis=None to pandas, which on pandas >= 2.0 reduces over both axes and
# returns a single scalar instead of the per-column maxima shown here.
print('Maximum values present in the dataset : \n',df1.max())

Maximum values present in the dataset : 
 TV           296.4
Radio         49.6
Newspaper    114.0
Sales         27.0
dtype: float64


In [8]:
# Minimum values present in the dataset
# Use DataFrame.min() to get one minimum per column. np.min(df1) forwards
# axis=None to pandas, which on pandas >= 2.0 reduces over both axes and
# returns a single scalar instead of the per-column minima shown here.
print('Minimum values present in the dataset : \n',df1.min())

Minimum values present in the dataset : 
 TV           0.7
Radio        0.0
Newspaper    0.3
Sales        1.6
dtype: float64


In [9]:
# Arithmetic mean of every column.
column_means = df1.mean()
print('Average values present in the dataset : \n', column_means)

Average values present in the dataset : 
 TV           147.0425
Radio         23.2640
Newspaper     30.5540
Sales         14.0225
dtype: float64


In [10]:
# Median (50th percentile) of every column.
column_medians = df1.median()
print('Median values present in the dataset : ', column_medians)

Median values present in the dataset :  TV           149.75
Radio         22.90
Newspaper     25.75
Sales         12.90
dtype: float64


In [11]:
# Compute the q-th percentile of each column.
# A nested loop replaces twelve copy-pasted print statements; the output and its
# order are unchanged (every column at the 25th, then the 50th, then the 75th
# percentile). The 50th percentile is the same value as the median.
for q in (25, 50, 75):
    for column in ('TV', 'Radio', 'Newspaper', 'Sales'):
        print(f'{q}th percentile of {column}: ', np.percentile(df1[column], q))

25th percentile of TV:  74.375
25th percentile of Radio:  9.975
25th percentile of Newspaper:  12.75
25th percentile of Sales:  10.375
50th percentile of TV:  149.75
50th percentile of Radio:  22.9
50th percentile of Newspaper:  25.75
50th percentile of Sales:  12.9
75th percentile of TV:  218.825
75th percentile of Radio:  36.525
75th percentile of Newspaper:  45.1
75th percentile of Sales:  17.4


In [12]:
# Same quartiles in one call: DataFrame.quantile takes q in [0, 1]
# where np.percentile takes q in [0, 100].
quartile_levels = [0.25, 0.5, 0.75]
df1.quantile(quartile_levels)

Unnamed: 0,TV,Radio,Newspaper,Sales
0.25,74.375,9.975,12.75,10.375
0.5,149.75,22.9,25.75,12.9
0.75,218.825,36.525,45.1,17.4


### 3. The manager wants to find out whether the same amount was spent for the three advertisements (TV , Radio and Newspaper). Comment on your findings.

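Here we have three independent variables (TV, Radio and Newspaper) alongside the target variable (Sales), and we want to compare the mean amount spent on each of the three. Hence we use a one-way ANOVA. Note that one-way ANOVA assumes the groups are independent samples, whereas here the three amounts are recorded on the same 200 rows, so the result should be read with that caveat in mind.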

- H0 (Null Hypothesis): The mean amounts spent on TV, Radio and Newspaper advertising are all the same
- H1 (Alternate Hypothesis): The mean amounts spent on TV, Radio and Newspaper advertising are not all the same

In [13]:
# first we need to import the libraries for one way ANOVA which is present in the scipy.stat model
import scipy.stats as stats

In [14]:
# Find the f_statistic and p_value for a one-way ANOVA across the three spend columns.
# The columns are read from df1, the cleaned frame used by every other analysis
# cell, instead of the raw df; the three columns themselves are the same.
# NOTE(review): f_oneway treats its samples as independent groups, but these are
# three columns measured on the same rows; confirm this test is the intended one.
f_statistic, p_value = stats.f_oneway(df1['TV'], df1['Radio'], df1['Newspaper'])

In [15]:
# Report both ANOVA outputs rounded to five decimal places.
# A p-value printed as 0.0 is below the rounding precision, not exactly zero.
for label, value in (('f_statistic=', f_statistic), ('p_value=', p_value)):
    print(label, round(value, 5))

f_statistic= 358.85146
p_value= 0.0


In [16]:
# Decide at the 5% significance level and print the matching conclusion.
# NOTE(review): the messages are kept verbatim so they match the recorded output;
# strictly, a large p-value means we fail to reject H0 rather than accept it.
reject_null = p_value < 0.05
print(
    'Reject the null hypothesis and we can say that there is a significant difference between the mean of the amount spent for the advertisement on TV , Radio and Newspaper are not the same'
    if reject_null
    else 'Accept the null hypothesis and we can say that there is no significant difference between the mean of the amount spent for the advertisement on TV , Radio and Newspaper are the same'
)

Reject the null hypothesis and we can say that there is a significant difference between the mean of the amount spent for the advertisement on TV , Radio and Newspaper are not the same
