In [1]:
import pandas as pd

                                                   DATA UNDERSTANDING

In [3]:
# Location of the advertising CSV. Set the ADVERTISING_CSV environment variable
# to point at your own copy of the file; when it is unset, the original author's
# local path is used so existing runs keep working unchanged.
import os

DATA_PATH = os.environ.get(
    "ADVERTISING_CSV",
    r"C:\Users\Ruchira Chaukiyal\Downloads\Advertising.csv.csv",
)

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(200, 5)

In [9]:
# Column labels of df at this point in the notebook.
df.columns

Index(['Unnamed: 0', 'TV Ad Budget ($)', 'Radio Ad Budget ($)',
       'Newspaper Ad Budget ($)', 'Sales ($)'],
      dtype='object')

In [5]:
# Per-column dtype and non-null counts, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               200 non-null    int64  
 1   TV Ad Budget ($)         200 non-null    float64
 2   Radio Ad Budget ($)      200 non-null    float64
 3   Newspaper Ad Budget ($)  200 non-null    float64
 4   Sales ($)                200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [6]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0                 0
TV Ad Budget ($)           0
Radio Ad Budget ($)        0
Newspaper Ad Budget ($)    0
Sales ($)                  0
dtype: int64

In [7]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,147.0425,23.264,30.554,14.0225
std,57.879185,85.854236,14.846809,21.778621,5.217457
min,1.0,0.7,0.0,0.3,1.6
25%,50.75,74.375,9.975,12.75,10.375
50%,100.5,149.75,22.9,25.75,12.9
75%,150.25,218.825,36.525,45.1,17.4
max,200.0,296.4,49.6,114.0,27.0


##      The dataset was already clean with no missing values or duplicates, so minimal preprocessing was required.

                                                    Data Cleaning

In [None]:
# DataFrame.drop returns a new frame, so the result must be assigned back;
# otherwise the row-number column stays in df and is later used as a model feature.
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [17]:
# Column labels of df at this point in the notebook.
df.columns

Index(['TV Ad Budget ($)', 'Radio Ad Budget ($)', 'Newspaper Ad Budget ($)',
       'Sales ($)'],
      dtype='object')

                                                Data preparation

In [18]:
# Target is the sales column; features are every other column of df.
y = df["Sales ($)"]
x = df.drop(columns=["Sales ($)"])

In [20]:
# (rows, feature columns) of the feature matrix.
x.shape

(200, 3)

In [21]:
# Length of the target series.
y.shape

(200,)

## I first validated the dataset by checking shape, data types, missing values, duplicates, and summary statistics. Since the dataset was clean, I removed the index column, separated features and target, and prepared it for modeling.

                                          Train / Test Split (Regression)

In [22]:
#We train the model on one part of the data and test it on unseen data to evaluate how well it generalizes

from sklearn.model_selection import train_test_split

In [23]:
# Plain (non-stratified) 80/20 split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Print the shape of each split, in the order: x_train, x_test, y_train, y_test.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

(160, 3)
(40, 3)
(160,)
(40,)


## I split the data into training and testing sets using an 80–20 split so that model performance is measured on unseen data, which makes overfitting detectable.

                                                      Model building

                                  Linear Regression  

In [25]:
# Linear Regression is simple, interpretable, and provides a strong baseline to compare more complex models

from sklearn.linear_model import LinearRegression

In [26]:
# Baseline model, created with scikit-learn's default settings.
lr= LinearRegression()

In [27]:
# Fit the linear model on the training split: it estimates one coefficient per
# ad-budget feature plus an intercept (a plane through the feature space, not a line).
lr.fit(X=x_train, y=y_train)

In [28]:
# Predicted sales for the held-out test rows.
y_pred_lr = lr.predict(X=x_test)

In [29]:
#Evaluate the model

from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [30]:
# Test-set metrics for Linear Regression: RMSE (square root of the MSE) and R².
lr_mse = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(lr_mse)
r2_lr = r2_score(y_test, y_pred_lr)

for label, value in (
    ("Linear Regression RMSE:", rmse_lr),
    ("Linear Regression R2:", r2_lr),
):
    print(label, value)

Linear Regression RMSE: 1.7815996615334502
Linear Regression R2: 0.899438024100912


In [None]:
# RMSE- Square root of the mean squared prediction error, in sales units (large errors weigh more than in a plain average). Lower RMSE → better model.
# R2 score- Proportion of variance in sales explained by the model.[Example:R² = 0.90 → 90% of sales variation explained]

#Metrics: RMSE ≈ 1.78 → low error relative to mean sales of ≈ 14.0...., R² ≈ 0.90 → ~90% of sales variance explained

In [33]:
# Pair each feature name with its fitted coefficient and print the table.
coef_table = {"Feature": x_train.columns, "Coefficient": lr.coef_}
coefficients = pd.DataFrame(coef_table)
print(coefficients)

                   Feature  Coefficient
0         TV Ad Budget ($)     0.044730
1      Radio Ad Budget ($)     0.189195
2  Newspaper Ad Budget ($)     0.002761


In [None]:
#Positive coefficient → increases sales
#Larger value → stronger impact

## I trained a Linear Regression model as a baseline, evaluated it using RMSE and R², and analyzed feature coefficients to understand the impact of advertising channels on sales.

## Linear Regression coefficients showed that Radio spend has a higher marginal impact per dollar compared to TV, while Newspaper spend contributes very little. This aligns with known behavior of the dataset. Note that a linear model assumes a constant return per dollar, so these coefficients cannot by themselves show diminishing returns for any channel

In [None]:
#“Which channel is most important?”
#----Coefficient magnitude shows marginal impact, but feature importance should be evaluated using standardized coefficients or tree-based models.(RF)

                                           Random Forest

In [None]:
#Linear Regression assumes linear relationships. Random Forest captures non-linear interactions and often improves performance.

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [35]:
# Random Forest with 200 trees and a fixed seed for repeatable results.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)

In [36]:
# Fit the forest on the training split.
rf.fit(X=x_train, y=y_train)

In [37]:
# Predicted sales for the held-out test rows.
y_pred_rf = rf.predict(X=x_test)

In [38]:
# Test-set metrics for Random Forest: RMSE (square root of the MSE) and R².
rf_mse = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(rf_mse)
r2_rf = r2_score(y_test, y_pred_rf)

for label, value in (
    ("Random Forest RMSE:", rmse_rf),
    ("Random Forest R2:", r2_rf),
):
    print(label, value)

Random Forest RMSE: 0.7572349907723515
Random Forest R2: 0.9818333477552758


In [39]:
# Side-by-side test metrics: both RMSE values first, then both R² values.
comparison = (
    ("Linear Regression RMSE:", rmse_lr),
    ("Random Forest RMSE:", rmse_rf),
    ("Linear Regression R2:", r2_lr),
    ("Random Forest R2:", r2_rf),
)
for label, value in comparison:
    print(label, value)

Linear Regression RMSE: 1.7815996615334502
Random Forest RMSE: 0.7572349907723515
Linear Regression R2: 0.899438024100912
Random Forest R2: 0.9818333477552758


In [None]:
#RF RMSE → substantially lower (≈0.76 vs ≈1.78)..., RF R² → clearly higher (≈0.98 vs ≈0.90)
#If RF doesn’t beat LR by much → that’s OK (not the case here: RF clearly outperforms LR on this test split)

                                                       Feature Importance

In [43]:
import pandas as pd

# Pair each feature with the forest's importance score, most important first.
feature_importance = (
    pd.DataFrame(
        {
            "Feature": x_train.columns,
            "Importance": rf.feature_importances_,
        }
    )
    .sort_values(by="Importance", ascending=False)
)

feature_importance

Unnamed: 0,Feature,Importance
0,TV Ad Budget ($),0.624727
1,Radio Ad Budget ($),0.362119
2,Newspaper Ad Budget ($),0.013153


In [None]:
# Random Forest feature importance confirmed that TV and Radio are the primary drivers of sales, while Newspaper contributes minimally.

# Built a Random Forest regression model to capture non-linear relationships and compared performance against Linear Regression using RMSE and R². Analyzed feature importance to identify key revenue drivers.

# What the model learned
# TV and Radio advertising budgets are the strongest drivers of sales
# Newspaper ads contribute very little to sales
# Linear Regression already explains ~90% of sales variance
# Random Forest confirms the same insights and captures non-linear effects

## Business takeaway (VERY IMPORTANT)
## --The company should prioritize TV and Radio advertising spend and reconsider investment in Newspaper ads to improve ROI.