<a href="https://www.kaggle.com/code/shreeyashah/sales-prediction-multiplelinearregression?scriptVersionId=282373377" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Multiple Linear Regression - Sales Prediction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

## Importing the Dataset

In [None]:
# Read the advertising CSV into a DataFrame and preview the first rows
DATA_PATH = '/kaggle/input/advertising-dataset/Advertising.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
#Dropping the unnamed column
#Select by name rather than by position: `df.iloc[:, 1:]` re-assigned to `df`
#removes one more column every time the cell is re-run, silently discarding a
#real feature. Filtering on the "Unnamed" prefix (the name pandas gives to a
#header-less CSV column) makes the cell idempotent.
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df.head()

In [None]:
# Features are every column except the last; the target is the last column
# (expected to be Sales -- confirm against df.head()).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

## Exploratory Data Analysis

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

In [None]:
#Checking for null values: number of missing entries in each column
df.isnull().sum()

In [None]:
#Checking for duplicates: number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
#Boxplot to check for outliers (one box per feature column in X)
ax = sns.boxplot(data=X, palette='Blues')
ax.set_title('Outlier detection: Advertising spend by channel')
plt.show()

In [None]:
#Pair plot: scatter of every column against every other column
df_pg = sns.PairGrid(df).map(plt.scatter)
plt.show()

In [None]:
#Regression Plots: Sales against each advertising channel, with a fitted line
sns.pairplot(
    data=df,
    x_vars=["TV", "Radio", "Newspaper"],
    y_vars="Sales",
    kind="reg",
)
plt.show()

In [None]:
#Correlation between the features (pairwise, all columns of df)
df_corr = df.corr()
#Leave the frame as the cell's last expression so the notebook renders it as a
#formatted table instead of the plain-text dump that print() produces.
df_corr

In [None]:
#Visualising the correlation matrix as an annotated heatmap
sns.heatmap(data=df_corr, cmap='Blues', annot=True)
plt.show()

In [None]:
#Checking multicollinearity using VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

#variance_inflation_factor regresses each column on the others exactly as given
#and does not add an intercept itself. Without a constant in the design matrix
#the auxiliary regressions are forced through the origin, which inflates the VIF
#of any feature with a non-zero mean. Add the constant explicitly and report the
#VIF only for the real features.
X_vif = add_constant(X, has_constant='add')  # constant is prepended at column 0
vif_df = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [
        variance_inflation_factor(X_vif.values, i)
        for i in range(1, X_vif.shape[1])  # start at 1 to skip the constant
    ]
})
vif_df

In [None]:
#Sales distribution: histogram with a kernel density estimate overlaid
sns.displot(data=df, x='Sales', kde=True)
plt.show()

In [None]:
#Splitting data into training and test set (20% held out, fixed seed)
from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split

## Training the Model

In [None]:
import statsmodels.api as sm

# statsmodels' OLS fits an intercept only if the design matrix carries a
# constant column, so add one before fitting.
X_train = sm.add_constant(X_train)
ols = sm.OLS(endog=y_train, exog=X_train)
model = ols.fit()

In [None]:
# Regression report for the fitted OLS model
model.summary()

## Making Predictions

In [None]:
# The test design matrix needs the same constant column used during training
X_test = sm.add_constant(X_test)
y_pred = model.predict(exog=X_test)

In [None]:
# Print actual (left column) and predicted (right column) values side by side,
# rounded to two decimals in the printed output.
np.set_printoptions(precision=2)
actual_vs_predicted = np.column_stack((np.asarray(y_test), np.asarray(y_pred)))
print(actual_vs_predicted)

## Evaluation Metrics

In [None]:
from sklearn.metrics import mean_squared_error , mean_absolute_error, r2_score

# Error metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Collect the metrics into a two-column table for display
metrics_df = pd.DataFrame(
    [('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)],
    columns=['Metrics', 'Value'],
)
metrics_df

In [None]:
#Actual vs predicted plot
#Shows how close predictions are to actual values.
#A perfect model’s points lie on the diagonal line (y = x).
sns.scatterplot(x=y_test, y=y_pred)
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], color='r')  # y = x reference line over the actual range
plt.title("Actual vs Predicted")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()

In [None]:
#Distribution of residuals
#Residuals should be centered at zero and roughly normal.
residuals = y_test - y_pred  # actual minus predicted
sns.histplot(data=residuals, kde=True)
plt.title("Residuals Distribution")
plt.xlabel("Residuals")
plt.show()

In [None]:
#Residuals vs Predicted
#Checks if residuals behave randomly (ideal).
#Detects heteroscedasticity (variance pattern)
#If you see a funnel shape → model is violating constant variance assumption.
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(y=0, color='black', linestyle='--')  # zero-error reference line
plt.title("Residuals vs Predicted")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.show()

In [None]:
#Q-Q Plot of Residuals
#Checks normality assumption visually
#If points fall along the 45° line → residuals ≈ normal
#Deviations at ends indicate outliers or heavy tails
#fit=True standardizes the residuals (fitted loc/scale) before comparing them
#with standard-normal quantiles. Without it the raw residuals stay on the Sales
#scale and cannot follow the 45° line even when they are perfectly normal.
sm.qqplot(residuals, line='45', fit=True)
plt.title("Q-Q Plot of Residuals")
plt.show()