# Sales Prediction
- Sales prediction involves forecasting the amount of a product that customers will purchase, taking into account various factors such as advertising expenditure, target audience segmentation, and advertising platform selection.

- In businesses that offer products or services, the role of a Data Scientist is crucial for predicting future sales. They utilize machine learning techniques in Python to analyze and interpret data, enabling them to make informed decisions regarding advertising costs. By leveraging these predictions, businesses can optimize their advertising strategies and maximize sales potential. Let's embark on the journey of sales prediction using machine learning in Python.



In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Data Visualisation tools
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the advertising dataset (path is relative to the current working directory)
# and preview its first rows.
sales_data = pd.read_csv("advertising.csv")
sales_data.head() 

In [None]:
# (number of rows, number of columns) of the loaded frame.
sales_data.shape

In [None]:
# Column names, non-null counts and dtypes.
sales_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns.
sales_data.describe()

### DATA CLEANING

In [None]:
# Percentage of missing values in each column: NaN count * 100 / number of rows.
# A result of 0 for every column means there is nothing to impute or drop.
sales_data.isnull().sum()*100/sales_data.shape[0]

### Examining Outliers


In [None]:
# One horizontal box plot per advertising channel, stacked vertically, to spot outliers.
outlier_columns = ['TV', 'Newspaper', 'Radio']
fig, axs = plt.subplots(len(outlier_columns), figsize=(5, 5))
for ax, column in zip(axs, outlier_columns):
    # Pass the series explicitly as `x=`: a bare positional argument is treated as
    # `x` by older seaborn releases and as `data` from 0.12 on, so the keyword keeps
    # the plot identical across versions.
    sns.boxplot(x=sales_data[column], ax=ax, orient='h')
plt.tight_layout()
plt.show()

### EXPLORATORY DATA ANALYSIS

In [None]:
# Distribution of the target column. The series is passed as `x=` so the box is
# drawn horizontally on every seaborn version (a positional argument is read as
# `x` before 0.12 and as `data` afterwards).
sns.boxplot(x=sales_data['Sales'], orient='h')
plt.show()

### Sales relation with the other variables

In [None]:
# Scatter plot of Sales against each advertising channel, one panel per channel.
sns.pairplot(
    data=sales_data,
    x_vars=['TV', 'Newspaper', 'Radio'],
    y_vars='Sales',
    kind='scatter',
    height=4,
    aspect=1,
)
plt.show()

### Correlation between variables


In [None]:
# Annotated heatmap of the pairwise correlation matrix of the columns.
sns.heatmap(
    data=sales_data.corr(),
    annot=True,
    cmap="YlGnBu",
)
plt.show()

### MODEL BUILDING - LINEAR REGRESSION 

In [None]:
from sklearn.model_selection import train_test_split

# Simple linear regression: the 'TV' column is the single predictor and
# 'Sales' is the response.
X = sales_data['TV']
y = sales_data['Sales']

# 70/30 train/test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

X_train.head()

In [None]:
# Preview the first training targets.
y_train.head()

### Building a linear model

In [None]:
import statsmodels.api as sm

In [None]:
# OLS only estimates an intercept when the design matrix carries a constant
# column, so add one to the training predictor.
X_train_sm = sm.add_constant(X_train)

# Fit the regression line by ordinary least squares.
lr = sm.OLS(endog=y_train, exog=X_train_sm).fit()

# Show the fitted parameters.
lr.params

### Summary operation listing all parameters of the regression line


In [None]:
# Print the full regression report for the fitted model.
print(lr.summary())

In [None]:
# Training points with the fitted regression line. The intercept and slope are
# read from the fitted model rather than typed in by hand, so the line stays
# correct if the data, the split or the random seed changes.
intercept, slope = lr.params['const'], lr.params['TV']
plt.scatter(X_train, y_train)
plt.plot(X_train, intercept + slope * X_train, 'r')
plt.show()

### MODEL EVALUATION


In [None]:
# Residuals on the training set: observed minus fitted values.
y_train_pred = lr.predict(exog=X_train_sm)
res = y_train - y_train_pred

# Histogram of the error terms.
fig = plt.figure()
sns.histplot(data=res, bins=15)
fig.suptitle('Error Terms', fontsize=11)             # figure heading
plt.xlabel('y_train - y_train_pred', fontsize=11)    # x-axis label
plt.show()

In [None]:
# Residuals plotted against the training predictor.
plt.scatter(x=X_train, y=res)
plt.show()

### PREDICTION ON THE TEST SET

In [None]:
# The test design matrix needs the same constant column that was used for fitting.
X_test_sm = sm.add_constant(X_test)

# Predicted y values for the held-out rows.
y_pred = lr.predict(exog=X_test_sm)
y_pred.head()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Root mean squared error of the predictions on the test set.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

#### R-SQUARED ON TEST SET


In [None]:
# Coefficient of determination of the predictions on the test set.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
r_squared

In [None]:
# Test points with the regression line fitted on the training data. The
# coefficients come from the fitted model instead of hard-coded numbers, so the
# line always matches the model that produced y_pred.
intercept, slope = lr.params['const'], lr.params['TV']
plt.scatter(X_test, y_test)
plt.plot(X_test, intercept + slope * X_test, 'r')
plt.show()