<a href="https://colab.research.google.com/github/Saranya003/Final_Project/blob/main/Project_2_Item_demand_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Accurate forecasting of demand can help the manufacturers to maintain appropriate stock which results in reduction in loss due to product not being sold and also reduces the opportunity cost (i.e. higher demand but less availability => opportunity lost)**


**Data fields**

- **date** - Date of the sale data. There are no holiday effects or store closures.
- **store** - Store ID
- **item** - Item ID
- **sales** - Number of items sold at a particular store on a particular date.


In this project, the goal is to forecast 3-month sales for 50 different products in 10 different stores when given 5 years of store item sales data.

In [None]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from time import time

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset
dataset = pd.read_csv("item.csv")
dataset

### **DATA CLEANING**







In [None]:
dataset.info()

DATA HAS NO MISSING AND ZERO NULL VALUES FOR ALL COLUMNS SO NO NEED OF IMPUTE AND DROP THE DATA

In [None]:
#change the date column datatype object to datetime 
from datetime import datetime, timedelta, date
dataset['date'] = pd.to_datetime(dataset['date'])

In [None]:
dataset.info()

**Understanding Dataset**

In [None]:
dataset.isnull().sum() # no null values and no duplicate rows

date     0
store    0
item     0
sales    0
dtype: int64

In [None]:
dataset['store'].unique()
#dataset.store.unique()


In [None]:
dataset.store.nunique()

10

In [None]:
dataset.groupby(["store"]).agg({"sales": ["count","sum", "mean", "median", "std", "min", "max"]})

In [None]:
dataset.item.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [None]:
dataset.item.nunique()

50

In [None]:
dataset.groupby(["item"]).agg({"sales": ["count","sum", "mean", "median", "std", "min", "max"]})

**Outliers**

In [None]:
dataset.describe()

In [None]:
iqr = dataset['store'].quantile(0.75) - dataset['store'].quantile(0.25)
upper_threshold = dataset['store'].quantile(0.75) + (1.5 * iqr)
lower_threshold = dataset['store'].quantile(0.25) - (1.5 * iqr)
upper_threshold, lower_threshold


(15.5, -4.5)

In [None]:
import plotly.express as px
from matplotlib.pyplot import figure

#sns.boxplot(dataset['store'])
fig = px.box(dataset["store"])
fig.show()


AS WE OBSERVED THE GRAPH THEIR  IS NO OUTLIERS IN THE STORE


In [None]:
iqr = dataset['item'].quantile(0.75) - dataset['item'].quantile(0.25)
upper_threshold = dataset['item'].quantile(0.75) + (1.5 * iqr)
lower_threshold = dataset['item'].quantile(0.25) - (1.5 * iqr)
upper_threshold, lower_threshold


(75.5, -24.5)

In [None]:
#sns.boxplot(dataset['item'])
fig = px.box(dataset["item"])
fig.show()

AS WE OBSERVED THE GRAPH THEIR IS NO OUTLIERS IN THE ITEM

In [None]:
iqr = dataset['sales'].quantile(0.75) - dataset['sales'].quantile(0.25)
upper_threshold = dataset['sales'].quantile(0.75) + (1.5 * iqr)
lower_threshold = dataset['sales'].quantile(0.25) - (1.5 * iqr)
upper_threshold, lower_threshold


(130.0, -30.0)

In [None]:
fig = px.box(dataset["sales"])
fig.show()

AS WE OBSERVED THE GRAPH THEIR  IS  OUTLIERS IN THE SALES NEED TO CLIP 



In [None]:
dataset['sales'] = dataset['sales'].clip(upper_threshold,lower_threshold)
dataset.sales
fig = px.box(dataset["sales"])
fig.show()

AFTER CLIPPING NO OUTLIERS FOUND






In [None]:
dataset.describe()

In [None]:
plt.subplots(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.distplot(dataset['sales'])
  
plt.subplot(1, 2, 2)
sns.boxplot(dataset['sales'])
plt.show()

### **TASK JAR**

In [None]:
dataset.plot(x='item', y='sales', style='o')
plt.title('item vs sales')
plt.xlabel('item')
plt.ylabel('sales')
plt.show()

In [None]:
dataset.plot(x='store', y='sales', style='o')
plt.title('store vs sales')
plt.xlabel('store')
plt.ylabel('sales')
plt.show()

In [None]:
dataset.plot(x='date', y='sales', style='o')
plt.title('date vs sales')
plt.xlabel('date')
plt.ylabel('sales')
plt.show()

In [1]:
#Sales Data Per Item
sales_by_item = dataset.groupby('item')['sales'].sum().reset_index()
sales_by_item

NameError: ignored

In [None]:
fig, ax = plt.subplots(figsize=(20,8))
sns.barplot(sales_by_item.item, sales_by_item.sales, order=sales_by_item.sort_values('sales', ascending = False).item)
#ax.set(xlabel = "Item Id", ylabel = "Sum of Sales", title = "Total Sales Per Item")
    

In [None]:
#Sales Data Per Store
sales_by_store = dataset.groupby('store')['sales'].sum().reset_index()

In [None]:
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(sales_by_store.store, sales_by_store.sales, order=sales_by_store.sort_values('sales',ascending = False).store)
#ax.set(xlabel = "Store Id", ylabel = "Sum of Sales", title = "Total Sales Per Store")
   

In [None]:
figure(figsize=(18, 4), dpi=80)
item_daily = dataset.groupby(["date","store"],as_index=False).agg({"sales":"sum"})

item_daily['date'] = pd.to_datetime(item_daily.date, format='%Y/%m/%d')
item_1 = item_daily[item_daily['store']==1]
ax_2 = sns.scatterplot(data=item_1,x='date',y='sales')
ax_2.set_ylabel("Sales/store")

In [None]:
figure(figsize=(18, 4), dpi=80)
item_daily = dataset.groupby(["date","item"],as_index=False).agg({"sales":"sum"})

item_daily['date'] = pd.to_datetime(item_daily.date, format='%Y/%m/%d')
item_1 = item_daily[item_daily['item']==1]
ax_2 = sns.scatterplot(data=item_1,x='date',y='sales')
ax_2.set_ylabel("Sales/Item_1")

**Feature Engineering**

In [None]:
# Convert the date column to a datetime object
dataset['date'] = pd.to_datetime(dataset['date'])

In [None]:
# Create new columns for year, month, and day
dataset['year'] = dataset['date'].dt.year
dataset['month'] = dataset['date'].dt.month
dataset['day'] = dataset['date'].dt.day

In [None]:
from datetime import datetime
import calendar
      
def weekend_or_weekday(year,month,day):
      
    d = datetime(year,month,day)
    if d.weekday()>4:
        return 1
    else:
        return 0
  
dataset['weekend'] = dataset.apply(lambda x:weekend_or_weekday(x['year'], x['month'], x['day']), axis=1)
dataset.head()

In [None]:
def which_day(year, month, day):
      
    d = datetime(year,month,day)
    return d.weekday()
  
dataset['weekday'] = dataset.apply(lambda x: which_day(x['year'],x['month'],x['day']),axis=1)
dataset.head()

In [None]:
from datetime import date
import holidays
def is_holiday(x):
    india_holidays = holidays.country_holidays('IN')
    if india_holidays.get(x):
      return 1
    else:
      return 0
dataset['holidays'] = dataset['date'].apply(is_holiday)
dataset.head()

In [None]:
features = ['store', 'year', 'month',\
            'weekday', 'weekend','holidays' ]
  
plt.subplots(figsize=(20, 10))
for i, col in enumerate(features):
    plt.subplot(2, 3, i + 1)
    dataset.groupby(col).mean()['sales'].plot.bar()
plt.show()

In [None]:
item_i = dataset[dataset['item']==3]
item_i

In [None]:
k = item_i.groupby(['date','item'])
item_1 = k.agg(sum)
item_1=item_1.reset_index()
j=[]
for i in range(89, len(item_1)):
    b = item_1['date'][0+i] # 0 is the starting date and 0+i is the end date
    j.append(b)
item = item_1.head(1737) # doubt
item['end']=j    # doubt
date_list = dataset['date'].to_list()
d =[]
for i in range(1737):
     r = item.loc[i, 'end']
     a = date_list.index(r)
     c =item_1.loc[i:a,'sales'].sum()
     d.append(c)
item['total'] = d
item['date'] = pd.to_datetime(item['date'])
item['year'] = item['date'].dt.year
item['month'] = item['date'].dt.month
item['day'] = item['date'].dt.day

In [None]:
item

**Split the Data**

In [None]:
x =  item.loc[:,['year','month','day']].values
y = item.loc[:,'total'].values


***Train and Test the Data***

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.25)


**Scale the Data**

In [None]:
from sklearn.preprocessing import StandardScaler ## standrard scalig 
scaler = StandardScaler() #initialise to a variable
scaler.fit(x_train) # we are finding the values of mean and sd from the td
x_train = scaler.transform(x_train) # fit (mean, sd) and then transform the training data
x_test= scaler.transform(x_test) # transform the test data 

### **MODEL**

 **Linear Regression**

In [None]:
linear =LinearRegression()
linear.fit(x_train,y_train)
print('score for Linear Regression:',linear.score(x_test,y_test))

score for Linear Regression: 0.44510958153315994


**Decision Tree Regressor**

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

for depth in [1,2,3,4,5,6,7,8,9,10,20,40,60]:
  dt = DecisionTreeRegressor(max_depth=depth)
  dt.fit(x_train,y_train)
  trainAccuracy = r2_score(y_train,dt.predict(x_train))
  dt = DecisionTreeRegressor(max_depth = depth)
  valAccuracy = cross_val_score(dt, x_train, y_train, cv=10, scoring = make_scorer(r2_score))
  print("Depth:",depth,'Train R2:',trainAccuracy,'Val Score:',np.mean(valAccuracy))
dt = DecisionTreeRegressor(max_depth = int(input('max depth value')))
dt.fit(x_train,y_train)
#print('score for Decision Treeregressor:',dt.score(x_test,y_test))


**KNeighborsRegressor**

In [None]:
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
for i in [1,2,3,4,5,6,7,8,9,10,20,50]:
  knn = KNeighborsRegressor(i)
  knn.fit(x_train,y_train)
  print('k value:',i ,'train score:',knn.score(x_train,y_train),'cv score:',np.mean(cross_val_score(knn, x_train, y_train, cv=10, scoring = make_scorer(r2_score))))
knn =KNeighborsRegressor(int(input('enter k values:')))
knn.fit(x_train,y_train)
#print('score for knn regression :',knn.score(x_test,y_test))

In [None]:
!pip install -U scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**XGBRegressor**

In [None]:
from xgboost import XGBRegressor
import xgboost as xgb

for lr in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.2,0.5,0.7,1]:
  model = xgb.XGBRegressor(learning_rate = lr,n_estimators =100,verbosity =0)#initialise the model
  model.fit(x_train,y_train)
  model.score(x_test,y_test)
  print('Learning rate:',lr,"Train score",model.score(x_train,y_train),'Cross-Val score:',np.mean(cross_val_score(model,x_train,y_train,cv=10)))
model = xgb.XGBRegressor(learning_rate = float(input('LR value')),n_estimators =100) 
model.fit(x_train,y_train)

print('score for the XGBRegressor:',model.score(x_test,y_test))

**Accuracy**

In [None]:
print('Score for the Linear Regressor       :',linear.score(x_test,y_test))    
print('Score for the Decision TreeRegressor :',dt.score(x_test,y_test))    
print('Score for the KNN Regressor          :',knn.score(x_test,y_test)) 
print('Score for the XGBRegressor           :',model.score(x_test,y_test))  
#print('score for the Random Forest:',RF.score(x_test,y_test))

Score for the Linear Regressor       : 0.44510958153315994
Score for the Decision TreeRegressor : 0.9845198675189734
Score for the KNN Regressor          : 0.9738434304408886
Score for the XGBRegressor           : 0.9988394789713089


In [None]:
linear_pred = linear.predict(x_test)
dt_pred= dt.predict(x_test)
knn_pred=knn.predict(x_test)
xgb_pred=model.predict(x_test)

In [None]:
pd.DataFrame({"Actual":y_test, "linear_pred":linear_pred,"dt_pred":dt_pred, "knn_pred":knn_pred,"xgb_pred":xgb_pred })