In [59]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the dataset directory.
for root, _subdirs, names in os.walk('../input/datastore-30-exculibar'):
    for name in names:
        print(os.path.join(root, name))


# **Preparing to start**
**1) Loading packages**


In [60]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline 

from itertools import product
import gc

from sklearn import preprocessing
import sklearn
import xgboost as xgb

**2) Additional Files**

In [61]:
from scipy.stats.stats import pearsonr  
from sklearn.model_selection import train_test_split
import math
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, median_absolute_error

In [62]:
# Load the three CSV splits of the dataset: training, test and validation.
data = pd.read_csv('../input/datastore-30-exculibar/train_data.csv')
test= pd.read_csv('../input/datastore-30-exculibar/test_data.csv')
validate= pd.read_csv('../input/datastore-30-exculibar/validation_data.csv')

# **Explore the Sales data frame**
`Doing a comprehensive exploration of this Data.`

**1) Training DataSet**

In [63]:
data.head(5)

In [64]:
# `data` is already a DataFrame (it comes from pd.read_csv), so this only wraps it in a new DataFrame object
df = pd.DataFrame(data)
  
# Count the non-null entries in every column
df.notnull().sum()

In [65]:
df.duplicated().sum()

In [66]:
df.dtypes

**2) Test DataSet**

In [67]:
test.head(5)

In [68]:
test.dtypes

**3) Validation DataSet**

In [69]:
validate.head(5)

In [70]:
validate.dtypes

In [71]:
data.columns

## How can we generate forecasts for grouped Sales Items? 

**Our training data consists of 19921 timeseries.**

`Unit sales of product x, aggregated for each Weekly Sales.`
A simple method to generate forecasts for all levels is to focus only on the bottom level. All of its predictions are then summed up to create the forecasts of all levels up to the top. This is called the bottom-up approach.

In [72]:
# Group the raw rows by (category, item). Rows are the default grouping axis, so the
# deprecated `axis` keyword is not passed.
trained= data.groupby(['CategoryCode','ItemCode'])

In [73]:
trained.head(5)

In [74]:
# Convert the month/day/year strings in DateID into pandas timestamps.
data['DateID'] = pd.to_datetime(data['DateID'], format='%m/%d/%Y')
data.head(4)

In [75]:
data.head(4)

In [76]:
dateuni=data.DateID.unique()
dateuni

## Weekly Sales for Each Item

In this code segment, we add up each item's daily sales within every week. This gives us the weekly sales per item.

In [77]:
# NOTE: this binds the name `datetime` to the module, replacing the `datetime` class
# imported at the top of the notebook.
import csv,datetime
# Maps week label -> {item code -> {"category": ..., "sales": ...}}.
# NOTE: the name shadows the built-in input() for the rest of the notebook.
input={}
# Not referenced anywhere else in the visible notebook.
weeklySales={}

def getWeek(date):
    """Return the ISO week label ``"<iso_year>_<iso_week>"`` for a date string.

    Parameters
    ----------
    date : str
        A date written as ``month/day/year`` (e.g. ``"12/30/2019"``).

    Returns
    -------
    str
        The ISO year and ISO week number joined by an underscore.

    Raises
    ------
    ValueError, IndexError
        If the string is not a valid ``month/day/year`` date.

    The ISO year is used rather than the calendar year: days around New Year
    can belong to a week of the neighbouring year (12/30/2019 is in week 1 of
    2020), and pairing the calendar year with the ISO week number would merge
    such days with an unrelated week almost a year away.
    """
    parts=date.split("/")
    month=int(parts[0])
    day=int(parts[1])
    year=int(parts[2])
    isoDate=datetime.date(year, month, day).isocalendar()
    # isocalendar() yields (ISO year, ISO week number, ISO weekday)
    return str(isoDate[0])+"_"+str(isoDate[1])

# Aggregate the raw training rows into per-week, per-item sales totals stored in `input`.
# Sales are converted to int as soon as they are read, so every stored total has the
# same type (previously the first value for a key stayed a string).
with open('../input/datastore-30-exculibar/train_data.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        itemCode=row[1]
        if itemCode == "ItemCode":
            # header row: nothing to aggregate
            continue
        category=row[0]
        week=getWeek(row[2])
        sales=int(row[3])
        # create the per-week mapping on first sight of this week
        weekItems=input.setdefault(week, {})
        if itemCode not in weekItems:
            weekItems[itemCode]={"category":category,"sales":sales}
        else:
            weekItems[itemCode]["sales"]+=sales

Output of **Segregated Items** According to *Weekly Sales Train* data Set.

In [78]:
# Flatten the nested week -> item -> record mapping into a CSV with one row per (week, item).
with open('output.csv','w' ,newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['CategoryCode','ItemCode','Week','WeeklySales'])
    for week_label, items in input.items():
        for item_code, record in items.items():
            writer.writerow([record["category"], item_code, week_label, record["sales"]])

In [79]:
outTrain= pd.read_csv('output.csv')
outTrain.head(10)

In [80]:
from sklearn.decomposition import PCA
# Remove the category column by building a new frame instead of deleting in place;
# errors="ignore" lets the cell be re-run without raising KeyError.
outTrain = outTrain.drop(columns=["CategoryCode"], errors="ignore")
outTrain.head(10)

> There are 21 different weeks. Each item has weekly sales.

In [81]:
outTrain["Week"].nunique()

> There are 194 different items. Each item has weekly sales.

In [82]:
outTrain["ItemCode"].nunique()

## Data Visualization

### 1) Weeks vs Number of Total Sales

**In order to grasp most demanded Weeks for Sales**


In [83]:
# Draw Plot
import seaborn as sns

# One box per week, showing the distribution of the per-item weekly sales in that week
plt.figure(figsize=(33,15), dpi= 100)
sns.boxplot(x='Week', y='WeeklySales', data=outTrain, notch=False)

# Add N Obs inside boxplot (optional)
def add_n_obs(outTrain,group_col,y):
    """Write the number of observations of each group next to its box on the current axes.

    Parameters
    ----------
    outTrain : pandas.DataFrame
        The data that was plotted.
    group_col : str
        Column whose values define the groups shown on the x-axis.
    y : str
        Column plotted on the y-axis; its per-group median sets the height of the text.

    Both the median and the count are looked up by tick label. The previous version
    paired the counts (in sorted group order) with the tick labels (in plot order) by
    position, which attaches counts to the wrong boxes whenever the two orders differ.
    Group keys are compared through their string form because tick labels are text.
    """
    grouped = outTrain.groupby(group_col)[y]
    medians_dict = {str(grp): value for grp, value in grouped.median().items()}
    counts_dict = {str(grp): value for grp, value in grouped.size().items()}
    for x, tick in enumerate(plt.gca().get_xticklabels()):
        xticklabel = tick.get_text()
        if xticklabel not in counts_dict:
            # tick without a matching group (e.g. an empty label): nothing to annotate
            continue
        plt.text(x, medians_dict[xticklabel]*1.01, "#obs : "+str(counts_dict[xticklabel]), horizontalalignment='center', fontdict={'size':14}, color='white')

add_n_obs(outTrain,group_col='Week',y='WeeklySales')    

# Decoration
plt.title('Weekly Sales for Items According to Weeks', fontsize=22)
# Restrict the visible y-range to 10-40; anything outside this window is not shown
plt.ylim(10, 40)
plt.show()

In [84]:
#! pip install squarify
import squarify 

### 2) Itemcode vs Demanded Items Weekly

**In order to grasp most demanded Items in Weekly Sales**

`Use of TreeMap.`


In [85]:
# Prepare Data: number of weekly records per item
df = outTrain.groupby('ItemCode').size().reset_index(name='counts')
# Build "<item>\n (<count>)" labels. Columns are addressed by name: integer keys on a
# label-indexed row (x[0], x[1]) are deprecated positional lookups in pandas.
labels = df.apply(lambda row: str(row['ItemCode']) + "\n (" + str(row['counts']) + ")", axis=1)
sizes = df['counts'].values.tolist()
# Spread the rectangles evenly over the Spectral colour map
colors = [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]

# Draw Plot
plt.figure(figsize=(12,8), dpi= 80)
squarify.plot(sizes=sizes, label=labels, color=colors, alpha=.8)

# Decorate
plt.title('Treemap of ItemCodes Occurances according to Weeks')
plt.axis('off')
plt.show()

In [86]:
# Total sales per item over all weeks. Only the numeric WeeklySales column is summed:
# summing the whole frame would also "sum" the text Week column on recent pandas
# versions, producing one long concatenated string per item.
ItemSale=outTrain.groupby('ItemCode')['WeeklySales'].sum()
ItemSale=ItemSale.reset_index()
ItemSale

In [87]:
ItemSale['ItemCode']= ItemSale['ItemCode'].astype(object)
ItemSale.dtypes

### 3) Itemcode vs Number of Total Sales Weekly

**In order to grasp the most demanded items**

In [88]:
y = ItemSale.WeeklySales
x = ItemSale['ItemCode']

# Draw Stripplot
fig, ax = plt.subplots(figsize=(16,10), dpi= 80)
# x and y are passed by keyword: current seaborn releases no longer accept them positionally
sns.stripplot(x=x, y=y, jitter=0.25, size=8, ax=ax, linewidth=.5)

# Decorations
plt.title('ItemCodes against Weekly Sales', fontsize=22)
plt.show()

## Test Data Analyzation 

In [89]:
# Remove the category column by building a new frame instead of deleting in place;
# errors="ignore" lets the cell be re-run without raising KeyError.
test = test.drop(columns=["CategoryCode"], errors="ignore")
test

In [90]:
# Add the prediction column, initially filled with the constant 0. A scalar is broadcast
# to every row, so the result does not depend on the frame having a default 0..n-1 index.
test['PredictedSales'] = 0

In [91]:
test.head(5)

In [92]:
outTrain.head(10)

## Train the Model

In [93]:
# Regression target: the weekly units sold. The notebook reports its error as a sales
# error, so the models must predict WeeklySales (previously the target was ItemCode,
# an identifier, and WeeklySales was used as an input feature).
labels = outTrain.WeeklySales
# Week labels have the form "<year>_<week>"; split them into two integer columns so
# the tree models receive purely numeric input.
weekParts = outTrain["Week"].astype(str).str.split("_", expand=True).astype(int)
Outtrain = pd.DataFrame({
    "ItemCode": outTrain["ItemCode"],
    "Year": weekParts[0],
    "WeekNo": weekParts[1],
})

In [94]:
X_train, X_test, y_train, y_test = train_test_split(Outtrain, labels, test_size=0.2, random_state = 2)

*   15 fold cross validation. Multiply by -1 to make values positive.

`Used median absolute error to learn how many sales my predictions are off by.`

In [95]:
def scoring(clf):
    """Cross-validate ``clf`` on the training split and print its median error.

    Runs 15-fold cross-validation on the notebook-level ``X_train`` / ``y_train``
    using the negated median absolute error, then prints the median of the fold
    scores with the sign flipped back to positive.
    """
    fold_scores = cross_val_score(
        clf,
        X_train,
        y_train,
        scoring='neg_median_absolute_error',
        cv=15,
        n_jobs=1,
    )
    print(-np.median(fold_scores))

### 1) Random Forest Analyzation

In [96]:
# Random forest of 55 trees with at least 3 samples per leaf; fixed seed for repeatable results.
rfr = RandomForestRegressor(n_estimators = 55,
                            min_samples_leaf = 3,
                            random_state = 2)
scoring(rfr)

### 2) Gradient Boosting Analyzation

In [97]:
# Gradient boosting with 150 stages of depth-8 trees and learning rate 0.12; fixed seed for repeatable results.
gbr = GradientBoostingRegressor(learning_rate = 0.12,
                                n_estimators = 150,
                                max_depth = 8,
                                min_samples_leaf = 1,
                                random_state = 2)
scoring(gbr)

### 3) Decision Tree Analyzation

In [98]:
# Single decision tree (depth at most 8, at least 3 samples per leaf); fixed seed for repeatable results.
dtr = DecisionTreeRegressor(min_samples_leaf = 3,
                            max_depth = 8,
                            random_state = 2)
scoring(dtr)

In [99]:
# Fit the random forest and the gradient boosting model on the training split.
rfr, gbr = rfr.fit(X_train, y_train), gbr.fit(X_train, y_train)

# Predict the held-out split with each model.
rfr_preds, gbr_preds = rfr.predict(X_test), gbr.predict(X_test)

# Blend the two predictions with fixed weights (0.47 forest, 0.53 boosting) and report the error.
final_preds = 0.47*rfr_preds + 0.53*gbr_preds
print ("Sales Predict Errors:", median_absolute_error(y_test, final_preds))

### **Final Prediction OutPut**

In [101]:
# NOTE(review): ItemSale holds per-item totals computed from the training data, not the
# models' predictions, and the frame's index is written as an extra first column —
# confirm this is the intended submission content.
ItemSale.to_csv('submission.csv')