## Для начала посмотрим на наши данные, нужно понять с чем мы работаем

### Подключим все необходимые библиотеки

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, accuracy_score, mean_absolute_error
from collections import Counter
import re
import nltk
#import pymorphy2
from ast import literal_eval
import gensim
import math


### Поля данных

* warehouse_id - идентификатор магазина
* product_id - идентификатор продукта
* date - дата
* quantity - кол-во продаж
* id - уникальный идентификатор строки

 

In [None]:
# Load the training table, then show its dimensions and its first 16 rows.
train_path = "../input/grocery-sales-forecast/train.csv"
train_data = pd.read_csv(train_path)
print(train_data.shape)
train_data.head(16)

### Посмотрим на количество товаров и на их id

In [None]:
# Distinct product ids of the training set in ascending order:
# print how many there are and show the five smallest.
product_id = train_data['product_id'].sort_values().unique()
n_products = len(product_id)
print(n_products)
print(product_id[:5])

### Посмотрим на тестовую выборку и сделаем некоторые выводы

In [None]:
# Load the rows to be predicted, then show their dimensions and the first 15 rows.
test_path = "../input/grocery-sales-forecast/test.csv"
test_data = pd.read_csv(test_path)
print(test_data.shape)
test_data.head(15)

### Сразу хочется посмотреть на warehouse_id и на product_id, так как при разных product_id будут проблемы


In [None]:
# Distinct product ids of the test set in ascending order.
product_test_id = test_data['product_id'].sort_values().unique()
print(len(product_test_id))
print(product_test_id[:5])
# Both arrays are sorted and de-duplicated, so they describe the same product
# set exactly when they are equal element by element.  The previous check,
# len(set(product_test_id == product_id)), printed 1 both when every id matched
# and when none did, and it was not meaningful for arrays of different length.
print(np.array_equal(product_test_id, product_id))  # True -> same products in train and test

In [None]:
# Distinct warehouse (shop) ids that appear in the test set, and their count.
warehouse_id = test_data['warehouse_id'].unique()
n_warehouses = len(warehouse_id)
print(n_warehouses)

### В нашем случае всего два магазина и все значения id товаров совпадают. Проблем с данными нет.

### Посмотрим на simple submission

In [None]:
# Load the sample submission, then show its dimensions and its first rows.
submission_path = "../input/grocery-sales-forecast/sub.csv"
simple_data = pd.read_csv(submission_path)
print(simple_data.shape)
simple_data.head()

## Разделение данных на train / test

### Итак, суть задачи ясна. Проблем пока не обнаружено, поэтому разделяем нашу train выборку и начинаем реализовывать идеи

In [None]:
# The triple-quoted string below is never executed.  It preserves a random
# train_test_split that the author marked (inside the string) as a bad way to
# split this data set.
'''
# Это плохой способ разделять данные в нашем случае!
X_train, X_test, y_train, y_test = train_test_split(train_data.drop('quantity', axis=1), train_data[['quantity']], test_size=0.1945, random_state=42)
print(X_train.head(3))
print(X_test.head(3))
print(y_train.head(3))
print(y_test.head(3))
'''

### Необходимо понимать, что в данных есть пропуски. Это видно по X_train

In [None]:
# Order the rows by date (ties broken by product id), then separate the
# target column from the remaining feature columns.
train_data = train_data.sort_values(by=['date', 'product_id'])
y_train = train_data['quantity']
X_train = train_data.drop(columns='quantity')
X_train

### Таким образом имеем отсортированные по дате и id данные. Идея деления:

*   Находим границу последних n дней
*   Делим на train / test по этой границе



In [None]:
# n = 5: 2021-04-08 minus 5 days gives 2021-04-03, the split boundary.
# Rows strictly before the boundary form the training part, the rest the
# hold-out part.
# NOTE(review): where() masks the unselected rows before dropna() removes them;
# pandas normally upcasts integer columns to float in that step, so ids coming
# out of these frames are presumably floats - confirm before changing this.
split_date = '2021-04-03'
before_split = train_data.where(train_data['date'] < split_date).dropna()
from_split = train_data.where(train_data['date'] >= split_date).dropna()

X_train = before_split.drop(columns='quantity')
y_train = before_split['quantity']
X_test = from_split.drop(columns='quantity')
y_test = from_split['quantity']

for part in (X_train, X_test, y_train, y_test):
    print(part.head(3))

## Первая идея: дни недели

### Суть идеи заключается в том, чтобы в каждом магазине найти некие зависимости в соотношении проданный товар/день недели, далее на основе этих зависимостей сделать прогноз



---





### Для начала создадим словарь: дата - день недели

In [None]:
# Map every distinct date string of the training data to its weekday name.
unique_dates = train_data['date'].unique()
data_ = pd.Series(pd.to_datetime(unique_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(unique_dates, data_week)}
print(date2week)

### Далее создадим словарь, где key = warehouse_id + product_id, а value = новый словарь с key = день недели и value = массив числа продаж

In [None]:
# Group the training quantities as
#   my_dict[str(warehouse_id) + str(product_id)][weekday name] -> list of quantities
# in the row order of X_train / y_train.
my_dict = {}
train_rows = X_train[['warehouse_id', 'product_id', 'date']].values
for (warehouse, product, day), product_quantity in tqdm(zip(train_rows, y_train)):
    my_id = str(warehouse) + str(product)
    per_weekday = my_dict.setdefault(my_id, {})
    per_weekday.setdefault(date2week[day], []).append(product_quantity)

### Теперь для тестовой выборки будем смотреть в этот словарь и искать ответ в зависимости от данных словаря

In [None]:
# Score the "mean per (shop, product, weekday)" forecast on the hold-out rows
# with three roundings of the mean (round / floor / ceil).
# nice_product counts hold-out rows whose history exists in my_dict,
# bad_product those that fall back to the default forecast of 1.
answer_round, answer_floor, answer_ceil = [], [], []
nice_product, bad_product = 0, 0
for product_info in tqdm(X_test[['warehouse_id', 'product_id', 'date']].values):
    my_id_first = str(product_info[0]) + str(product_info[1])
    my_id_second = date2week[product_info[2]]
    try:
        my_array = my_dict[my_id_first][my_id_second]
    except KeyError:
        # Only a missing history is expected here.  A bare `except:` would also
        # hide KeyboardInterrupt and genuine programming errors.
        bad_product += 1
        my_array = []
    else:
        nice_product += 1

    if not my_array:
        # No history for this shop/product/weekday: predict a single sale.
        answer_round.append(1)
        answer_floor.append(1)
        answer_ceil.append(1)
    else:
        mean_quantity = sum(my_array) / len(my_array)
        answer_round.append(round(mean_quantity))
        answer_floor.append(math.floor(mean_quantity))
        answer_ceil.append(math.ceil(mean_quantity))
print()
print(nice_product, bad_product)
print(mean_absolute_error(y_test, answer_round))
print(mean_absolute_error(y_test, answer_floor))
print(mean_absolute_error(y_test, answer_ceil))

### Показывает достаточно неплохой результат. Причём округление в большую сторону даёт лучший результат. Его и буду использовать. 

In [None]:
# Rebuild the date -> weekday-name lookup, this time for the dates of the test set.
test_dates = test_data['date'].unique()
data_ = pd.Series(pd.to_datetime(test_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(test_dates, data_week)}
print(date2week)

In [None]:
# Forecast every test row with the ceil of the weekday mean taken from my_dict.
# nice_product / bad_product count rows with / without any history.
answer_ceil = []
nice_product, bad_product = 0, 0
data_2 = pd.Series(pd.to_datetime(test_data['date'].unique()))  # not used in this cell
for product_info in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    # The ids go through float() before being stringified, presumably so that
    # they match the key format used when my_dict was built - verify.
    my_id_first = str(float(product_info[0])) + str(float(product_info[1]))
    my_id_second = date2week[product_info[2]]
    try:
        my_array = my_dict[my_id_first][my_id_second]
    except KeyError:
        # Only a missing history is expected here.  A bare `except:` would also
        # hide KeyboardInterrupt and genuine programming errors.
        bad_product += 1
        my_array = []
    else:
        nice_product += 1

    if len(my_array) < 3:
        # Bought on this weekday fewer than three times: assume no sale at all.
        answer_ceil.append(0)
    else:
        answer_ceil.append(math.ceil(sum(my_array) / len(my_array)))
print()
print(nice_product, bad_product)
print(len(answer_ceil), len(simple_data))


In [None]:
#Result_table = pd.read_csv("/content/drive/MyDrive/Sales_Forecasting/sub.csv")
#Result_table['quantity'] = pd.Series(answer_ceil)
#Result_table[["id", "quantity"]].to_csv("FINAL.csv", index=False)

### Score = 0.46886 
### Мягко говоря - ужасно...


## Выводы после первой идеи:

* Корявая обработка нулей, нужно исправлять
* На основе количества продаж в определённые дни нужно пытаться делать что-то более нетривиальное, чем просто среднее
* Необходимо изменить test / train выборку, ибо нет учёта нулей



### Начнём исправляться с заполнения нашей выборки нулями
Идея такая: если в определённый день информации о товаре нет, то значит его купили 0 раз. Это поле должно храниться в выборке.

In [None]:
# Distinct products, dates and warehouses of the training data, followed by the
# size of the full (product x date x warehouse) grid.
product_id = train_data['product_id'].unique()
date = train_data['date'].unique()
warehouse_id = train_data['warehouse_id'].unique()
n_products, n_dates, n_warehouses = len(product_id), len(date), len(warehouse_id)
print(n_products, n_dates, n_warehouses, n_products * n_dates * n_warehouses)

### Значит, у нас должно быть 874608 записей. Не будем вставлять недостающие записи в таблицу, а сразу будем составлять словарик. Он будет таким же, но более разреженным.

In [None]:
# Flat lookup of the recorded sales:
#   str(warehouse_id) + str(product_id) + str(date) -> quantity
my_current_dict = {}
sale_columns = ['warehouse_id', 'product_id', 'date', 'quantity']
for warehouse, product, day, quantity in tqdm(train_data[sale_columns].values):
    my_id = str(warehouse) + str(product) + str(day)
    my_current_dict[my_id] = quantity
print()
print(len(my_current_dict))

### Информация есть менее чем о 10% товаров.

In [None]:
# Switch the date -> weekday-name lookup back to the dates of the training data.
train_dates = train_data['date'].unique()
data_ = pd.Series(pd.to_datetime(train_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(train_dates, data_week)}
print(date2week)

In [None]:
# Dense history with explicit zeros:
#   my_huge_dict[warehouse_id][product_id][weekday name] -> list of quantities,
# one entry per date in `date`, using 0 when my_current_dict has no record for
# that (warehouse, product, date).  nice_product counts the recorded entries.
#
# dict.setdefault creates each missing level on first use; it replaces the
# hand-written "is the key already there?" branches, which repeated the same
# insert eight times.
my_huge_dict = {}
nice_product = 0
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            my_id = str(id_warehouse) + str(id_product) + str(id_date)
            if my_id in my_current_dict:
                nice_product += 1
            quantity = my_current_dict.get(my_id, 0)
            per_weekday = my_huge_dict.setdefault(id_warehouse, {}).setdefault(id_product, {})
            per_weekday.setdefault(date2week[id_date], []).append(quantity)
print()
print(nice_product)

### Проверим себя

In [None]:
# Sanity check: add up the lengths of all weekday lists in my_huge_dict,
# visiting every (warehouse, product, weekday) bucket exactly once.
amount = 0
set_amount = set()
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            weekday = date2week[id_date]
            bucket_key = str(id_warehouse) + str(id_product) + str(weekday)
            if bucket_key in set_amount:
                continue
            amount += len(my_huge_dict[id_warehouse][id_product][weekday])
            set_amount.add(bucket_key)
print()
print(amount)

### Размеры совпадают, теперь можно делить на train / test. За границу возьмём 2021-04-01, ровно 7 дней

In [None]:
# Понимаю, что проделываю такую же работу, что и пару ячеек выше. Но хочу сделать акцент на понимании кода
my_huge_dict_train = {}
my_huge_dict_test = {}
nice_product = 0
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            if id_date > '2021-04-01':
                my_id = str(id_warehouse) + str(id_product) + str(id_date)
                if my_id in my_current_dict:
                    nice_product += 1
                    if id_warehouse in my_huge_dict_test:
                        if id_product in my_huge_dict_test[id_warehouse]:
                            if date2week[id_date] in my_huge_dict_test[id_warehouse][id_product]:
                                my_huge_dict_test[id_warehouse][id_product][date2week[id_date]].append(my_current_dict[my_id])
                            else:
                                my_huge_dict_test[id_warehouse][id_product][date2week[id_date]] = [my_current_dict[my_id]]
                        else:
                            my_huge_dict_test[id_warehouse][id_product] = {}
                            my_huge_dict_test[id_warehouse][id_product][date2week[id_date]] = [my_current_dict[my_id]]
                    else:
                        my_huge_dict_test[id_warehouse] = {}
                        my_huge_dict_test[id_warehouse][id_product] = {}
                        my_huge_dict_test[id_warehouse][id_product][date2week[id_date]] = [my_current_dict[my_id]]
                else:
                    if id_warehouse in my_huge_dict_test:
                        if id_product in my_huge_dict_test[id_warehouse]:
                            if date2week[id_date] in my_huge_dict_test[id_warehouse][id_product]:
                                my_huge_dict_test[id_warehouse][id_product][date2week[id_date]].append(0)
                            else:
                                my_huge_dict_test[id_warehouse][id_product][date2week[id_date]] = [0]
                        else:
                            my_huge_dict_test[id_warehouse][id_product] = {}
                            my_huge_dict_test[id_warehouse][id_product][date2week[id_date]] = [0]
                    else:
                        my_huge_dict_test[id_warehouse] = {}
                        my_huge_dict_test[id_warehouse][id_product] = {}
                        my_huge_dict_test[id_warehouse][id_product][date2week[id_date]] = [0]
            else:
                my_id = str(id_warehouse) + str(id_product) + str(id_date)
                if my_id in my_current_dict:
                    nice_product += 1
                    if id_warehouse in my_huge_dict_train:
                        if id_product in my_huge_dict_train[id_warehouse]:
                            if date2week[id_date] in my_huge_dict_train[id_warehouse][id_product]:
                                my_huge_dict_train[id_warehouse][id_product][date2week[id_date]].append(my_current_dict[my_id])
                            else:
                                my_huge_dict_train[id_warehouse][id_product][date2week[id_date]] = [my_current_dict[my_id]]
                        else:
                            my_huge_dict_train[id_warehouse][id_product] = {}
                            my_huge_dict_train[id_warehouse][id_product][date2week[id_date]] = [my_current_dict[my_id]]
                    else:
                        my_huge_dict_train[id_warehouse] = {}
                        my_huge_dict_train[id_warehouse][id_product] = {}
                        my_huge_dict_train[id_warehouse][id_product][date2week[id_date]] = [my_current_dict[my_id]]
                else:
                    if id_warehouse in my_huge_dict_train:
                        if id_product in my_huge_dict_train[id_warehouse]:
                            if date2week[id_date] in my_huge_dict_train[id_warehouse][id_product]:
                                my_huge_dict_train[id_warehouse][id_product][date2week[id_date]].append(0)
                            else:
                                my_huge_dict_train[id_warehouse][id_product][date2week[id_date]] = [0]
                        else:
                            my_huge_dict_train[id_warehouse][id_product] = {}
                            my_huge_dict_train[id_warehouse][id_product][date2week[id_date]] = [0]
                    else:
                        my_huge_dict_train[id_warehouse] = {}
                        my_huge_dict_train[id_warehouse][id_product] = {}
                        my_huge_dict_train[id_warehouse][id_product][date2week[id_date]] = [0]
print()
print(nice_product)

In [None]:
# Count the entries stored in my_huge_dict_train, visiting every
# (warehouse, product, weekday) bucket exactly once.
amount = 0
set_amount = set()
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            weekday = date2week[id_date]
            bucket_key = str(id_warehouse) + str(id_product) + str(weekday)
            if bucket_key in set_amount:
                continue
            amount += len(my_huge_dict_train[id_warehouse][id_product][weekday])
            set_amount.add(bucket_key)
print()
print(amount)

In [None]:
# Count the distinct (warehouse, product, weekday) buckets.  The author's note
# says this equals the test size because every test bucket holds one value.
set_amount = set()
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            set_amount.add(str(id_warehouse) + str(id_product) + str(date2week[id_date]))
amount = len(set_amount)
print()
print(amount)

### Тестовая выборка занимает всего 5%. Это нужно будет учитывать!

In [None]:
# Score the weekday mean of the zero-filled training history against the
# hold-out value of the same weekday, for three roundings of the mean.
right_answer = []
answer_round, answer_floor, answer_ceil = [], [], []
set_amount = set()
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            weekday = date2week[id_date]
            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            history = my_huge_dict_train[id_warehouse][id_product][weekday]
            mean_quantity = sum(history) / len(history)
            answer_round.append(round(mean_quantity))
            answer_floor.append(math.floor(mean_quantity))
            answer_ceil.append(math.ceil(mean_quantity))
for predicted in (answer_round, answer_floor, answer_ceil):
    print(mean_absolute_error(right_answer, predicted))

### Не будем повторять ошибок. Вместо ceil возьмём floor

In [None]:
# Point the date -> weekday-name lookup at the dates of the test set.
test_dates = test_data['date'].unique()
data_ = pd.Series(pd.to_datetime(test_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(test_dates, data_week)}
print(date2week)

In [None]:
# Forecast each test row as the floor of the mean of the full zero-filled
# history for the same warehouse, product and weekday.
answer = []
for id_warehouse, id_product, id_date in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    history = my_huge_dict[id_warehouse][id_product][date2week[id_date]]
    answer.append(math.floor(sum(history) / len(history)))

print()
print(len(answer), len(simple_data))


In [None]:
#Result_table = pd.read_csv("/content/drive/MyDrive/Sales_Forecasting/sub.csv")
#Result_table['quantity'] = pd.Series(answer)
#Result_table[["id", "quantity"]].to_csv("FINAL.csv", index=False)

### Score = 0.20210
### Это уже гораздо лучше, но не предел :)

## Вторая идея: модернизация среднего



### Пока оставим идею с днями недели, но будем рассматривать не обычное среднее, а что-то поинтереснее

In [None]:
# Date -> weekday-name lookup over the training dates, used for validation.
train_dates = train_data['date'].unique()
data_ = pd.Series(pd.to_datetime(train_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(train_dates, data_week)}
print(date2week)

In [None]:
import statistics
from scipy import stats as s

# Compare several summary statistics of the training history as forecasts:
# rounded / floored / ceiled mean, the three median variants and the mode.
right_answer = []
answer_round, answer_floor, answer_ceil = [], [], []
answer_median, answer_median_low, answer_median_high = [], [], []
answer_mode = []
set_amount = set()
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            weekday = date2week[id_date]
            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            history = my_huge_dict_train[id_warehouse][id_product][weekday]

            mean_quantity = sum(history) / len(history)
            answer_round.append(round(mean_quantity))
            answer_floor.append(math.floor(mean_quantity))
            answer_ceil.append(math.ceil(mean_quantity))

            answer_median.append(statistics.median(history))
            answer_median_low.append(statistics.median_low(history))
            answer_median_high.append(statistics.median_high(history))

            answer_mode.append(int(s.mode(history)[0]))

for predicted in (answer_round, answer_floor, answer_ceil):
    print(mean_absolute_error(right_answer, predicted))

for predicted in (answer_median, answer_median_low, answer_median_high):
    print(mean_absolute_error(right_answer, predicted))

print(mean_absolute_error(right_answer, answer_mode))

### Медиана даёт неплохой результат. Но давайте попробуем нечто сложнее


In [None]:
def exponential_smoothing(series, alpha):
    """Return the last value of the simple exponential smoothing of *series*.

    The smoothed sequence starts at ``series[0]``; each further point is
    ``alpha * observation + (1 - alpha) * previous smoothed value``.  Only the
    final smoothed value is returned.  An empty *series* raises ``IndexError``.
    """
    smoothed = series[0]  # the first smoothed value equals the first observation
    for position in range(1, len(series)):
        smoothed = alpha * series[position] + (1 - alpha) * smoothed
    return smoothed

In [None]:

# Grid-search the smoothing factor alpha over 0.01 .. 0.29 and keep the value
# with the lowest hold-out MAE of the floored forecast.
#
# cur_min starts at +inf so that the best alpha is always reported; the former
# start value of 1 silently printed "1 0" whenever no error dropped below 1.
cur_min = math.inf
cur_alpha = 0

# Ground truth and training histories do not depend on alpha, so they are
# collected once instead of on each of the 29 iterations.
right_answer = []
train_histories = []
for id_warehouse in warehouse_id:
    for id_product in product_id:
        for id_date in date:
            weekday = date2week[id_date]
            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            train_histories.append(my_huge_dict_train[id_warehouse][id_product][weekday])

for i in tqdm(range(1, 30)):
    alpha = i / 100
    answer_exponential_smoothing_floor = [
        math.floor(exponential_smoothing(history, alpha)) for history in train_histories
    ]
    current_error = mean_absolute_error(right_answer, answer_exponential_smoothing_floor)
    if current_error < cur_min:
        cur_min = current_error
        cur_alpha = alpha
print(cur_min, cur_alpha)

### Сильного прироста нет, поэтому двойное экспоненциальное сглаживание использовать не имеет смысла

In [None]:
# Date -> weekday-name lookup over the test dates, used for the submission.
test_dates = test_data['date'].unique()
data_ = pd.Series(pd.to_datetime(test_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(test_dates, data_week)}
print(date2week)

In [None]:
# Forecast each test row as the floor of the exponentially smoothed full
# history (alpha = 0.14) for the same warehouse, product and weekday.
answer = []
alpha = 0.14
for id_warehouse, id_product, id_date in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    history = my_huge_dict[id_warehouse][id_product][date2week[id_date]]
    smoothed = exponential_smoothing(history, alpha)
    answer.append(math.floor(smoothed))

print()
print(len(answer), len(simple_data))

In [None]:
#Result_table = pd.read_csv("/content/drive/MyDrive/Sales_Forecasting/sub.csv")
#Result_table['quantity'] = pd.Series(answer)
#Result_table[["id", "quantity"]].to_csv("FINAL_exp.csv", index=False)

### Score = 0.20085
### Сильного роста нет. Нужно что-то сложнее...

## Третья идея: учёт соседних дней


### Хочется делать предсказание не только по всем четвергам для предсказания продаж в четверг, но и как-то учитывать соседние дни. Для этого давайте попробуем визуализировать наши данные.


In [None]:
# Collect the weekday histories of the first `number_of_products` distinct
# products met in the test rows, separately for warehouse 0 (first_warehouse)
# and for every other warehouse id (second_warehouse):
#   <dict>[product_id][weekday name] -> history list taken from my_huge_dict
first_warehouse = {}
second_warehouse = {}
number_of_products = 10
set_of_products = set()

for id_warehouse, id_product, raw_date in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    id_date = date2week[raw_date]

    # Skip products beyond the first `number_of_products` distinct ones.
    if id_product not in set_of_products and len(set_of_products) >= number_of_products:
        continue
    set_of_products.add(id_product)

    per_warehouse = first_warehouse if id_warehouse == 0 else second_warehouse
    per_weekday = per_warehouse.setdefault(id_product, {})
    if id_date not in per_weekday:
        per_weekday[id_date] = my_huge_dict[id_warehouse][id_product][id_date]

print()

for per_warehouse in (first_warehouse, second_warehouse):
    print(per_warehouse)
    print(len(per_warehouse))
    print(len(per_warehouse[71165]))

In [None]:


# Draw one panel per weekday (a 1 x 7 grid) and plot, in each panel, the sales
# history of every sampled product of the first warehouse in its own colour.
fig = plt.figure(figsize=(90, 10))

week2number = {'Friday': 0, 'Saturday': 1, 'Sunday': 2, 'Monday': 3, 'Tuesday': 4, 'Wednesday': 5, 'Thursday': 6}
product2color = {71165: 'orange', 71170: 'black', 71185: 'blue', 71215: 'green', 71220: 'yellow', 71225: 'red', 71230: 'purple', 71235: 'brown', 71285: 'pink', 71350: 'c'}
flag_print = True

# Current matplotlib returns a NEW Axes for each add_subplot() call, even with
# an identical position, so every product would land on its own overlapping
# Axes.  Each weekday Axes is therefore created once and reused.
axes_by_position = {}

for product, by_weekday in first_warehouse.items():
    for weekday, sales in by_weekday.items():
        position = 171 + week2number[weekday]
        ax = axes_by_position.get(position)
        if ax is None:
            ax = fig.add_subplot(position)
            axes_by_position[position] = ax
        # The x axis is 1..len(sales); the former fixed 19-point axis raised an
        # error for a history of any other length.
        title_x = list(range(1, len(sales) + 1))
        ax.plot(title_x, sales, color=product2color[product])
        if flag_print:
            print(weekday)  # list the weekdays once, while drawing the first product
    flag_print = False

'''
for i in second_warehouse.values():
    for j in i.items():
        position = 276 + week2number[j[0]]
        ax = fig.add_subplot(position)
        ax.plot(title_x, j[1], color='orange', label=j[0])
'''
plt.show()


### Исходя из графиков можно сделать выводы:

*   Имеется **тенденция роста**. Скорее всего это связано с тем, что чем дольше существует магазин, тем больше покупателей в него ходит. Поэтому экспоненциальное сглаживание дало небольшой прирост.
*   Есть ярко выраженные **выбросы**. Можно попробовать с ними побороться.
*   В **Saturday** люди закупаются значительно чаще, чем в остальные дни. Его в качестве соседнего дня лучше не использовать.
*   В **Sunday** и **Wednesday** люди покупают продукты подозрительно равномерно.

Все эти выводы сделаны на основе 10 первых продуктов из первого магазина. Поэтому на последний пункт закроем глаза. А вот с остальными пунктами можно поработать.




### Борьба с выбросами



In [None]:
# Date -> weekday-name lookup over the training dates, for the outlier experiments.
train_dates = train_data['date'].unique()
data_ = pd.Series(pd.to_datetime(train_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(train_dates, data_week)}
print(date2week)

In [None]:
# Repeat the statistic comparison after dropping the single largest value from
# every training history (a crude outlier filter).
right_answer = []
answer_round, answer_floor, answer_ceil = [], [], []
answer_median, answer_median_low, answer_median_high = [], [], []
answer_mode, answer_exp = [], []
threshold = 8
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            weekday = date2week[id_date]
            data = list(my_huge_dict_train[id_warehouse][id_product][weekday])

            # Index of the first occurrence of the largest positive value
            # (index 0 when no value is positive).
            local_i, local_max = 0, 0
            for position, quantity in enumerate(data):
                if quantity > local_max:
                    local_max, local_i = quantity, position

            # Remove it by overwriting with the last element and popping.
            # NOTE(review): this moves the most recent observation into the
            # removed slot, which changes the order seen by the order-dependent
            # exponential smoothing below - confirm this is intended.
            data[local_i] = data[-1]
            data.pop()

            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            mean_quantity = sum(data) / len(data)
            answer_round.append(round(mean_quantity))
            answer_floor.append(math.floor(mean_quantity))
            answer_ceil.append(math.ceil(mean_quantity))

            answer_median.append(statistics.median(data))
            answer_median_low.append(statistics.median_low(data))
            answer_median_high.append(statistics.median_high(data))

            answer_exp.append(math.floor(exponential_smoothing(data, 0.14)))

for predicted in (answer_round, answer_floor, answer_ceil):
    print(mean_absolute_error(right_answer, predicted))

for predicted in (answer_median, answer_median_low, answer_median_high):
    print(mean_absolute_error(right_answer, predicted))

print(mean_absolute_error(right_answer, answer_exp))
#print(mean_absolute_error(right_answer, answer_mode))

### Пока это лучший результат. Помогло самое обычное удаление максимума.

### Итак, теперь попробуем разбить дни на пары и пытаться делать выводы:

*   Суббота отдельно, как говорил выше
*   Пятница - воскресенье, как выходные дни
*   Понедельник - вторник
*   Среда - четверг

Зададим коэффициент, который будет понижать важность второго дня, ибо брать чисто среднее не очень круто




In [None]:
# Blend each weekday's forecast with the forecast of its partner day using the
# integer weights alpha (own day) and beta (partner day), with beta <= alpha,
# and keep the pair with the lowest hold-out MAE.  Saturday has no partner.
#
# cur_min starts at +inf so that the best pair is always reported; the former
# start value of 1 printed "1 0 0" whenever no error dropped below 1.
cur_min = math.inf
best_alpha = 0
best_beta = 0

paired_day = {
    'Friday': 'Sunday', 'Sunday': 'Friday',
    'Monday': 'Tuesday', 'Tuesday': 'Monday',
    'Wednesday': 'Thursday', 'Thursday': 'Wednesday',
}

# None of the per-row forecasts depends on alpha or beta, so they are computed
# once here instead of once per (alpha, beta) pair.
right_answer = []
day_forecasts = []  # (own-day forecast, partner-day forecast or None for Saturday)
for id_warehouse in warehouse_id:
    for id_product in product_id:
        week_history = my_huge_dict_train[id_warehouse][id_product]
        for id_date in date:
            weekday = date2week[id_date]
            data = week_history[weekday].copy()

            # Drop the first occurrence of the largest positive value (index 0
            # when none is positive) by overwriting it with the last element.
            # NOTE(review): this reorders the series seen by the smoother; it is
            # kept as is so that this cell stays consistent with the others.
            local_max = 0
            local_i = 0
            for position, quantity in enumerate(data):
                if local_max < quantity:
                    local_max = quantity
                    local_i = position
            data[local_i] = data[-1]
            data.pop()

            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            my_ans_by_day = math.floor(exponential_smoothing(data, 0.14))

            if weekday == 'Saturday':
                my_ans_by_second_day = None
            else:
                # The partner history is used in full (its maximum is not removed).
                data2 = week_history[paired_day[weekday]]
                my_ans_by_second_day = math.floor(exponential_smoothing(data2, 0.14))
            day_forecasts.append((my_ans_by_day, my_ans_by_second_day))

for alpha in tqdm(range(1, 12)):
    # beta runs over 1..10 and must not exceed alpha.
    for beta in range(1, min(alpha, 10) + 1):
        answer_exp = [
            own if partner is None else (own * alpha + partner * beta) / (alpha + beta)
            for own, partner in day_forecasts
        ]
        current_error = mean_absolute_error(right_answer, answer_exp)
        if current_error < cur_min:
            best_alpha = alpha
            best_beta = beta
            cur_min = current_error
print()
print(cur_min, best_alpha, best_beta)
            #print(mean_absolute_error(right_answer, answer_mode))

### Мда... Ну окей давайте зальём с таким выводом

In [None]:
# Date -> weekday-name lookup over the test dates, for the final submission.
test_dates = test_data['date'].unique()
data_ = pd.Series(pd.to_datetime(test_dates))
data_week = data_.dt.day_name()
date2week = {day: weekday for day, weekday in zip(test_dates, data_week)}
print(date2week)

In [None]:
# Build the submission forecast for every test row.
# The history of the target weekday, minus its largest observation, is
# exponentially smoothed and floored. For every weekday except Saturday that
# value is then averaged (weights alpha / beta) with the floored smoothed
# forecast of a paired weekday. The weighted average itself is NOT rounded in
# this cell.
answer = []
alpha = 1  # weight of the target weekday's forecast
beta = 1   # weight of the paired weekday's forecast

# Weekdays whose forecasts are mixed together; Saturday is used on its own.
paired_weekday = {
    'Friday': 'Sunday',
    'Sunday': 'Friday',
    'Monday': 'Tuesday',
    'Tuesday': 'Monday',
    'Wednesday': 'Thursday',
    'Thursday': 'Wednesday',
}

for id_warehouse, id_product, id_date in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    weekday = date2week[id_date]
    sales_by_weekday = my_huge_dict[id_warehouse][id_product]

    history = sales_by_weekday[weekday].copy()

    # First position of the largest strictly positive observation
    # (position 0 when there is none).
    peak_value, peak_pos = 0, 0
    for pos, value in enumerate(history):
        if value > peak_value:
            peak_value, peak_pos = value, pos

    # Drop that observation by overwriting it with the last element and
    # shortening the list.
    # NOTE(review): this moves the last element into the freed slot, so the
    # order passed to exponential_smoothing changes - confirm this is intended.
    history[peak_pos] = history[-1]
    history.pop()

    day_forecast = math.floor(exponential_smoothing(history, 0.14))

    if weekday == 'Saturday':
        answer.append(day_forecast)
    elif weekday in paired_weekday:
        paired_history = sales_by_weekday[paired_weekday[weekday]].copy()
        paired_forecast = math.floor(exponential_smoothing(paired_history, 0.14))
        answer.append((day_forecast * alpha + paired_forecast * beta) / (alpha + beta))

print()
# Sanity check: one forecast per row of the sample submission is expected.
print(len(answer), len(simple_data))

In [None]:
#Result_table = pd.read_csv("/content/drive/MyDrive/Sales_Forecasting/sub.csv")
#Result_table['quantity'] = pd.Series(answer)
#Result_table[["id", "quantity"]].to_csv("FINAL_3.csv", index=False)

### Score = 0.20137
### Хуже чем прошлая попытка... Явно есть ошибка

### Забыл округлить... Давайте переберём параметры заново.

In [None]:
# Build a lookup from every distinct test-set date (exactly as it appears in
# the 'date' column) to the English name of its weekday.
_unique_dates = test_data['date'].unique()
data_ = pd.Series(pd.to_datetime(_unique_dates))
data_week = data_.dt.day_name()
date2week = {raw_date: weekday for raw_date, weekday in zip(_unique_dates, data_week)}
print(date2week)

In [None]:
# Build the submission forecast for every test row, this time flooring the
# blended value as well.
# The history of the target weekday, minus its largest observation, is
# exponentially smoothed and floored. For every weekday except Saturday that
# value is averaged (weights alpha / beta) with the floored smoothed forecast
# of a paired weekday, and the average is floored again.
answer = []
alpha = 1  # weight of the target weekday's forecast
beta = 1   # weight of the paired weekday's forecast

# Weekdays whose forecasts are mixed together; Saturday is used on its own.
paired_weekday = {
    'Friday': 'Sunday',
    'Sunday': 'Friday',
    'Monday': 'Tuesday',
    'Tuesday': 'Monday',
    'Wednesday': 'Thursday',
    'Thursday': 'Wednesday',
}

for id_warehouse, id_product, id_date in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    weekday = date2week[id_date]
    sales_by_weekday = my_huge_dict[id_warehouse][id_product]

    history = sales_by_weekday[weekday].copy()

    # First position of the largest strictly positive observation
    # (position 0 when there is none).
    peak_value, peak_pos = 0, 0
    for pos, value in enumerate(history):
        if value > peak_value:
            peak_value, peak_pos = value, pos

    # Drop that observation by overwriting it with the last element and
    # shortening the list.
    # NOTE(review): this moves the last element into the freed slot, so the
    # order passed to exponential_smoothing changes - confirm this is intended.
    history[peak_pos] = history[-1]
    history.pop()

    day_forecast = math.floor(exponential_smoothing(history, 0.14))

    if weekday == 'Saturday':
        answer.append(day_forecast)
    elif weekday in paired_weekday:
        paired_history = sales_by_weekday[paired_weekday[weekday]].copy()
        paired_forecast = math.floor(exponential_smoothing(paired_history, 0.14))
        blended = (day_forecast * alpha + paired_forecast * beta) / (alpha + beta)
        answer.append(math.floor(blended))

print()
# Sanity check: one forecast per row of the sample submission is expected.
print(len(answer), len(simple_data))

In [None]:
# Attach the forecasts to the sample submission and save the id/quantity
# columns as CSV. The list is wrapped in a Series, so values are matched to
# rows by index label.
Result_table = pd.read_csv("../input/grocery-sales-forecast/sub.csv")
Result_table = Result_table.assign(quantity=pd.Series(answer))
Result_table.loc[:, ["id", "quantity"]].to_csv("FINAL_33.csv", index=False)

### Score = 0.19921
### Потянет

## Финальная попытка усовершенствовать результат


### Пробуем 2 идеи:

*   Аккуратнее обрабатывать выбросы
*   Попробовать другую группировку дней



In [None]:
# Build a lookup from every distinct training-set date (exactly as it appears
# in the 'date' column) to the English name of its weekday.
_unique_dates = train_data['date'].unique()
data_ = pd.Series(pd.to_datetime(_unique_dates))
data_week = data_.dt.day_name()
date2week = {raw_date: weekday for raw_date, weekday in zip(_unique_dates, data_week)}
print(date2week)

In [None]:
# Compare simple per-weekday estimators with threshold-gated outlier removal.
# For every (warehouse, product, date) the `my_huge_dict_train` history of the
# matching weekday is taken; its largest value is dropped only when it exceeds
# `threshold` times the floored mean. Each estimator is then scored by MAE
# against the first entry of `my_huge_dict_test` for the same weekday
# (presumably the held-out observation - confirm against the split cell).
right_answer = []
answer_round, answer_floor, answer_ceil = [], [], []
answer_median, answer_median_low, answer_median_high = [], [], []
answer_mode, answer_exp = [], []
threshold = 1.5
for id_warehouse in warehouse_id:
    for id_product in tqdm(product_id):
        for id_date in date:
            data = my_huge_dict_train[id_warehouse][id_product][date2week[id_date]].copy()

            # First position of the largest strictly positive value (0 if none).
            local_max = 0
            local_i = 0
            for i, value in enumerate(data):
                if local_max < value:
                    local_max = value
                    local_i = i

            # Remove the peak only when it is an outlier relative to the
            # floored mean. There is deliberately no `else` branch: leaving
            # the peak in place otherwise is what makes `threshold` matter.
            if local_max > threshold * math.floor(sum(data)/len(data)):
                data[local_i] = data[-1]  # overwrite the peak with the last element
                data.pop()                # and drop the now-duplicated tail

            right_answer.append(my_huge_dict_test[id_warehouse][id_product][date2week[id_date]][0])

            mean_value = sum(data)/len(data)
            answer_round.append(round(mean_value))
            answer_floor.append(math.floor(mean_value))
            answer_ceil.append(math.ceil(mean_value))

            answer_median.append(statistics.median(data))
            answer_median_low.append(statistics.median_low(data))
            answer_median_high.append(statistics.median_high(data))

            answer_exp.append(math.floor(exponential_smoothing(data, 0.14)))

# MAE per estimator, in the order: round, floor, ceil of the mean.
print(mean_absolute_error(right_answer, answer_round))
print(mean_absolute_error(right_answer, answer_floor))
print(mean_absolute_error(right_answer, answer_ceil))

# Median, low median, high median.
print(mean_absolute_error(right_answer, answer_median))
print(mean_absolute_error(right_answer, answer_median_low))
print(mean_absolute_error(right_answer, answer_median_high))

# Floored exponential smoothing.
print(mean_absolute_error(right_answer, answer_exp))

In [None]:
# Grid search over the outlier threshold (5.3 ... 7.1 in steps of 0.1).
# For each threshold the paired-weekday forecast (Fri-Sun, Mon-Tue, Wed-Thu,
# Saturday alone) is rebuilt from `my_huge_dict_train` and scored by MAE
# against the first entry of `my_huge_dict_test` for the same weekday.
# A collects the MAE per threshold, B the thresholds themselves (for the plot).
cur_min = math.inf  # start from +inf so the first evaluated threshold always wins
alpha = 1           # weight of the target weekday's forecast
beta = 1            # weight of the paired weekday's forecast
best_threshold = None
A, B = [], []

# Weekdays whose forecasts are mixed together; Saturday is used on its own.
paired_weekday = {
    'Friday': 'Sunday',
    'Sunday': 'Friday',
    'Monday': 'Tuesday',
    'Tuesday': 'Monday',
    'Wednesday': 'Thursday',
    'Thursday': 'Wednesday',
}

for koef in tqdm(range(53, 72)):
    threshold = koef/10
    right_answer = []
    answer_exp = []
    for id_warehouse in warehouse_id:
        for id_product in product_id:
            for id_date in date:
                weekday = date2week[id_date]
                data = my_huge_dict_train[id_warehouse][id_product][weekday].copy()

                # First position of the largest strictly positive value (0 if none).
                local_max = 0
                local_i = 0
                for i, value in enumerate(data):
                    if local_max < value:
                        local_max = value
                        local_i = i

                # Drop the peak only when it exceeds threshold * floored mean.
                if local_max > threshold * math.floor(sum(data)/len(data)):
                    data[local_i] = data[-1]  # overwrite the peak with the last element
                    data.pop()                # and drop the now-duplicated tail

                right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
                my_ans_by_day = math.floor(exponential_smoothing(data, 0.14))

                if weekday == 'Saturday':
                    answer_exp.append(my_ans_by_day)
                elif weekday in paired_weekday:
                    data2 = my_huge_dict_train[id_warehouse][id_product][paired_weekday[weekday]].copy()
                    my_ans_by_second_day = math.floor(exponential_smoothing(data2, 0.14))
                    answer_exp.append(math.floor((my_ans_by_day * alpha + my_ans_by_second_day * beta) / (alpha + beta)))

    # Score this threshold once and reuse the value for bookkeeping and the plot.
    current_mae = mean_absolute_error(right_answer, answer_exp)
    if current_mae < cur_min:
        best_threshold = threshold
        cur_min = current_mae

    A.append(current_mae)
    B.append(threshold)

print()
print(cur_min, best_threshold)

# MAE as a function of the threshold.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
ax.plot(B, A, color='orange')
plt.show()

### Окей, это пока лучший результат. Оставим threshold = 5.5

### Попробуем сгруппировать другим образом:

*   Сб, вс-пн, вт-ср, чт-пт
*   Для каждого дня будем учитывать предыдущий, минуя субботу
*   Для каждого дня будем учитывать предыдущий, включая субботу

Пока 0.19406065345846368 лучший скор на наших данных


In [None]:
# Validation of the weekday grouping "Sat | Sun-Mon | Tue-Wed | Thu-Fri".
# The forecast for a weekday is the floored average of its own floored
# smoothed history and that of the other weekday in its group; Saturday is
# forecast from its own history only. The result is scored by MAE against
# the first entry of `my_huge_dict_test` for the same weekday.
cur_min = 1  # assigned but not read in this cell
alpha = 1    # weight of the target weekday's forecast
beta = 1     # weight of the grouped weekday's forecast
threshold = 5.5
right_answer = []
answer_exp = []

# The other weekday of each two-day group.
grouped_with = {
    'Friday': 'Thursday',
    'Thursday': 'Friday',
    'Monday': 'Sunday',
    'Sunday': 'Monday',
    'Wednesday': 'Tuesday',
    'Tuesday': 'Wednesday',
}

for id_warehouse in warehouse_id:
    for id_product in product_id:
        for id_date in date:
            weekday = date2week[id_date]
            history = my_huge_dict_train[id_warehouse][id_product][weekday].copy()

            # First position of the largest strictly positive value (0 if none).
            peak_value, peak_pos = 0, 0
            for pos, value in enumerate(history):
                if value > peak_value:
                    peak_value, peak_pos = value, pos

            # Drop the peak only when it exceeds threshold * floored mean.
            floored_mean = math.floor(sum(history) / len(history))
            if peak_value > threshold * floored_mean:
                history[peak_pos] = history[-1]
                history.pop()

            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            day_forecast = math.floor(exponential_smoothing(history, 0.14))

            if weekday == 'Saturday':
                answer_exp.append(day_forecast)
            elif weekday in grouped_with:
                other_history = my_huge_dict_train[id_warehouse][id_product][grouped_with[weekday]].copy()
                other_forecast = math.floor(exponential_smoothing(other_history, 0.14))
                blended = (day_forecast * alpha + other_forecast * beta) / (alpha + beta)
                answer_exp.append(math.floor(blended))

print()
print(mean_absolute_error(right_answer, answer_exp))


In [None]:
# Validation of the "blend with the previous day, skipping Saturday" scheme.
# Every weekday except Saturday is averaged with the weekday before it;
# Sunday uses Friday instead of Saturday. Saturday is forecast from its own
# history only. The result is scored by MAE against the first entry of
# `my_huge_dict_test` for the same weekday.
cur_min = 1  # assigned but not read in this cell
alpha = 1    # weight of the target weekday's forecast
beta = 1     # weight of the previous weekday's forecast
threshold = 5.5
right_answer = []
answer_exp = []

# Weekday whose forecast is blended into the key weekday's forecast.
previous_weekday = {
    'Friday': 'Thursday',
    'Thursday': 'Wednesday',
    'Monday': 'Sunday',
    'Sunday': 'Friday',
    'Wednesday': 'Tuesday',
    'Tuesday': 'Monday',
}

for id_warehouse in warehouse_id:
    for id_product in product_id:
        for id_date in date:
            weekday = date2week[id_date]
            history = my_huge_dict_train[id_warehouse][id_product][weekday].copy()

            # First position of the largest strictly positive value (0 if none).
            peak_value, peak_pos = 0, 0
            for pos, value in enumerate(history):
                if value > peak_value:
                    peak_value, peak_pos = value, pos

            # Drop the peak only when it exceeds threshold * floored mean.
            floored_mean = math.floor(sum(history) / len(history))
            if peak_value > threshold * floored_mean:
                history[peak_pos] = history[-1]
                history.pop()

            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            day_forecast = math.floor(exponential_smoothing(history, 0.14))

            if weekday == 'Saturday':
                answer_exp.append(day_forecast)
            elif weekday in previous_weekday:
                other_history = my_huge_dict_train[id_warehouse][id_product][previous_weekday[weekday]].copy()
                other_forecast = math.floor(exponential_smoothing(other_history, 0.14))
                blended = (day_forecast * alpha + other_forecast * beta) / (alpha + beta)
                answer_exp.append(math.floor(blended))

print()
print(mean_absolute_error(right_answer, answer_exp))


In [None]:
# Validation of the "blend with the previous day, Saturday included" scheme.
# Every weekday, Saturday too, is averaged with the calendar day before it.
# The result is scored by MAE against the first entry of `my_huge_dict_test`
# for the same weekday.
cur_min = 1  # assigned but not read in this cell
alpha = 1    # weight of the target weekday's forecast
beta = 1     # weight of the previous weekday's forecast
threshold = 5.5
right_answer = []
answer_exp = []

# Calendar predecessor of each weekday.
previous_weekday = {
    'Saturday': 'Friday',
    'Friday': 'Thursday',
    'Thursday': 'Wednesday',
    'Monday': 'Sunday',
    'Sunday': 'Saturday',
    'Wednesday': 'Tuesday',
    'Tuesday': 'Monday',
}

for id_warehouse in warehouse_id:
    for id_product in product_id:
        for id_date in date:
            weekday = date2week[id_date]
            history = my_huge_dict_train[id_warehouse][id_product][weekday].copy()

            # First position of the largest strictly positive value (0 if none).
            peak_value, peak_pos = 0, 0
            for pos, value in enumerate(history):
                if value > peak_value:
                    peak_value, peak_pos = value, pos

            # Drop the peak only when it exceeds threshold * floored mean.
            floored_mean = math.floor(sum(history) / len(history))
            if peak_value > threshold * floored_mean:
                history[peak_pos] = history[-1]
                history.pop()

            right_answer.append(my_huge_dict_test[id_warehouse][id_product][weekday][0])
            day_forecast = math.floor(exponential_smoothing(history, 0.14))

            if weekday in previous_weekday:
                other_history = my_huge_dict_train[id_warehouse][id_product][previous_weekday[weekday]].copy()
                other_forecast = math.floor(exponential_smoothing(other_history, 0.14))
                blended = (day_forecast * alpha + other_forecast * beta) / (alpha + beta)
                answer_exp.append(math.floor(blended))

print()
print(mean_absolute_error(right_answer, answer_exp))


### Такс, ну субботу включать всё-таки не нужно, а вот идея с предыдущим днём стрельнула. На этом и остановлюсь, ибо тестировать группы по 3 дня — это очень странная идея.

In [None]:
# Build a lookup from every distinct test-set date (exactly as it appears in
# the 'date' column) to the English name of its weekday.
_unique_dates = test_data['date'].unique()
data_ = pd.Series(pd.to_datetime(_unique_dates))
data_week = data_.dt.day_name()
date2week = {raw_date: weekday for raw_date, weekday in zip(_unique_dates, data_week)}
print(date2week)

In [None]:
# Final submission forecast: "blend with the previous day, skipping Saturday".
# For each test row the history of the target weekday is taken from
# `my_huge_dict`; its peak is dropped when it exceeds `threshold` times the
# floored mean, and the remainder is exponentially smoothed and floored.
# Every weekday except Saturday is then averaged (weights alpha / beta) with
# the floored smoothed forecast of the previous weekday (Sunday uses Friday),
# and the average is floored.
alpha = 1  # weight of the target weekday's forecast
beta = 1   # weight of the previous weekday's forecast
threshold = 5.5
answer_exp = []

# Weekday whose forecast is blended into the key weekday's forecast.
previous_weekday = {
    'Friday': 'Thursday',
    'Thursday': 'Wednesday',
    'Monday': 'Sunday',
    'Sunday': 'Friday',
    'Wednesday': 'Tuesday',
    'Tuesday': 'Monday',
}

for product_info in tqdm(test_data[['warehouse_id', 'product_id', 'date']].values):
    id_warehouse = product_info[0]
    id_product = product_info[1]
    id_date = product_info[2]
    weekday = date2week[id_date]

    data = my_huge_dict[id_warehouse][id_product][weekday].copy()

    # First position of the largest strictly positive value (0 if none).
    local_max = 0
    local_i = 0
    for i, value in enumerate(data):
        if local_max < value:
            local_max = value
            local_i = i

    # Drop the peak only when it exceeds threshold * floored mean.
    if local_max > threshold * math.floor(sum(data)/len(data)):
        data[local_i] = data[-1]  # overwrite the peak with the last element
        data.pop()                # and drop the now-duplicated tail

    my_ans_by_day = math.floor(exponential_smoothing(data, 0.14))

    if weekday == 'Saturday':
        answer_exp.append(my_ans_by_day)
    elif weekday in previous_weekday:
        # Read the second weekday from the same dictionary as the target
        # weekday (`my_huge_dict`), not from `my_huge_dict_train`, so both
        # halves of the blend are built from the same history.
        data2 = my_huge_dict[id_warehouse][id_product][previous_weekday[weekday]].copy()
        my_ans_by_second_day = math.floor(exponential_smoothing(data2, 0.14))
        answer_exp.append(math.floor((my_ans_by_day * alpha + my_ans_by_second_day * beta) / (alpha + beta)))

print()
# Sanity check: one forecast per row of the sample submission is expected.
print(len(answer_exp), len(simple_data))

In [None]:
#Result_table = pd.read_csv("/content/drive/MyDrive/Sales_Forecasting/sub.csv")
#Result_table['quantity'] = pd.Series(answer_exp)
#Result_table[["id", "quantity"]].to_csv("Last_try.csv", index=False)

### Score = 0.19935
### Хуже, чем прошлая попытка. Грустно. Но ничего нового я уже пробовать не буду, ибо попыток не осталось.