In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge


# Raise pandas' display limits (row count, per-cell text width and overall
# output width) for everything shown later in the notebook.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_colwidth', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Reference tables used by the lookup transformers below. The id column of
# each table is cast to object dtype and fully duplicated rows are dropped.
product = (
    pd.read_csv('products.csv')
    .astype({"product_id": "object"})
    .drop_duplicates()
)
store = (
    pd.read_csv('stores.csv')
    .astype({"store_id": "object"})
    .drop_duplicates()
)

In [None]:
def get_stats(X):
    """Summarise one sales history as ten max/min/mean/std features.

    Parameters
    ----------
    X : str or sequence of numbers
        The sales history, given either as a real sequence or as the text of
        a Python list/tuple literal such as ``"[3, 0, 5]"``.

    Returns
    -------
    pandas.Series
        Values labelled ``max_14, min_14, max_7, min_7, mean_14, mean_7,
        mean_3, std_14, std_7, std_3`` (in that order).  The ``_14``
        statistics use all of X, the ``_7`` statistics use ``X[7:]`` and the
        ``_3`` statistics use ``X[11:]``.

    Raises
    ------
    ValueError, SyntaxError
        If X is a string that is not a plain Python literal.

    Notes
    -----
    The slices are positional, so the ``_7`` / ``_3`` labels only mean
    "last 7" / "last 3" values when X holds exactly 14 entries.
    NOTE(review): presumably every history has 14 entries - confirm.
    """
    # Function-scope import keeps this helper self-contained.
    import ast

    if isinstance(X, str):
        # The text comes straight from a data file.  literal_eval only
        # accepts plain literals (numbers, lists, tuples, ...), whereas the
        # eval() used previously would execute any expression in the cell.
        values = ast.literal_eval(X)
    else:
        # Already a sequence: no need to round-trip through its repr.
        values = list(X)

    from_7 = values[7:]
    from_11 = values[11:]

    features = {
        'max_14': np.max(values),
        'min_14': np.min(values),
        'max_7': np.max(from_7),
        'min_7': np.min(from_7),
        'mean_14': np.mean(values),
        'mean_7': np.mean(from_7),
        'mean_3': np.mean(from_11),
        'std_14': np.std(values),
        'std_7': np.std(from_7),
        'std_3': np.std(from_11),
    }
    return pd.Series(features)


class Stats_translate(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each sales history into summary statistics.

    ``base`` is stored on the instance but is not read by ``fit`` or
    ``transform``.
    """

    def __init__(self, base=np.exp(1)):
        super().__init__()
        self.base = base

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Apply ``get_stats`` to every entry of ``X.sales``.

        Returns a DataFrame with one row per entry, whose columns are the
        labels of the Series produced by ``get_stats``.
        """
        per_row_stats = list(map(get_stats, X.sales))
        return pd.DataFrame(per_row_stats)



def transform_store_data(store_id):
    """Return the ``region`` and ``type`` columns of the matching ``store`` rows.

    ``store_id`` is converted with ``int()`` before being compared with the
    ``store_id`` column of the module-level ``store`` table.  The result is a
    DataFrame holding every matching row (none if the id is not present).
    """
    wanted_id = int(store_id)
    is_match = store["store_id"] == wanted_id
    return store.loc[is_match, ['region', 'type']]


class Store_translate(BaseEstimator, TransformerMixin):
    """Stateless transformer that swaps store ids for store attributes.

    ``base`` is stored on the instance but is not read by ``fit`` or
    ``transform``.
    """

    def __init__(self, base=np.exp(1)):
        super().__init__()
        self.base = base

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Look up every id in ``X.store_id`` and stack the results.

        Each id is resolved with ``transform_store_data`` and the resulting
        frames are concatenated in input order.

        NOTE(review): an id without a match contributes no rows and an id
        with several matching rows contributes several, so the output is
        row-aligned with X only if every id matches exactly one row of
        ``store`` - confirm that this holds for the data.
        """
        frames = []
        for single_id in X.store_id:
            frames.append(transform_store_data(single_id))
        return pd.concat(frames)


    
def transform_product_data(product_id):
    """Return ``product_type`` and ``manufacturer`` of the matching ``product`` rows.

    ``product_id`` is converted with ``int()`` before being compared with the
    ``product_id`` column of the module-level ``product`` table.  The result
    is a DataFrame holding every matching row (none if the id is not present).
    """
    wanted_id = int(product_id)
    is_match = product["product_id"] == wanted_id
    return product.loc[is_match, ["product_type", "manufacturer"]]


class Product_translate(BaseEstimator, TransformerMixin):
    """Stateless transformer that swaps product ids for product attributes.

    ``base`` is stored on the instance but is not read by ``fit`` or
    ``transform``.
    """

    def __init__(self, base=np.exp(1)):
        super().__init__()
        self.base = base

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Look up every id in ``X.product_id`` and stack the results.

        Each id is resolved with ``transform_product_data`` and the resulting
        frames are concatenated in input order.

        NOTE(review): an id without a match contributes no rows and an id
        with several matching rows contributes several, so the output is
        row-aligned with X only if every id matches exactly one row of
        ``product`` - confirm that this holds for the data.
        """
        frames = []
        for single_id in X.product_id:
            frames.append(transform_product_data(single_id))
        return pd.concat(frames)

In [None]:
# Fixed category vocabularies handed to the one-hot encoders.  In each pair
# the first list belongs to the first encoded column and the second list to
# the second encoded column.
lst_product = [
    [
        'полуфабрикаты и субпродукты',
        'ветчина',
        'сосиски',
        'колбасы',
        'паштеты',
        'рулеты',
        'сар',
        'сардельки',
        'копчености',
    ],
    ['галерея вкуса', 'ферма', 'виталюр', 'производство'],
]

lst_store = [
    ['Minsk', 'Mogilev', 'Minsk_reg', 'Grodno', 'Vitebsk', 'Brest'],
    ['supermarket', 'minimarket'],
]

# --- product branch: id -> (product_type, manufacturer) [-> one-hot] -------
product_ohe = OneHotEncoder(categories=lst_product)
product_ohe.fit(product[["product_type", "manufacturer"]])

product_pipeline = Pipeline(steps=[
    ('translate_product', Product_translate()),
    ("ohe", product_ohe),
])

# Same lookup, but the attributes are passed on without encoding.
product_pipeline_no_ohe = Pipeline(steps=[
    ('translate_product', Product_translate()),
])

# --- store branch: id -> (region, type) [-> one-hot] -----------------------
store_ohe = OneHotEncoder(categories=lst_store)
store_ohe.fit(store[["region", "type"]])

store_pipeline = Pipeline(steps=[
    ('translate_store', Store_translate()),
    ("ohe", store_ohe),
])

# Same lookup, but the attributes are passed on without encoding.
store_pipeline_no_ohe = Pipeline(steps=[
    ('translate_store', Store_translate()),
])

# --- sales branch: history -> summary statistics ---------------------------
translate_pipeline = Pipeline(steps=[
    ('translate', Stats_translate()),
])

# Full preprocessing with one-hot encoded store and product attributes.
preprocessor = ColumnTransformer([
    ('get_store', store_pipeline, ["store_id"]),
    ('get_product', product_pipeline, ["product_id"]),
    ('stats', translate_pipeline, ['sales']),
])

# Variant that keeps the store and product attributes un-encoded.
preprocessor_no_ohe = ColumnTransformer([
    ('get_store', store_pipeline_no_ohe, ["store_id"]),
    ('get_product', product_pipeline_no_ohe, ["product_id"]),
    ('stats', translate_pipeline, ['sales']),
])



In [None]:
# Read the modelling table, discard the "Unnamed: 0" column, then split it
# into the feature frame X and the target column y.
df = pd.read_csv("Ядерный_чемоданчик_clear.csv").drop(columns="Unnamed: 0")
X = df.drop(columns='target')
y = df['target']

# Mean

In [None]:
class MeanEstimator(BaseEstimator):
    """Baseline with nothing to learn: a scaled mean of the trailing columns.

    For every row the prediction is the mean of the last ``window`` columns
    multiplied by ``horizon``.  The defaults (14 and 7) reproduce the values
    that used to be hard-coded.

    Parameters
    ----------
    window : int, default=14
        Number of trailing columns averaged per row.  If the input has fewer
        columns, all of them are used.
    horizon : int or float, default=7
        Factor the row mean is multiplied by.
    """

    def __init__(self, window=14, horizon=7):
        self.window = window
        self.horizon = horizon

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Returning the estimator (rather than ``None``) follows the
        scikit-learn estimator convention and allows ``fit(...).predict(...)``.
        """
        return self

    def predict(self, X, y=None):
        """Return ``mean(last `window` columns) * horizon`` for each row of X.

        ``y`` is accepted for backward compatibility and ignored; it is
        optional because ``Pipeline.predict`` calls ``predict(X)`` only.

        X may be a 2-D array, a DataFrame or a scipy sparse matrix.  The
        result is a 1-D float array with one value per row (empty when X has
        no rows).
        """
        # Sparse matrices expose toarray(); anything else goes through asarray.
        matrix = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
        # Clamp at 0 so inputs narrower than `window` use all their columns.
        first_col = max(matrix.shape[1] - self.window, 0)
        tail = np.asarray(matrix[:, first_col:], dtype=float)
        # A vectorised mean also handles zero-row input, which
        # np.apply_along_axis rejects.
        return tail.mean(axis=1) * self.horizon

    def fit_predict(self, X, y=None):
        """Fit (a no-op) and return the predictions for X."""
        return self.fit(X, y).predict(X)
    

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as mape

# Baseline model: the shared preprocessor followed by the mean estimator.
# NOTE(review): MeanEstimator averages the last 14 columns it receives, but
# the final 'stats' branch of `preprocessor` emits the 10 summary columns of
# get_stats, so the trailing 14 columns also include encoded product columns
# rather than 14 raw sales values - confirm this is the intended input.
mean_pipeline = Pipeline([
    ('pp', preprocessor),
    ('mean', MeanEstimator()),
])
