In [1]:
import gc
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import pyodbc
from nltk.stem.snowball import SnowballStemmer
from sklearn import linear_model
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw order lines from SQL Server into `orders_xsell`.
# The connection string is a raw string so the backslash in the server name is passed
# to the driver verbatim instead of being parsed as a (invalid) escape sequence.
con_str_mp = pyodbc.connect(r"driver={SQL Server};server=main_server\main_server;database=mp;trusted_connection=true")
try:
    orders_xsell = pd.read_sql_query(sql = 'SELECT * FROM [test_db].[sell].[order_sell]', con = con_str_mp)
finally:
    # Always release the connection, even when the query raises.
    con_str_mp.close()

In [None]:
# Per-day recency decay passed to prepare_xsell: each merged order row is weighted by
# DECAY_WEIGHT ** (days between its date and the most recent date in the data).
DECAY_WEIGHT = 0.995

In [4]:
def prepare_xsell(xsell_orders, decay_weight):
    """Aggregate raw order lines into one row per (main product, cross product) pair.

    Every order is self-joined on ``order_id`` so that each product bought in an
    order (the "main" product) is paired with every other product of the same
    order (the "cross" product). Pairs whose product codes are equal are dropped.

    Parameters
    ----------
    xsell_orders : pandas.DataFrame
        Order lines. Must contain the columns ``date``, ``order_id``,
        ``applestore``, ``product_id``, ``product_code``, ``product_name``,
        ``category_id``, ``category_name``, ``subcategory``, ``color``,
        ``im_brand_name``, ``order_qty``, ``order_price``, ``order_item`` and
        ``margin_plus_discount``. ``date`` may hold strings that start with
        ``YYYY-MM-DD`` or datetime-like values. The frame is not modified.
    decay_weight : float
        Per-day decay base; a pair row that is ``d`` days older than the newest
        row gets the weight ``decay_weight ** d``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(main_product_id, cross_product_id)``, ordered by
        ``main_product_code`` with a fresh ``RangeIndex``. Target columns:

        * ``poisson_target`` / ``uniq_orders`` - number of merged rows of the pair
          with a non-null ``order_id``;
        * ``gaussian_target`` - sum of the decay weights of the pair;
        * ``binomial_target`` - ``gaussian_target`` divided by the total weight;
        * ``revenue`` / ``profit`` - sums of the cross product's ``order_item``
          and ``margin_plus_discount``;
        * ``price`` / ``margin`` - the cross product's ``order_price`` and
          ``margin_plus_discount`` from the chronologically last row of the pair.

        The remaining descriptive columns (``applestore``, product codes, names,
        category and subcategory names) come from the earliest row of the pair.
    """
    product_cols = ['product_id', 'product_code', 'product_name', 'category_id',
                    'category_name', 'subcategory', 'color', 'im_brand_name']
    cross_only_cols = ['order_qty', 'order_price', 'order_item', 'margin_plus_discount']
    pair_keys = ['main_product_id', 'cross_product_id']

    # rename() without inplace returns new frames, so the caller's data is never
    # touched and no SettingWithCopyWarning is raised on the column slices.
    main_goods = xsell_orders[['date', 'order_id', 'applestore'] + product_cols].rename(
        columns={name: 'main_' + name for name in product_cols})
    cross_goods = xsell_orders[['order_id'] + product_cols + cross_only_cols].rename(
        columns={name: 'cross_' + name for name in product_cols + cross_only_cols})

    xsell = main_goods.merge(cross_goods, on='order_id', how='inner')
    # Do not recommend a product to itself.
    xsell = xsell[xsell['main_product_code'] != xsell['cross_product_code']].copy()

    # Keep only the calendar day. astype(str) lets both 'YYYY-MM-DD...' strings and
    # datetime-like values through the same vectorised parse.
    xsell['date'] = pd.to_datetime(xsell['date'].astype(str).str[:10], format='%Y-%m-%d')
    # Stable chronological order: groupby keeps row order inside each group, so
    # first()/last() below pick the earliest/latest row of every pair.
    xsell = xsell.sort_values('date', kind='mergesort')

    age_in_days = (xsell['date'].max() - xsell['date']).dt.days
    xsell['weight'] = decay_weight ** age_in_days
    xsell['profit'] = xsell['cross_margin_plus_discount']

    by_pair = xsell.groupby(pair_keys)
    xsell = pd.concat(
        [by_pair[['order_id']].count(),
         by_pair[['weight', 'cross_order_item', 'profit']].sum(),
         by_pair[['cross_order_price', 'cross_margin_plus_discount']].last(),
         by_pair[['applestore', 'main_product_code', 'main_product_name',
                  'main_category_name', 'main_subcategory', 'cross_product_code',
                  'cross_product_name', 'cross_category_name',
                  'cross_subcategory']].first()],
        axis=1, join='inner').reset_index()

    xsell['purchases'] = xsell['weight'] / xsell['weight'].sum()
    # The original call discarded the sorted frame; assign it so the ordering by
    # main product code actually takes effect (stable, so ties keep pair-id order).
    xsell = xsell.sort_values('main_product_code', kind='mergesort').reset_index(drop=True)
    xsell['uniq_orders'] = xsell['order_id']
    return xsell.rename(columns={'order_id': 'poisson_target',
                                 'weight': 'gaussian_target',
                                 'cross_order_item': 'revenue',
                                 'cross_order_price': 'price',
                                 'cross_margin_plus_discount': 'margin',
                                 'purchases': 'binomial_target'})

# NOTE(review): this rebinds `orders_xsell` from the raw order lines to the aggregated
# pair table, so the cell is not idempotent - the returned frame no longer has the raw
# columns (e.g. 'date', 'order_id'), and re-running this cell without reloading the raw
# data will fail inside prepare_xsell.
orders_xsell = prepare_xsell(orders_xsell, DECAY_WEIGHT)

In [None]:
def clean_product_name(text, letters_num = 3):
    """Normalise product names and stem every word with the Russian Snowball stemmer.

    Steps, in order: lower-case; replace parentheses with spaces; drop
    whitespace-delimited Cyrillic words of at most ``letters_num`` letters;
    replace ``/`` with a space; collapse whitespace runs to a single space;
    split on spaces, stem each token and join the tokens back with spaces.

    Parameters
    ----------
    text : pandas.Series
        Series of product-name strings.
    letters_num : int, default 3
        Maximum length of the short Cyrillic words that are removed.

    Returns
    -------
    pandas.Series
        Cleaned, stemmed names with the same index as ``text``.
    """
    stemmer = SnowballStemmer("russian")
    # The trailing whitespace is only asserted via lookahead, not consumed, so it can
    # serve as the leading whitespace of the next short word ("... и в ..." loses both).
    short_word = r"\s[а-яА-Я]{1," + str(letters_num) + r"}(?=\s)"
    # regex=True is explicit: pandas >= 2.0 treats the pattern as a literal by default,
    # which would silently turn every substitution below into a no-op.
    text = (text.str.lower()
                .str.replace(r"\(|\)", " ", regex=True)
                .str.replace(short_word, " ", regex=True)
                .str.replace("/", " ", regex=False)
                .str.replace(r"\s+", " ", regex=True)
                .str.split(" ")
                .apply(lambda words: [stemmer.stem(word) for word in words])
                .str.join(" "))
    return text

# Replace the raw product names with their cleaned, stemmed form.
orders_xsell['main_product_name'] = clean_product_name(orders_xsell['main_product_name'])
orders_xsell['cross_product_name'] = clean_product_name(orders_xsell['cross_product_name'])

# Training table: targets, store flag and the two cleaned names. rename() is chained
# (not inplace) so it never operates on a slice of `orders_xsell`, which is what
# triggered SettingWithCopyWarning before.
train_cat_data = (
    orders_xsell[['poisson_target', 'uniq_orders', 'applestore', 'main_product_name',
                  'cross_product_name']]
    .rename(columns={'main_product_name': 'cleaned_main_product',
                     'cross_product_name': 'cleaned_cross_product'})
)

In [None]:
# Passed to creat_augmented: every training row is repeated `growth` times to build the
# shuffled candidate pairs that become zero-target rows.
growth = 2
def creat_augmented(train_table, growth, random_state=None):
    """Extend the training pairs with randomly generated zero-target pairs.

    Each row of ``train_table`` is repeated ``growth`` times; the cross-product
    names are shuffled against the (unshuffled) main-product names and store
    flags to form candidate pairs. Candidates whose (main, cross) name pair
    already exists in ``train_table`` are discarded, the rest are deduplicated
    and appended with ``poisson_target = 0`` and ``uniq_orders = 1``. Finally
    rows sharing the same (cross name, main name, applestore) are summed.

    Parameters
    ----------
    train_table : pandas.DataFrame
        Must contain ``cleaned_main_product``, ``cleaned_cross_product``,
        ``applestore``, ``poisson_target`` and ``uniq_orders``. Not modified.
    growth : int
        Number of repetitions of every training row used to build candidates.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``Series.sample`` for the shuffle. ``None`` keeps the
        previous unseeded behaviour; pass a value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        One row per (cleaned_cross_product, cleaned_main_product, applestore),
        sorted by ``poisson_target`` descending, with a fresh ``RangeIndex``.
    """
    pair_cols = ['cleaned_main_product', 'cleaned_cross_product']
    group_cols = ['cleaned_cross_product', 'cleaned_main_product', 'applestore']
    n_candidates = len(train_table) * growth

    def _repeated(column):
        # Repeat every value `growth` times and discard the original index so the
        # three columns line up positionally.
        return train_table[column].repeat(growth).reset_index(drop=True)

    shuffled_cross = (train_table['cleaned_cross_product'].repeat(growth)
                      .sample(n_candidates, random_state=random_state)
                      .reset_index(drop=True))
    candidates = pd.DataFrame({
        'cleaned_cross_product': shuffled_cross,
        'cleaned_main_product': _repeated('cleaned_main_product'),
        'applestore': _repeated('applestore'),
    })
    candidates['poisson_target'] = 0
    candidates['uniq_orders'] = 1

    # Anti-join: keep only candidates whose name pair was never observed. The right
    # side is deduplicated first so the merge cannot multiply candidate rows.
    observed_pairs = train_table[pair_cols].drop_duplicates()
    candidates = candidates.merge(observed_pairs, how='left', on=pair_cols, indicator=True)
    candidates = (candidates.loc[candidates['_merge'] == 'left_only']
                  .drop(columns='_merge')
                  .drop_duplicates())

    augmented_table = pd.concat([train_table, candidates], axis=0, sort=True)
    # Summing per key also merges training rows whose cleaned names coincide; the keys
    # are unique afterwards, so no further de-duplication is needed.
    augmented_table = augmented_table.groupby(group_cols, as_index=False).sum()
    # Stable sort keeps the order of equal targets deterministic.
    augmented_table = augmented_table.sort_values(by='poisson_target', ascending=False,
                                                  kind='mergesort')
    return augmented_table.reset_index(drop=True)
# NOTE(review): the zero-target pairs are drawn by a random shuffle and no seed is fixed
# for this call, so `augmented_train` (and everything fitted on it) differs between runs.
augmented_train = creat_augmented(train_cat_data, growth)

In [None]:
def create_predictors(df):
    """Build a hashed feature matrix of main-word x cross-word combinations.

    For every row, the main and cross product names are split on whitespace and
    every (main word, cross word) combination is joined as ``"main_cross"``.
    The combinations of a row form one space-separated document, and the
    documents are encoded with a ``HashingVectorizer`` (2**18 features, binary
    indicators, no normalisation, alternating sign).

    Rows are paired positionally, so the output always has exactly one row per
    row of ``df`` in the same order, even when ``df.index`` is not unique (the
    previous index-based outer merge multiplied rows in that case).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``cleaned_main_product`` and
        ``cleaned_cross_product``.

    Returns
    -------
    The value returned by ``HashingVectorizer.fit_transform`` for the documents.
    """
    documents = [
        ' '.join(main_word + '_' + cross_word
                 for main_word, cross_word in itertools.product(main_name.split(),
                                                                cross_name.split()))
        for main_name, cross_name in zip(df['cleaned_main_product'],
                                         df['cleaned_cross_product'])
    ]
    vec = HashingVectorizer(ngram_range=(1, 1), n_features=(2 ** 18), norm=None, alternate_sign=True,
                            binary=True)
    return vec.fit_transform(documents)

In [None]:
# x: hashed main-word x cross-word feature matrix for the augmented pairs;
# y: the matching `poisson_target` column of the same frame.
x = create_predictors(augmented_train)
y = augmented_train['poisson_target']

In [None]:
# Ridge regression of the pair targets on the hashed word-pair features (alpha = 10).
# NOTE(review): presumably `fit` returns the estimator itself (scikit-learn convention),
# in which case `regression` and `test_regression` are the same object - confirm.
test_regression = linear_model.Ridge(alpha = 10)
regression = test_regression.fit(x, y)

In [None]:
# In-sample error: the model is scored on the same (x, y) it was fitted on, so this
# is a training MSE rather than a hold-out estimate.
mean_squared_error(y, test_regression.predict(x))