In [1]:
import sys
import os
import warnings
# Restrict OpenBLAS to a single thread. This cell runs before the numeric
# libraries are imported in the next cell (presumably for `implicit` -- TODO confirm).
os.environ.update({'OPENBLAS_NUM_THREADS': '1'})
# Install a blanket "ignore" filter: no warnings are shown for the rest of the session.
warnings.filterwarnings(action='ignore')

In [2]:
import bisect
import gc
import logging
import pickle
import time

import implicit
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import scipy
from gensim.models import Word2Vec
from tqdm import tqdm

In [7]:
def save(obj, path, verbose=True):
    """Pickle ``obj`` into the file at ``path``.

    Args:
        obj: Any picklable object.
        path: Destination file; opened in binary write mode, so an existing
            file at that location is overwritten.
        verbose: When true, print one message before and one after writing.

    Returns:
        None.
    """
    def _announce(template):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(template.format(path))

    _announce("Saving object to {}")
    with open(path, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
    _announce("Object saved to {}")

In [4]:
def load(path, verbose=True):
    """Read back an object that was pickled to ``path``.

    Args:
        path: File to read; opened in binary read mode.
        verbose: When true, print one message before and one after reading.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        from a trusted source.
    """
    def _announce(template):
        # Progress messages are emitted only in verbose mode.
        if verbose:
            print(template.format(path))

    _announce("Loading object from {}")
    with open(path, "rb") as source:
        restored = pickle.load(source)
    _announce("Object loaded from {}")
    return restored

In [3]:
# Read only the id column plus the nine categorical columns listed in the
# feature-encoding cell further down; other columns of the file are not loaded.
data = pd.read_feather(
    '/data/dataset_full.feather',
    columns=[
        "user_id",
        "region_name",
        "city_name",
        "cpe_manufacturer_name",
        "cpe_model_name",
        "cpe_type_cd",
        "cpe_model_os_type",
        "part_of_day",
        "day_name",
        "day_name_parts",
    ],
)

In [None]:
# The word2vec-based categorical encoder below is taken from
# https://github.com/CameleoGrey/cameleogrey_mtsmlcup

class GreyCategoricalEncoder():
    """Word2vec-based encoder for a single categorical feature.

    ``fit`` shuffles the feature's values ``shuffle_count`` times, trains a
    ``Word2Vec`` model on the shuffled sequences and keeps the resulting
    ``{value: vector}`` mapping in ``w2v_dict``. ``transform`` then replaces
    every value with its learned vector.

    Args:
        shuffle_count: Number of shuffled copies of the value list that are
            used as training sequences.
        vec_size: Embedding length (passed to ``Word2Vec`` as ``vector_size``).
        window: Forwarded unchanged to ``Word2Vec``.
        n_jobs: Forwarded to ``Word2Vec`` as ``workers``.
        min_count: Forwarded unchanged to ``Word2Vec``.
        sample: Forwarded unchanged to ``Word2Vec``.
        epochs: Forwarded unchanged to ``Word2Vec``.
        sg: Forwarded unchanged to ``Word2Vec``.
        seed: Seeds both the shuffling in ``fit`` and ``Word2Vec`` itself.
    """

    def __init__(self,
                 shuffle_count=5, vec_size=1,
                 window=5, n_jobs=8,
                 min_count=1, sample=0,
                 epochs=100, sg=0, seed=45):

        # Both attributes are filled in by fit().
        self.feature_name = None
        self.w2v_dict = None

        self.shuffle_count = shuffle_count
        self.vec_size = vec_size
        self.window = window
        self.n_jobs = n_jobs
        self.min_count = min_count
        self.sample = sample
        self.epochs = epochs
        self.sg = sg
        self.seed = seed

    def fit(self, values_list, feature_name):
        """Learn one embedding per distinct value of the feature.

        Args:
            values_list: Sequence of the feature's values (one per row).
                It is copied before shuffling and never modified.
            feature_name: Name used as the prefix of the encoded column names.

        Returns:
            ``self``, so calls can be chained.
        """
        self.feature_name = feature_name

        # A private, seeded generator makes the shuffles reproducible for a
        # given ``seed`` and leaves NumPy's global random state untouched.
        rng = np.random.RandomState(self.seed)

        shuffled_token_rows = []
        for _ in tqdm(range(self.shuffle_count), desc="Generating token shuffles for fitting cat encoder"):
            # A shallow copy is enough: shuffling only reorders the elements.
            shuffled_tokens = list(values_list)
            rng.shuffle(shuffled_tokens)
            shuffled_token_rows.append(shuffled_tokens)

        self.w2v_dict = self.fit_word2vec_(shuffled_token_rows, size=self.vec_size,
                                           window=self.window, n_jobs=self.n_jobs,
                                           min_count=self.min_count, sample=self.sample,
                                           epochs=self.epochs, sg=self.sg, seed=self.seed)

        return self

    def transform(self, values_list):
        """Replace every value with its learned embedding.

        Args:
            values_list: Sequence of feature values; each one must have been
                present in the data passed to ``fit``.

        Returns:
            A tuple ``(encoded_feats, encoded_feats_names)``.
            ``encoded_feats`` is a NumPy array built from one entry per input
            value (a scalar per value when the embedding length is 1, a
            vector per value otherwise). ``encoded_feats_names`` holds
            ``vec_size`` column names of the form ``"<feature_name>_<i>"``.

        Raises:
            RuntimeError: If the encoder has not been fitted.
            KeyError: If a value was not seen during ``fit``.
        """
        encoded_feats_names = [
            str(self.feature_name) + "_{}".format(i) for i in range(self.vec_size)
        ]

        encoded_feats = np.array(self.encode_features_(values_list, verbose=True))

        return encoded_feats, encoded_feats_names

    def fit_word2vec_(self, texts, size=128, window=5, n_jobs=8, min_count=1, sample=0, epochs=100, sg=0, seed=45):
        """Train ``Word2Vec`` on ``texts`` and return a ``{token: vector}`` dict.

        Args:
            texts: List of token sequences used as training sentences.
            size: Embedding length (``vector_size`` of ``Word2Vec``).
            window, min_count, sample, epochs, sg, seed: Forwarded unchanged.
            n_jobs: Forwarded as ``workers``.

        Returns:
            Dict mapping every token in the model vocabulary to its vector.
        """
        # NOTE: this changes the level of the process-wide root logger
        # (presumably to surface gensim's training progress -- TODO confirm).
        logging.root.setLevel(level=logging.INFO)
        w2v_model = Word2Vec(texts, vector_size=size, window=window, workers=n_jobs,
                             min_count=min_count, sample=sample, epochs=epochs, sg=sg, seed=seed)
        w2v_dict = dict(zip(w2v_model.wv.index_to_key, w2v_model.wv.vectors))

        # Drop the local reference to the training sequences before collecting.
        del texts
        gc.collect()

        return w2v_dict

    def encode_features_(self, texts, verbose=True):
        """Look up the embedding of every value in ``texts``.

        Args:
            texts: Sequence of feature values to encode.
            verbose: When true, show a tqdm progress bar.

        Returns:
            A list with one entry per input value: the bare scalar when the
            stored vectors have length 1, otherwise the full vector.

        Raises:
            RuntimeError: If ``w2v_dict`` is empty or unset (encoder not fitted).
            KeyError: If a value has no entry in ``w2v_dict``.
        """
        if not self.w2v_dict:
            raise RuntimeError("GreyCategoricalEncoder must be fitted before encoding")

        # The stored vectors, not ``self.vec_size``, decide the output shape.
        vec_size = len(next(iter(self.w2v_dict.values())))

        tokens = tqdm(texts, desc="Vectorizing texts") if verbose else texts

        text_vectors = []
        for token in tokens:
            try:
                current_vector = self.w2v_dict[token]
            except KeyError:
                raise KeyError(
                    "Value {!r} of feature {!r} was not seen during fit".format(token, self.feature_name)
                ) from None
            if vec_size == 1:
                # Length-1 embeddings are flattened to plain scalars.
                current_vector = current_vector[0]
            text_vectors.append(current_vector)
        return text_vectors

In [None]:
def fit_cat_encoders(df, feature_names, 
                         embedding_sizes, shuffle_counts):
    """Fit one ``GreyCategoricalEncoder`` per categorical column.

    Args:
        df: DataFrame that contains every column named in ``feature_names``.
        feature_names: Names of the columns to fit encoders for.
        embedding_sizes: Embedding length for each feature, in the same
            order as ``feature_names``.
        shuffle_counts: Shuffle count for each feature, in the same order
            as ``feature_names``.

    Returns:
        Dict mapping each feature name to its fitted encoder.

    Raises:
        ValueError: If ``embedding_sizes`` or ``shuffle_counts`` has fewer
            entries than ``feature_names``.
    """
    # Fail before any (expensive) training starts rather than with an
    # IndexError after some encoders have already been fitted.
    n_features = len(feature_names)
    for arg_name, arg_values in (("embedding_sizes", embedding_sizes),
                                 ("shuffle_counts", shuffle_counts)):
        if len(arg_values) < n_features:
            raise ValueError(
                "{} has {} entries but {} feature names were given".format(
                    arg_name, len(arg_values), n_features))

    cat_encoders = {}
    per_feature = zip(feature_names, embedding_sizes, shuffle_counts)
    for name, vec_size, shuffle_count in tqdm(per_feature, total=n_features,
                                              desc="Fitting category encoders"):
        cat_encoders[name] = GreyCategoricalEncoder(shuffle_count=shuffle_count, vec_size=vec_size)
        cat_encoders[name].fit(df[name].to_list(), name)

    gc.collect()

    return cat_encoders
    
def transform_cat_features(df, cat_encoders, save_dir='/data/utils/'):
    """Replace categorical columns of ``df`` by their embeddings, in place.

    For every feature in ``cat_encoders`` the original column is removed from
    ``df``, its embedding columns are added, and a frame holding the
    embedding columns plus ``user_id`` is pickled to
    ``<save_dir>/<feature_name>.pkl`` via ``save``.

    Args:
        df: DataFrame with a ``user_id`` column and one column per key of
            ``cat_encoders``. It is modified in place.
        cat_encoders: Dict mapping feature names to fitted encoders whose
            ``transform`` returns ``(embeddings, column_names)``.
        save_dir: Directory that receives the per-feature pickle files.

    Returns:
        None.
    """
    user_ids = None

    for feature_name in tqdm(cat_encoders.keys(), desc="Encoding categorical features"):
        if user_ids is None:
            # Snapshot the ids once, before any column is deleted: "user_id"
            # may itself be one of the encoded columns. Plain values (not a
            # Series) are used so they are attached by position instead of
            # being aligned on df's index.
            user_ids = df['user_id'].to_numpy()

        feature_values = df[feature_name].to_list()
        embeddings, column_names = cat_encoders[feature_name].transform(feature_values)
        del df[feature_name]

        df_enc = pd.DataFrame(embeddings, columns=column_names)
        df_enc['user_id'] = user_ids
        # saving features
        save(df_enc, os.path.join(save_dir, feature_name + '.pkl'))
        del df_enc

        for i in tqdm(range(len(column_names)), desc="Replacing original \"{}\" by encoded".format(feature_name)):
            if len(column_names) == 1:
                # Single-column embeddings arrive as a 1-D array.
                df[column_names[i]] = embeddings[:]
            else:
                df[column_names[i]] = embeddings[:, i]

    gc.collect()

In [None]:
# "user_id" is deliberately not listed: transform_cat_features() copies it next
# to every saved embedding, so it has to stay in `data` as the id column.
base_cat_feature_names = ["region_name", "city_name",
                          "cpe_manufacturer_name", "cpe_model_name",
                          "cpe_type_cd", "cpe_model_os_type", "part_of_day",
                          "day_name", "day_name_parts"]
# fit_cat_encoders() needs one embedding size and one shuffle count per
# feature, so both lists are derived from the feature list (size 1, 1 shuffle).
cat_encoders = fit_cat_encoders(data,
                                 feature_names=base_cat_feature_names,
                                 embedding_sizes=[1] * len(base_cat_feature_names),
                                 #embedding_sizes=[5, 5, 5, 5, 1, 1, 1, 1, 5],
                                 shuffle_counts=[1] * len(base_cat_feature_names))
transform_cat_features(data, cat_encoders)