In [1]:
import sys
import os
import warnings
# Cap OpenBLAS at a single thread. This is set before numpy/scipy/implicit are
# imported in the next cell, presumably so the BLAS backend picks the limit up
# when it is loaded (TODO confirm which library actually requires it).
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import gc
import pickle
import time
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import scipy
import implicit
import bisect

In [7]:
def save(obj, path, verbose=True):
    """Pickle `obj` into the file at `path`.

    The object is serialised with `pickle.HIGHEST_PROTOCOL`; an existing file
    at `path` is overwritten because the file is opened in "wb" mode.

    Args:
        obj: Any picklable Python object.
        path: Destination file path.
        verbose: When true, print one line before and one line after writing.

    Returns:
        None.
    """
    def _report(template):
        # Progress lines are emitted only when the caller asked for them.
        if verbose:
            print(template.format(path))

    _report("Saving object to {}")

    with open(path, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

    _report("Object saved to {}")

In [None]:
def load(path, verbose=True):
    """Read back an object that was pickled to `path`.

    Args:
        path: Path of the pickle file to read.
        verbose: When true, print one line before and one line after reading.

    Returns:
        The unpickled object.
    """
    def _report(template):
        # Progress lines are emitted only when the caller asked for them.
        if verbose:
            print(template.format(path))

    _report("Loading object from {}")
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as source:
        loaded = pickle.load(source)
    _report("Object loaded from {}")
    return loaded

In [3]:
# Read only the two columns this notebook works with from the full dataset.
data = pd.read_feather('/data/dataset_full.feather', columns=['user_id', 'url_host'])

In [4]:
# NOTE(review): within the visible notebook `feature_builder` is only referenced
# again by the `del` in the cleanup cell; the same pickle is loaded a second time
# later under the name `url_dict`.
feature_builder = load('/data/utils/url_only_factor_features_dict.pkl')

Loading object from /kaggle/input/fork-of-datapreprocessing/url_only_factor_features_dict.pkl
Object loaded from /kaggle/input/fork-of-datapreprocessing/url_only_factor_features_dict.pkl


In [5]:
def build_backward_index_(x_array):
    """Build an inverted index from each distinct value to its positions.

    Args:
        x_array: Indexable sequence (supports `len` and integer indexing)
            whose elements are hashable.

    Returns:
        dict mapping every distinct element of `x_array` to a numpy array of
        the positions at which it occurs, in increasing order.
    """
    positions_by_value = {}

    # First pass: gather positions in plain lists, which are cheap to append to.
    for position in tqdm(range(len(x_array)), desc="Building backward index"):
        positions_by_value.setdefault(x_array[position], []).append(position)

    # Second pass: freeze every list into a numpy array. Only the values of
    # existing keys are replaced, so the dict can be iterated while doing so.
    for value in tqdm(positions_by_value.keys(), desc="Building backward index (final types converting)"):
        positions_by_value[value] = np.array(positions_by_value[value])

    return positions_by_value

In [6]:
# Distinct user ids (np.unique returns them sorted).
uniq_user_ids = np.unique( data["user_id"].values )
# url_host column as an array; it comes from the same frame, so positions line up
# with the user_id column indexed below.
url_hosts = data["url_host"].values
# Map every user_id to the row positions at which it occurs.
user_ids_backward_index = build_backward_index_( data["user_id"].values )

Building backward index: 100%|██████████| 322899435/322899435 [04:04<00:00, 1319196.46it/s]
Building backward index (final types converting): 100%|██████████| 415317/415317 [00:37<00:00, 11167.94it/s]


In [8]:
# Persist the intermediates; the cells below delete the in-memory copies and
# reload these files.
save(user_ids_backward_index, "/data/utils/user_ids_backward_index.pkl")
save(uniq_user_ids, "/data/utils/uniq_user_ids.pkl")
save(url_hosts, "/data/utils/url_hosts.pkl")

Saving object to user_ids_backward_index.pkl
Object saved to user_ids_backward_index.pkl
Saving object to uniq_user_ids.pkl
Object saved to uniq_user_ids.pkl
Saving object to url_hosts.pkl
Object saved to url_hosts.pkl


In [None]:
# Drop the large in-memory objects (the next cell reloads what it needs from
# disk) and ask the garbage collector to run right away.
del data, user_ids_backward_index, feature_builder, uniq_user_ids, url_hosts
gc.collect()

In [None]:
# Reload the intermediates written by the save cell above.
user_ids_backward_index = load('/data/utils/user_ids_backward_index.pkl')
uniq_user_ids = load('/data/utils/uniq_user_ids.pkl')
url_hosts = load('/data/utils/url_hosts.pkl')
# Judging by how build_feat_dict / encode_user_urls_ index it, this dictionary maps
# each url_host token to a feature vector and also has a "feature_names" entry.
url_dict = load('/data/utils/url_only_factor_features_dict.pkl')

In [None]:
def build_feat_dict(uniq_user_ids, url_hosts, user_ids_backward_index, url_feat_dict):
    """Aggregate per-URL feature vectors into one feature vector per user.

    Args:
        uniq_user_ids: Indexable sequence of distinct user ids.
        url_hosts: numpy array of url_host strings, one per event row.
        user_ids_backward_index: Mapping from user id to the array of row
            positions in `url_hosts` that belong to that user.
        url_feat_dict: Mapping from URL token to its feature vector; it must
            also contain a "feature_names" entry, which is copied to the result.

    Returns:
        dict with a "feature_names" entry plus one entry per user id holding
        that user's row of the encoded feature matrix.
    """
    n_users = len(uniq_user_ids)

    per_user_tokens = []
    for pos in tqdm( range(n_users), desc="Aggregating urls by user_id" ):
        row_positions = user_ids_backward_index[ uniq_user_ids[pos] ]
        # Join and re-split on a single space, so a host that itself contains
        # a space is broken into several tokens.
        joined_hosts = " ".join( url_hosts[ row_positions ] )
        per_user_tokens.append( joined_hosts.split(" ") )

    feature_matrix = np.array(
        encode_user_urls_( per_user_tokens, url_feat_dict, verbose=True )
    )

    user_url_feats = {"feature_names": url_feat_dict["feature_names"]}
    for pos in range(n_users):
        user_url_feats[ uniq_user_ids[pos] ] = feature_matrix[pos, :]

    return user_url_feats

def encode_user_urls_(docs, url_feat_dict=None, verbose=True):
    """Encode each document as the mean feature vector of its distinct URLs.

    Duplicate URLs inside a document count once. They are de-duplicated in
    first-seen order rather than through a `set`, so the order in which the
    vectors are averaged (and therefore the floating-point result) does not
    depend on string hash randomisation between interpreter runs.

    Args:
        docs: Sequence of documents, each a sequence of hashable URL tokens.
        url_feat_dict: Mapping from URL token to its feature vector. Despite
            the `None` default it is required as soon as any document
            contains a token.
        verbose: When true, wrap the loop in a tqdm progress bar.

    Returns:
        list with one entry per document: the element-wise mean of the
        feature vectors of that document's distinct URLs.

    Raises:
        KeyError: If a URL token is missing from `url_feat_dict`.
    """
    doc_indices = range(len(docs))
    if verbose:
        doc_indices = tqdm(doc_indices, desc="Encoding unique user's urls")

    doc_vectors = []
    for i in doc_indices:
        # dict.fromkeys drops duplicates while keeping first-seen order.
        unique_urls = dict.fromkeys(docs[i])
        url_vectors = [url_feat_dict[url] for url in unique_urls]
        doc_vectors.append(np.mean(url_vectors, axis=0))
    return doc_vectors

In [None]:
# NOTE(review): this rebinds `url_dict` from the per-URL feature dictionary to the
# per-user one, so re-running this cell without reloading `url_dict` first would
# pass the per-user dictionary back in as the URL dictionary.
url_dict = build_feat_dict(uniq_user_ids, url_hosts, user_ids_backward_index, url_dict)

In [None]:
# Assemble one row per user from the per-user URL feature vectors.
feature_columns = list(url_dict["feature_names"])

feature_vectors = []
user_ids = []
for user_id in tqdm(url_dict.keys(), desc="Building merged features"):
    # "feature_names" is metadata stored next to the per-user entries, not a user.
    if user_id == "feature_names":
        continue
    user_ids.append(user_id)
    feature_vectors.append(url_dict[user_id])

# Derive the matrix shape from the data instead of a hard-coded user count, so
# the cell also works on a sample or on a refreshed dataset.
feature_vectors = np.array(feature_vectors).reshape(len(user_ids), len(feature_columns))
users_dataset = pd.DataFrame(data=feature_vectors, columns=feature_columns)
users_dataset["user_id"] = user_ids

In [None]:
# Create the destination folder first so the export does not fail on a missing
# directory once the long feature build has finished.
# NOTE(review): this path is relative to the working directory, whereas every
# other path in this notebook is absolute under /data. Confirm this is intended.
url_factor_csv_path = 'data/url_factor/url_factor.csv'
os.makedirs(os.path.dirname(url_factor_csv_path), exist_ok=True)
users_dataset.to_csv(url_factor_csv_path, index = False)