In [104]:
import os
import numpy as np
import numba as nb
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib


### Define helper functions

In [142]:
def name_encoding(table = 'click_order_user.csv', window_r = 2):
    '''
    Build the n-gram output filename that corresponds to an input table.

    Every '.csv' occurrence is removed from the table name; the last
    underscore-separated token of what remains becomes the filename prefix.

    input:
    ------------
    table: str
        input table name
    window_r: int
        radius of the window. Window size = 2 * window_r + 1

    output:
    ------------
    n_gram_table_name: str
        name of the n-gram csv file

    example:
    ------------
    >>> name_encoding(table = 'click_order_user.csv', window_r = 2)
    'user_5_gram.csv'

    '''
    stem = table.replace('.csv', '')
    entity = stem.rsplit('_', 1)[-1]
    window_size = 2 * window_r + 1
    return '{}_{}_gram.csv'.format(entity, window_size)


def command_pipeline(command):
    '''
    execute a sequence of os commands

    The commands are written to a temporary batch file
    (command_pipeline_temp.bat in the current working directory), the file
    is run through os.system, and it is deleted afterwards. The file is
    also deleted when writing or running it raises.

    input:
    ------------
    command: multiple line string
        each line is an os command

    example:
    ------------
    >>> command = """
    ... cd ./data
    ... data_preprocessing.exe
    ... """
    >>> command_pipeline(command)

    '''
    script = 'command_pipeline_temp.bat'
    try:
        with open(script, 'w') as f:
            f.write(command)
        # The exit status returned by os.system is not inspected.
        os.system(script)
    finally:
        # Never leave the temporary script behind, even on failure.
        if os.path.exists(script):
            os.remove(script)
    
def create_order_column(rowData):
    '''
    generate buy column from n gram csv

    Scans rowData in order, skipping "NONE" entries, and returns the first
    entry whose second-to-last '__'-separated field equals "True".
    Returns "NONE" when no entry qualifies.
    '''
    ordered_entries = (
        entry for entry in rowData
        if entry != "NONE" and entry.split('__')[-2] == "True"
    )
    return next(ordered_entries, "NONE")


def save_model(obj, filepath):
    '''
    Serialize obj with joblib.dump into the file at filepath.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    '''
    with open(filepath, 'wb') as sink:
        joblib.dump(obj, sink)
        
        
def negative_sampling(l):
    '''
    negative sampling, input number of classes, output function to do negative sampling

    input:
    ---------
    l: int
        number of classes; samples are drawn from 0 .. l-1

    output:
    ---------
    sampling: function of sample
        sampling(rowData) returns one class id, drawn with np.random.choice,
        that does not occur in rowData. It raises ValueError when rowData
        already contains every class id, because no negative sample exists.
    '''
    def sampling(rowData):
        rowData = np.array(rowData)
        # The rejection loop below would never terminate if the row covers
        # every class. A row with fewer than l entries cannot do that, so
        # the full membership test only runs in the rare suspicious case.
        if rowData.size >= l and np.isin(np.arange(l), rowData).all():
            raise ValueError(
                'cannot draw a negative sample: row contains all %d classes' % l
            )
        negative_sample = np.random.choice(l)
        # Redraw until the candidate is absent from the row.
        while negative_sample in rowData:
            negative_sample = np.random.choice(l)
        return negative_sample
    return sampling


## 1. Preprocessing
### 1.1 Specify arguments and configuration 

In [135]:
# radius of the n-gram window; name_encoding uses 2 * WINDOW_R + 1 as the window size
WINDOW_R = 2
# number of negative-sample columns added per row in section 1.2.3
NUM_NEGATIVE = 4

DATA_PATH = "./data/" # path to store data
USER_TABLE_NAME = 'click_order_user.csv'
USER_N_GRAM_TABLE_NAME = name_encoding(USER_TABLE_NAME, WINDOW_R) # name of output n-gram file

SKU_TABLE_NAME = 'click_order_sku.csv'
SKU_N_GRAM_TABLE_NAME = name_encoding(SKU_TABLE_NAME, WINDOW_R) # name of output n-gram file


# show the derived output filenames so a wrong WINDOW_R / table name is easy to spot
print('output n gram data into: ', USER_N_GRAM_TABLE_NAME, SKU_N_GRAM_TABLE_NAME)

output n gram data into:  user_5_gram.csv sku_5_gram.csv


### 1.2 preprocessing

1. Join the tables, then sort by sku_ID or user_ID.
2. Run data_preprocessing.exe on the joined table to produce the n-gram CSV file.

In [57]:
# Commands for the user n-gram file: run table_join.py, then call
# data_preprocessing.exe from inside ./data with the window radius, the
# input table name and the output n-gram filename filled in.
preprocessing_command_user = """
python table_join.py
cd ./data
data_preprocessing.exe {0} sku_ID {1} {2} user_level gender education city_level purchase_power marital_status age if_order
cd ..
""".format(WINDOW_R, USER_TABLE_NAME, USER_N_GRAM_TABLE_NAME)

# Commands for the sku n-gram file.
# NOTE(review): this command is only defined here; nothing in this cell runs it.
preprocessing_command_sku = """
cd ./data
data_preprocessing.exe {0} user_ID {1} {2} sku_ID if_order
cd ..
""".format(WINDOW_R, SKU_TABLE_NAME, SKU_N_GRAM_TABLE_NAME)

# Skip the preprocessing when the user n-gram file already exists.
# The original guard used the undefined name N_GRAM_TABLE_NAME, which raises
# NameError on a fresh kernel.
if not os.path.exists(os.path.join(DATA_PATH, USER_N_GRAM_TABLE_NAME)):
    command_pipeline(preprocessing_command_user)

### 1.2.1 user_encoding

In [86]:
# Read the user n-gram file named in the configuration cell instead of a
# hard-coded path, so a different WINDOW_R loads the matching file.
# header=None: the first line of the file is read as data, not column names.
user_5_gram = pd.read_csv(os.path.join(DATA_PATH, USER_N_GRAM_TABLE_NAME), header=None)
user_5_gram.head(5)

Unnamed: 0,0,1,2,3,4
0,NONE,NONE,4__M__3__2__2__S__26-35__,3__M__2__2__2__M__26-35__,2__F__1__1__2__M__36-45__
1,NONE,4__M__3__2__2__S__26-35__,3__M__2__2__2__M__26-35__,2__F__1__1__2__M__36-45__,NONE
2,4__M__3__2__2__S__26-35__,3__M__2__2__2__M__26-35__,2__F__1__1__2__M__36-45__,NONE,NONE
3,NONE,NONE,1__F__-1__-1__-1__U__16-25__,NONE,NONE
4,NONE,NONE,4__M__4__2__2__S__26-35__,1__U__-1__-1__-1__U__U__,NONE


### 1.2.2 label encoding

In [106]:
# Single definition of the encoder location (it was repeated three times).
LABEL_ENCODER_PATH = "./models/5_gram_user_label_encoder.pk"

if not os.path.exists(LABEL_ENCODER_PATH):
    label_encoder = LabelEncoder()
    # Only the fitted classes are needed here; the transformed array that
    # fit_transform returned was thrown away, so plain fit() is enough.
    label_encoder.fit(user_5_gram.values.ravel())

    # Create the target directory first so the dump cannot fail on a fresh checkout.
    os.makedirs(os.path.dirname(LABEL_ENCODER_PATH), exist_ok=True)
    save_model(label_encoder, LABEL_ENCODER_PATH)
else:
    # NOTE: joblib.load unpickles the file; only load encoder files you trust.
    label_encoder = joblib.load(LABEL_ENCODER_PATH)

In [113]:
# Pass every column of the frame through label_encoder.transform and rebind
# user_5_gram to the encoded result.
# NOTE(review): because the name is rebound, re-running this cell feeds
# already-encoded values to transform; presumably that fails, so reload
# the csv before running it again.
user_5_gram = user_5_gram.apply(label_encoder.transform)
user_5_gram.head(10)

Unnamed: 0,0,1,2,3,4
0,7762,7762,7468,5897,2666
1,7762,7468,5897,2666,7762
2,7468,5897,2666,7762,7762
3,7762,7762,71,7762,7762
4,7762,7762,7664,2148,7762
5,7762,7664,2148,7762,7762
6,7762,7762,948,2148,4574
7,7762,948,2148,4574,3255
8,948,2148,4574,3255,7761
9,2148,4574,3255,7761,993


### 1.2.3 Negative sampling

In [143]:
# Seed the global numpy RNG so a re-run reproduces the same negative samples.
NEGATIVE_SAMPLING_SEED = 0
np.random.seed(NEGATIVE_SAMPLING_SEED)

num_classes_ = len(label_encoder.classes_)

# The n-gram columns are the first 2 * WINDOW_R + 1 columns; this was a
# hard-coded 5, which is only right for WINDOW_R = 2.
window_size = 2 * WINDOW_R + 1
context_columns = user_5_gram.iloc[:, :window_size]

# Build the sampler once instead of once per loop iteration.
draw_negative = negative_sampling(num_classes_)

for i in range(NUM_NEGATIVE):
    # One new column per iteration: for each row, a class id absent from that row's window.
    user_5_gram['nega_sample_' + str(i)] = context_columns.apply(draw_negative, axis = 1)

user_5_gram.head(15)

Unnamed: 0,0,1,2,3,4,nega_sample_0,nega_sample_1,nega_sample_2,nega_sample_3
0,7762,7762,7468,5897,2666,3091,7209,6728,7623
1,7762,7468,5897,2666,7762,4913,5457,799,3490
2,7468,5897,2666,7762,7762,5541,5148,7115,7718
3,7762,7762,71,7762,7762,1154,5509,6756,7543
4,7762,7762,7664,2148,7762,2928,1628,1120,3331
5,7762,7664,2148,7762,7762,1557,4865,1480,999
6,7762,7762,948,2148,4574,68,4457,7689,1482
7,7762,948,2148,4574,3255,4848,7192,1203,4849
8,948,2148,4574,3255,7761,6834,4378,3854,1107
9,2148,4574,3255,7761,993,375,2356,2264,5900


In [145]:
# Write the encoded n-grams plus negative samples without the row index.
# index=False is the documented boolean form; index=None only worked
# because None is falsy.
user_5_gram.to_csv(os.path.join(DATA_PATH, 'user_5_gram_for_word2vec.csv'), index=False)