In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the raw training table from the Excel workbook; the path is relative to
# the notebook's working directory.
df_raw = pd.read_excel('Train.xlsx')

### Feature List Operations
0. sor: One-hot dummy
1. cdf_seq_no: Deleted, unique to transaction
2. trans_desc: Tokenized and Word2Vec
3. merchant_cat_code: Numerical input
4. amt: Input (continuous)                       
5. db_cr_cd: One-hot dummy
6. payment_reporting_category: Deleted (unused)
7. payment_category:  One-hot dummy
8. is_international: One-hot dummy
9. default_brand: Word2Vec sum embedding
10. default_location: Last two characters taken as a State code (integer-encoded categorical)
11. qrated_brand: unused
12. coalesced_brand: Word2Vec sum embedding  

In [73]:
# Drop the columns that are not used as features
unused_columns = ['cdf_seq_no','payment_reporting_category']
df_train = df_raw.drop(columns=unused_columns)

# One-hot encode the categorical columns. The untouched columns stay on the
# left and the indicator columns are appended on the right; later cells select
# columns by position, so this ordering matters.
categ = ['sor','db_cr_cd', 'payment_category', 'is_international']
passthrough_part = df_train.drop(columns=categ)
indicator_part = pd.get_dummies(df_train[categ], columns=categ, drop_first=False)
df_categ = pd.concat([passthrough_part, indicator_part], axis=1)

# Integer-encode the last two characters of the location string as a State
# code (the raw locations contain some errors, so this is approximate)
state_codes, _ = pd.factorize(df_categ['default_location'].str[-2:])
df_categ['State'] = state_codes

# Replace every remaining missing value, in every column, with 0
df_categ = df_categ.fillna(0)

# Integer-encode the target; class_names keeps the original labels so a
# numeric class can be mapped back to its name
category_codes, class_names = pd.factorize(df_categ['Category'])
df_categ['Category'] = category_codes

In [22]:
## Load GloVe word vectors
def load_glove_model(File):
    """Parse a GloVe-format text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the
    word and the remaining fields are the vector components.

    Parameters
    ----------
    File : str or path-like
        Path to the vector file (for example ``glove.6B.50d.txt``).

    Returns
    -------
    dict
        Maps each word to a 1-D ``np.float64`` array of its components.

    Raises
    ------
    ValueError
        If a component on a non-blank line cannot be parsed as a float.
    """
    print("Loading Glove Model")
    glove_model = {}
    # The published GloVe files are UTF-8 and contain non-ASCII tokens, so the
    # encoding is pinned instead of relying on the platform default.
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

# Load the word -> vector lookup used to embed the text columns. The path is
# relative to the working directory; the file name suggests 50-dimensional
# vectors (confirm it matches the embedding size used downstream).
glove_word2vec = load_glove_model('glove.6B.50d.txt')

Loading Glove Model
400000 words loaded!


In [71]:
## Tokenize and embed the brand text columns

embed_size = 50  # length of each vector in glove_word2vec
# Columns to embed, in the order their vectors are laid out in each row of `embed`
brand_columns = ['default_brand', 'coalesced_brand']
num_embed = len(brand_columns)


def _sum_token_vectors(text, word_vectors, size):
    """Return the sum of the vectors of the known tokens in ``text``.

    ``text`` is converted with ``str()``, lower-cased and split on
    whitespace. Tokens missing from ``word_vectors`` are ignored, so a
    value with no known token yields an all-zero vector of length ``size``.
    """
    total = np.zeros(size)
    for token in str(text).lower().split():
        if token in word_vectors:
            total += word_vectors[token]
    return total


# Row i of `embed` is [default_brand vector | coalesced_brand vector].
embed = np.zeros((len(df_categ), embed_size*num_embed))

# NOTE(review): str() is applied to every cell, so non-string cells (e.g. the 0
# written by the earlier fillna(0)) are tokenised as their printed form ("0").
# Confirm that this is the intended encoding for missing brands.
for iCol, column in enumerate(brand_columns):
    first = iCol * embed_size
    # Iterate over the column once instead of building a full row Series with
    # df_categ.iloc[iRow] for every row.
    for iRow, text in enumerate(df_categ[column]):
        embed[iRow, first:first + embed_size] = _sum_token_vectors(
            text, glove_word2vec, embed_size)

In [103]:
# Feature matrix, joined column-wise from three pieces:
#   * the columns at positions 1 and 2 (presumably merchant_cat_code and amt;
#     this depends on the column order of Train.xlsx, so verify),
#   * every column from position 8 onward,
#   * the brand embeddings.
leading_features = df_categ.iloc[:, [1,2]].to_numpy()
trailing_features = df_categ.iloc[:, 8:].to_numpy()
X_data = np.concatenate([leading_features, trailing_features, embed], axis=1)

# Target vector: the column at position 7 (presumably the factorized
# Category column; verify against the column order).
y_data = df_categ.iloc[:, 7].to_numpy()

print('Shape of Feature Data: {}'.format(X_data.shape))
print('Shape of Target Data: {}'.format(y_data.shape))

# Store both arrays in a single .npz archive under the keys X_data and y_data
np.savez('data.npz', X_data=X_data, y_data=y_data)
print('Data saved to data.npz')

Shape of Feature Data: (40000, 114)
Shape of Target Data: (40000,)
Data saved to data.npz
