In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import os
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from numpy import linalg as la

## set parameters
# ROOT is the directory that holds the input CSV files; edit it for your machine.
#ROOT = 'PATH/TO/data/processed/'
ROOT = '/home/kddlab/swyoo/data/'
PATH_TO_TRAIN = os.path.join(ROOT, 'train.csv')
PATH_TO_TEST = os.path.join(ROOT, 'test.csv')
PATH_TO_ITEM = os.path.join(ROOT, 'item_metadata.csv')
checkpoint_dir = './checkpoint'
# makedirs(exist_ok=True) is safe to re-run, creates missing parents, and
# avoids the race between checking for the directory and creating it.
os.makedirs(checkpoint_dir, exist_ok=True)

In [5]:
## load data
# Read the three configured CSV files, in order, into DataFrames.
train, test, item = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TO_TRAIN, PATH_TO_TEST, PATH_TO_ITEM)
)

In [6]:
def encoding(data):
    """One-hot encode the ``platform`` and ``device`` columns of ``data``.

    Only the columns at positions 0, 6, 7 and 8 of ``data`` are kept. The
    ``platform`` and ``device`` columns (which must be among those four) are
    then replaced by one indicator column per distinct value, appended on the
    right in that order and named after the category value itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least nine columns that contains ``platform`` and
        ``device`` at the selected positions.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.

    Notes
    -----
    The indicator columns depend on the categories present in ``data``, so
    two frames encoded separately only share a column layout when they
    contain the same set of platform and device values.
    """
    # Positional selection is kept as-is; in the notebook's data these
    # positions hold user_id, platform, city and device.
    A = data[data.columns[[0, 6, 7, 8]]]
    for column in ('platform', 'device'):
        one_hot = pd.get_dummies(data[column])
        # join aligns on the row index, so the indicators line up with A.
        A = A.drop(column, axis=1).join(one_hot)
    return A

In [7]:
# One-hot encode the training frame (see `encoding`).
tr = encoding(train)

In [8]:
# One-hot encode the test frame (see `encoding`).
te = encoding(test)

In [9]:
# Show (rows, columns) of the encoded train and test frames, one per line.
print(tr.shape, te.shape, sep='\n')

(15932992, 60)
(3782335, 60)


In [17]:
# Preview the first ten encoded training rows.
tr.head(10)

Unnamed: 0,user_id,city,AA,AE,AR,AT,AU,BE,BG,BR,...,TR,TW,UK,US,UY,VN,ZA,desktop,mobile,tablet
0,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,00RL8Z82B2Z1,"Sydney, Australia",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
def reduction(A,rate=0.01,dimension=3):
    """Aggregate rows per user and project the result onto PCA components.

    NOTE: a later cell defines ``reduction`` again; once that cell runs, its
    definition replaces this one.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame with a ``user_id`` column plus numeric feature columns.
    rate : float, optional
        Unused; kept so existing calls keep working.
    dimension : int, optional
        Number of principal components to keep.

    Returns
    -------
    numpy.ndarray
        One row per distinct ``user_id`` (in sorted key order) and
        ``dimension`` columns.
    """
    # numeric_only=True keeps string columns (e.g. city) out of the sum;
    # on recent pandas they would otherwise be concatenated and break the
    # norm computation below.
    B = A.groupby('user_id').sum(numeric_only=True)
    M = B.values
    # Scale the whole matrix to unit Frobenius norm.
    M = M/la.norm(M, 'fro')
    pca = PCA(n_components=dimension)
    pca.fit(M)
    # Plain projection onto the fitted components; unlike pca.transform,
    # this does not subtract the fitted mean first.
    M = np.matmul(M,pca.components_.T)
    return M

In [10]:
def reduction(A,rate=0.01,dimension=3,max_rows=1000):
    """Aggregate rows per user and project the result onto PCA components.

    Parameters
    ----------
    A : pandas.DataFrame
        Frame with a ``user_id`` column plus numeric feature columns.
    rate : float, optional
        Unused; kept so existing calls keep working.
    dimension : int, optional
        Number of principal components to keep.
    max_rows : int or None, optional
        Only the first ``max_rows`` rows of ``A`` are used. The default of
        1000 preserves the previous hard-coded truncation; pass ``None`` to
        use every row.

    Returns
    -------
    numpy.ndarray
        One row per distinct ``user_id`` found in the rows used (in sorted
        key order) and ``dimension`` columns.
    """
    sample = A if max_rows is None else A[:max_rows]
    # numeric_only=True keeps string columns (e.g. city) out of the sum;
    # on recent pandas they would otherwise be concatenated and break the
    # norm computation below.
    B = sample.groupby('user_id').sum(numeric_only=True)
    M = B.values
    # Scale the whole matrix to unit Frobenius norm.
    M = M/la.norm(M, 'fro')
    pca = PCA(n_components=dimension)
    pca.fit(M)
    # Plain projection onto the fitted components; unlike pca.transform,
    # this does not subtract the fitted mean first.
    M = np.matmul(M,pca.components_.T)
    return M

In [11]:
# Reduce the encoded train and test frames to per-user low-dimensional points.
# NOTE(review): each `reduction` call fits its own PCA, so A and B are
# projected onto independently fitted components -- confirm that comparing
# them later (km.predict(B)) is intended.
A, B = reduction(tr), reduction(te)

In [12]:
# Shape of the reduced training matrix returned by `reduction`.
A.shape

(56, 3)

In [13]:
# Shape of the reduced test matrix returned by `reduction`.
B.shape

(47, 3)

In [14]:
# Cluster the reduced training points into 10 groups; random_state is fixed
# so repeated runs use the same seed.
km = KMeans(n_clusters=10, random_state=0)
km.fit(A)
km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [15]:
# Cluster label assigned to each row of A by the fitted model.
km.labels_

array([7, 0, 4, 1, 8, 1, 1, 1, 7, 1, 6, 1, 1, 7, 1, 7, 1, 1, 4, 5, 2, 1,
       7, 1, 9, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 3, 0, 1, 1, 1, 5, 1, 7, 1,
       1, 1, 1, 1, 1, 5, 1, 4, 1, 7, 1, 1], dtype=int32)

In [16]:
# Assign each row of B to a cluster of the model that was fitted on A.
km.predict(B)

array([1, 1, 1, 1, 5, 4, 1, 1, 5, 3, 1, 1, 1, 7, 4, 1, 1, 7, 1, 1, 1, 5,
       1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1,
       5, 1, 2], dtype=int32)