In [1]:
import pandas as pd
import numpy as np
cols = ['useriD','StreamId', 'StreamerName', 'StartTime', 'EndTime']
df = pd.read_csv('100k_a.csv', header = None, names = cols)
df.head()

Unnamed: 0,useriD,StreamId,StreamerName,StartTime,EndTime
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
tf.config.run_functions_eagerly(True)

# from pyspark.sql import SparkSession
# from pyspark.ml.feature import StringIndexer
cols = ['useriD','StreamId', 'StreamerName', 'StartTime', 'EndTime']

"""
user - streamer - watch hours
user - streamer - number of times used watched perticular streamer
which is overall 2D
"""

def split_data(df, hold_out = 0.1):
    test = df.sample(frac = hold_out, replace = False)
    train = df[~df.index.isin(test.index)]
    return train, test

def build_watch_sparse(df):
    sparse_matrix = tf.SparseTensor(
        indices = df[['useriD','StreamerName']].values,
        values = df['ts'].values,
        dense_shape = [np.max(df['useriD']), np.max(df['StreamerName'])]
    )
    return sparse_matrix

def preproc(df):

    # spark = SparkSession.builder.appName('preprocess_dag').getOrCreate()

    # df = spark.createDataFrame(df, cols)
    df = pd.DataFrame(df, columns = cols)
    df.useriD = pd.factorize(df.useriD)[0] +1
    df.StreamerName = pd.factorize(df.StreamerName)[0]+1

    df['ts'] = df['EndTime'] - df['StartTime']
    df = df.groupby(['useriD', 'StreamerName']).sum('ts').reset_index()
    df['combined'] = df['useriD'].astype(str) + '_' + df['StreamerName'].astype(str)

    df = df.sample(frac = 1, random_state = 42)

    train, test = split_data(df)

    train_combined = set(train['combined'])
    mask = test['combined'].isin(train_combined)
    filtered_test = test[mask]

    train_sparse = build_watch_sparse(train)
    test_sparse = build_watch_sparse(test)

    # sparse_matrix = build_watch_sparse(df)

    # print(sparse_matrix)
    return train_sparse, test_sparse


train_sparse, test_sparse = preproc(df)

In [3]:
train_sparse


SparseTensor(indices=tf.Tensor(
[[ 45683    720]
 [ 91831   2739]
 [ 48942   1084]
 ...
 [  9051  35200]
 [ 44872 101513]
 [  8351    162]], shape=(1354642, 2), dtype=int64), values=tf.Tensor([70 13 19 ...  3  1  1], shape=(1354642,), dtype=int64), dense_shape=tf.Tensor([100000 162624], shape=(2,), dtype=int64))

In [4]:


# def split_data(data):
#     pass

# MSE

def sparse_loss(sparse_matrix, user_emb, streamer_emb):
    pred = tf.reduce_sum(
        tf.gather(user_emb, sparse_matrix.indices[:,0]) *
        tf.gather(streamer_emb, sparse_matrix.indices[:, 1]),
        axis = 1
    )

    loss = tf.losses.mean_squared_error(sparse_matrix.values, pred)
    return loss


def embed_init(train, emb_dim = 3, init_stddev = 1):

    num_users, num_streamers = train.shape


    xavier_init = tf.initializers.GlorotNormal()

    U = tf.Variable(xavier_init(
        shape = (num_users+1,emb_dim)
    ))
    V = tf.Variable(xavier_init(
        shape = (num_streamers+1, emb_dim)
    ))

    return U,V


U,V = embed_init(train_sparse)







In [5]:
tf.gather(U,train_sparse.indices[:,0])

<tf.Tensor: shape=(1354642, 3), dtype=float32, numpy=
array([[ 0.00416761,  0.00836583,  0.00067576],
       [ 0.00531957,  0.00894421,  0.00205468],
       [-0.00120842, -0.00340456,  0.01016272],
       ...,
       [-0.00329517,  0.00012952, -0.00459291],
       [ 0.00674833, -0.00047842, -0.00126578],
       [ 0.00012345,  0.00039406,  0.00611719]], dtype=float32)>

In [6]:
U

<tf.Variable 'Variable:0' shape=(100001, 3) dtype=float32, numpy=
array([[ 0.0042925 , -0.00067712,  0.00169951],
       [-0.00342722,  0.00629227, -0.00469559],
       [ 0.00080689,  0.00043371,  0.00157024],
       ...,
       [-0.00130897, -0.00053903,  0.00133053],
       [-0.01012751,  0.00616905,  0.001772  ],
       [ 0.0056945 , -0.00483312, -0.00214897]], dtype=float32)>

In [7]:


def collaborative_filtering_train(train_sparse, U, V, num_epochs=10, learning_rate=0.01):
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    for epoch in range(num_epochs):
        with tf.GradientTape() as tape:
            loss = sparse_loss(train_sparse, U,V)

        gradients = tape.gradient(loss, [U, V])
        optimizer.apply_gradients(zip(gradients, [U, V]))

        if epoch % 99 ==0:
          print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.numpy()}")

    return U, V


U, V = collaborative_filtering_train(train_sparse, U, V, num_epochs=1000, learning_rate=0.01)



Epoch 1/1000, Loss: 211.0229949951172
Epoch 100/1000, Loss: 177.65760803222656
Epoch 199/1000, Loss: 132.489990234375
Epoch 298/1000, Loss: 108.56517028808594
Epoch 397/1000, Loss: 94.51472473144531
Epoch 496/1000, Loss: 85.52915954589844
Epoch 595/1000, Loss: 79.40997314453125
Epoch 694/1000, Loss: 75.02025604248047
Epoch 793/1000, Loss: 71.74691009521484
Epoch 892/1000, Loss: 69.23348999023438
Epoch 991/1000, Loss: 67.25261688232422


In [8]:
U[256]

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([-2.87786   , -2.655604  , -0.25101382], dtype=float32)>

In [11]:
def score( U,V):
    scores = tf.reduce_sum(tf.multiply(V, U), axis = 1) / (tf.norm(V, axis = 1)* tf.norm(U))
    return scores


scores = score(U[122], V)
ordered_scores = np.argsort(scores)[::-1]
top_k = ordered_scores[:6]
top_k

array([  2488,  29053,  31458, 139625,  75843,  79504])

In [12]:
df.head()

Unnamed: 0,useriD,StreamId,StreamerName,StartTime,EndTime
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594


In [16]:
cat[[  2488,  29053,  31458, 139625,  75843,  79504]]

Index(['nemo_good', 'theivyx', 'sonixpride', 'jvitorslv', 'monniezinha',
       'ld50_ii'],
      dtype='object')