In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt


In [2]:
df=pd.read_csv('dfn3share.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,user,service,count,subcat,gender
0,0,3646,92,127,15,1
1,1,3646,94,18,15,1
2,2,3646,93,19,15,1
3,3,3646,95,2,17,1
4,4,3646,114,1,15,1


In [3]:
user_ids = df["user"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["service"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["user"] = df["user"].map(user2user_encoded)
df["service"] = df["service"].map(movie2movie_encoded)
num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["count"] = df["count"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["count"])
max_rating = max(df["count"])

print(
    "Number of users: {}, Number of services: {}, Min counting: {}, Max counting: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)


Number of users: 426026, Number of services: 301, Min counting: 1.0, Max counting: 127.0


In [4]:
df = df.sample(frac=1, random_state=42)
x = df[["user", "service"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["count"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
# Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)


In [5]:
EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)


model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)


In [6]:
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
service_df = pd.read_csv('dfn3share.csv')

# Let us get a user and see the top recommendations.
user_id = df.user.sample(1).iloc[0]
service_use_by_user = df[df.user == user_id]
service_not_used = service_df[
    ~service_df["service"].isin(service_use_by_user.service.values)
]["service"]
service_not_used = list(
    set(service_not_used).intersection(set(movie2movie_encoded.keys()))
)
service_not_used = [[movie2movie_encoded.get(x)] for x in service_not_used]
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(
    ([[user_encoder]] * len(service_not_used), service_not_used)
)
ratings = model.predict(user_movie_array).flatten()
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_movie_ids = [
    movie_encoded2movie.get(service_not_used[x][0]) for x in top_ratings_indices
]

print("Showing recommendations for user: {}".format(user_id))

# top_movies_user = (
#     service_use_by_user.sort_values(by="count", ascending=False)
#     .head(5)
#     .service.values
# )
# movie_df_rows = service_df[service_df["service"].isin(top_movies_user)]
# for row in movie_df_rows.itertuples():
#     print(row.service, ":", row.subcat)

print("----" * 8)
print("Top 10 service recommendations")
print("----" * 8)
recommended_movies = service_df[service_df["service"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print('The code of services:',row.service, ":",'the code of subcat:', row.subcat)


Showing recommendations for user: 284077
--------------------------------
Top 10 service recommendations
--------------------------------
The code of services: 219 : the code of subcat: 25
The code of services: 188 : the code of subcat: 25
The code of services: 162 : the code of subcat: 38
The code of services: 550 : the code of subcat: 3
The code of services: 214 : the code of subcat: 25
The code of services: 550 : the code of subcat: 3
The code of services: 220 : the code of subcat: 25
The code of services: 214 : the code of subcat: 25
The code of services: 208 : the code of subcat: 25
The code of services: 118 : the code of subcat: 14
The code of services: 78 : the code of subcat: 3
The code of services: 219 : the code of subcat: 25
The code of services: 220 : the code of subcat: 25
The code of services: 208 : the code of subcat: 25
The code of services: 198 : the code of subcat: 25
