In [1]:
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import create_engine
import os
import pandas as pd

app = Flask(__name__)

engine = create_engine('sqlite:///recommender2.db', echo=False)

for f in os.listdir('data/movies/ml-latest-small'):
    if f[-4:] == '.csv':
        data = pd.read_csv(f'data/movies/ml-latest-small/{f}')
        data.to_sql(f[:-4], engine)
        print(f[0:-4])

links
ratings
movies
tags


In [2]:
watched_movie_id_list = ['70286', '109487', '589']


In [6]:
import numpy as np

movie_id_unique = 'SELECT * FROM movies'
all_movies = pd.read_sql(movie_id_unique, engine)

#remove the input movies from all_movies
movies_not_watched = all_movies[~all_movies['movieId'].isin(watched_movie_id_list)]
movies_not_watched.loc[:,'fake_id'] = np.ones(len(movies_not_watched), dtype =int)

#get all_ratings from sqlite
query = 'SELECT "userId", ratings."movieId", movies.title, rating FROM ratings JOIN movies ON ratings."movieId" = movies."movieId";'
all_ratings = pd.read_sql(query, engine)

#remove the watched movies from all_ratings
not_all_ratings = all_ratings[~all_ratings['movieId'].isin(watched_movie_id_list)]

#remove movieId and ratings of the watched movies???
movieindex = not_all_ratings['movieId'].unique().tolist()
dl_movie2movie_encoded = {x: i for i, x in enumerate(movieindex)}
dl_movie_encoded2movie = {i: x for i, x in enumerate(movieindex)}

not_all_ratings.loc[:,"movie"] = not_all_ratings["movieId"].map(dl_movie2movie_encoded)
not_all_ratings.loc[:,"rating"] = not_all_ratings["rating"].values.astype(np.float32)

#map userId from ?? to not_all_ratings ?? what do i do with the users??

not_all_user_ids = not_all_ratings["userId"].unique().tolist()
dl_user2user_encoded = {x: i for i, x in enumerate(not_all_user_ids)}
dl_userencoded2user = {i: x for i, x in enumerate(not_all_user_ids)}

not_all_ratings.loc[:,"user"] = not_all_ratings["userId"].map(dl_user2user_encoded)

min_rating = min(not_all_ratings["rating"])
max_rating = max(not_all_ratings["rating"])
num_users = len(dl_user2user_encoded)
num_movies = len(dl_movie_encoded2movie)
print(
        "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
            num_users, num_movies, min_rating, max_rating
        )
    )

#define training data
df = not_all_ratings.sample(frac=1, random_state=42)
x = not_all_ratings[["user", "movie"]].values

    # Normalize the targets between 0 and 1. Makes it easy to train.
y = not_all_ratings["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
    # Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
        x[:train_indices],
        x[train_indices:],
        y[:train_indices],
        y[train_indices:],
    )

#inference
user_movie_array = movies_not_watched[['fake_id','index']]
max_movie_index = user_movie_array['index'].max()
that = user_movie_array.to_numpy()

EMBEDDING_SIZE = 50

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class RecommenderNet(keras.Model):
        def __init__(self, num_users, num_movies, embedding_size, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.num_users = num_users
            self.num_movies = num_movies
            self.embedding_size = embedding_size
            self.user_embedding = layers.Embedding(
                num_users,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            self.user_bias = layers.Embedding(num_users, 1)
            self.movie_embedding = layers.Embedding(
            #change this line from num_movies to max_movie_index+1
                max_movie_index+1,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            #and this line (input_dim)
            self.movie_bias = layers.Embedding(max_movie_index+1, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            user_bias = self.user_bias(inputs[:, 0])
            movie_vector = self.movie_embedding(inputs[:, 1])
            movie_bias = self.movie_bias(inputs[:, 1])
            dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
            # Add all the components (including bias)
            x = dot_user_movie + user_bias + movie_bias
            # The sigmoid activation forces the rating to between 0 and 1
            return tf.nn.sigmoid(x)


model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
    )
ratings = model.predict(that).flatten()
movies_not_watched.loc[:,'prediction'] = ratings

#top_ratings_indices = ratings.argsort()[-10:][::-1] #เอา index มาผิด
highest_score = ratings[ratings.argsort()[-10:]][::-1] #เอาค่ามาถึงจะถูก

#movies_not_watched.loc[movies_not_watched['index'].isin(top_ratings_indices)]
movies_not_watched.loc[movies_not_watched.loc[:,'prediction'].isin(highest_score)] ##wuuuhuuuuwww

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Number of users: 610, Number of Movies: 9721, Min rating: 0.5, Max rating: 5.0


Unnamed: 0,index,movieId,title,genres,fake_id,prediction
954,954,1255,Bad Taste (1987),Comedy|Horror|Sci-Fi,1,0.524683
960,960,1261,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror,1,0.524533
5251,5251,8611,"Farmer's Daughter, The (1947)",Comedy,1,0.525885
5254,5254,8620,"Exterminating Angel, The (Ángel exterminador, ...",Comedy|Drama|Fantasy|Mystery,1,0.525476
5276,5276,8690,Slaughterhouse-Five (1972),Comedy|Drama|Sci-Fi|War,1,0.524473
6893,6893,63393,Camp Rock (2008),Comedy|Musical|Romance,1,0.523749
6912,6912,64167,Dinotopia (2002),Adventure|Fantasy,1,0.523539
9612,9612,176621,Boniface's Holiday (1965),Animation|Children|Comedy|Romance,1,0.526373
9613,9613,176751,American Made (2017),Crime|Thriller,1,0.526543
9631,9631,179073,Male Hunt (1964),Comedy,1,0.52478


In [7]:
ratings.max()

0.526543

In [8]:
import numpy as np

movie_id_unique = 'SELECT * FROM movies'
all_movies = pd.read_sql(movie_id_unique, engine)

#remove the input movies from all_movies
movies_not_watched = all_movies[~all_movies['movieId'].isin(watched_movie_id_list)]
movies_not_watched.loc[:,'fake_id'] = np.ones(len(movies_not_watched), dtype =int)

#get all_ratings from sqlite
query = 'SELECT "userId", ratings."movieId", movies.title, rating FROM ratings JOIN movies ON ratings."movieId" = movies."movieId";'
all_ratings = pd.read_sql(query, engine)

#remove the watched movies from all_ratings
not_all_ratings = all_ratings[~all_ratings['movieId'].isin(watched_movie_id_list)]

#remove movieId and ratings of the watched movies???
movieindex = not_all_ratings['movieId'].unique().tolist()
dl_movie2movie_encoded = {x: i for i, x in enumerate(movieindex)}
dl_movie_encoded2movie = {i: x for i, x in enumerate(movieindex)}

not_all_ratings.loc[:,"movie"] = not_all_ratings["movieId"].map(dl_movie2movie_encoded)
not_all_ratings.loc[:,"rating"] = not_all_ratings["rating"].values.astype(np.float32)

#map userId from ?? to not_all_ratings ?? what do i do with the users??

not_all_user_ids = not_all_ratings["userId"].unique().tolist()
dl_user2user_encoded = {x: i for i, x in enumerate(not_all_user_ids)}
dl_userencoded2user = {i: x for i, x in enumerate(not_all_user_ids)}

not_all_ratings.loc[:,"user"] = not_all_ratings["userId"].map(dl_user2user_encoded)

min_rating = min(not_all_ratings["rating"])
max_rating = max(not_all_ratings["rating"])
num_users = len(dl_user2user_encoded)
num_movies = len(dl_movie_encoded2movie)
print(
        "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
            num_users, num_movies, min_rating, max_rating
        )
    )

#define training data
df = not_all_ratings.sample(frac=1, random_state=42)
x = not_all_ratings[["user", "movie"]].values

    # Normalize the targets between 0 and 1. Makes it easy to train.
y = not_all_ratings["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
    # Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
        x[:train_indices],
        x[train_indices:],
        y[:train_indices],
        y[train_indices:],
    )

#inference
user_movie_array = movies_not_watched[['fake_id','index']]
max_movie_index = user_movie_array['index'].max()
that = user_movie_array.to_numpy()

EMBEDDING_SIZE = 50

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class RecommenderNet(keras.Model):
        def __init__(self, num_users, num_movies, embedding_size, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.num_users = num_users
            self.num_movies = num_movies
            self.embedding_size = embedding_size
            self.user_embedding = layers.Embedding(
                num_users,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            self.user_bias = layers.Embedding(num_users, 1)
            self.movie_embedding = layers.Embedding(
            #change this line from num_movies to max_movie_index+1
                max_movie_index+1,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            #and this line (input_dim)
            self.movie_bias = layers.Embedding(max_movie_index+1, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            user_bias = self.user_bias(inputs[:, 0])
            movie_vector = self.movie_embedding(inputs[:, 1])
            movie_bias = self.movie_bias(inputs[:, 1])
            dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
            # Add all the components (including bias)
            x = dot_user_movie + user_bias + movie_bias
            # The sigmoid activation forces the rating to between 0 and 1
            return tf.nn.sigmoid(x)


model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
    )
ratings = model.predict(that).flatten()
movies_not_watched.loc[:,'prediction'] = ratings

#top_ratings_indices = ratings.argsort()[-10:][::-1] #เอา index มาผิด
highest_score = ratings[ratings.argsort()[-10:]][::-1] #เอาค่ามาถึงจะถูก

#movies_not_watched.loc[movies_not_watched['index'].isin(top_ratings_indices)]
movies_not_watched.loc[movies_not_watched.loc[:,'prediction'].isin(highest_score)] ##wuuuhuuuuwww

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Number of users: 610, Number of Movies: 9721, Min rating: 0.5, Max rating: 5.0


Unnamed: 0,index,movieId,title,genres,fake_id,prediction
869,869,1150,"Return of Martin Guerre, The (Retour de Martin...",Drama,1,0.544616
872,872,1161,"Tin Drum, The (Blechtrommel, Die) (1979)",Drama|War,1,0.544082
887,887,1184,Mediterraneo (1991),Comedy|Drama,1,0.54353
5189,5189,8446,Sands of Iwo Jima (1949),Action|Drama|Romance|War,1,0.546629
5197,5197,8461,Dragon Seed (1944),Drama|War,1,0.546461
5201,5201,8465,Johnny Eager (1942),Crime|Drama|Film-Noir|Romance,1,0.544484
5202,5202,8477,"Jetée, La (1962)",Romance|Sci-Fi,1,0.54421
5207,5207,8491,White Heat (1949),Crime|Drama|Film-Noir,1,0.545103
5210,5210,8494,"Cincinnati Kid, The (1965)",Drama,1,0.545325
5212,5212,8500,Godzilla vs. Mechagodzilla (Gojira tai Mekagoj...,Action|Horror|Sci-Fi,1,0.544298


In [9]:
ratings.max()

0.54662895

In [None]:
watched_movie_id_list = ['70286', '109487', '589']


In [12]:
all_movies

Unnamed: 0,index,movieId,title,genres
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
9737,9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,9739,193585,Flint (2017),Drama
9740,9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [17]:
all_movies.loc[all_movies['movieId'] == 109487]

Unnamed: 0,index,movieId,title,genres
8376,8376,109487,Interstellar (2014),Sci-Fi|IMAX


In [18]:
all_movies.loc[all_movies['movieId'] == 70286]

Unnamed: 0,index,movieId,title,genres
7090,7090,70286,District 9 (2009),Mystery|Sci-Fi|Thriller


In [None]:
อะ คือตรงนี้ ลิสท์ที่กลับมาจาก user input มันให้เลือกด้วยลิสท์ all_movies เพราะจะให้ไปเลือกจาก all_ratings ก็คงประหลาด
ถึงแม้ว่า movieId ใน all_ratings ก็มีเหมือนกัน แต่มันไม่ต่อกัน คือเลขอินเด็กซ์มันไม่สะท้อนถึง unique list
ที่มีจำนวนหนังทั้งหมด 9742 เรื่อง

มันเป็นการสร้างความสัมพันธ์ จะนึกเป็นก้อนคิวบ์สามมิติสองก้อน แล้วมีเส้นๆเชื่อมกันเป็นเน็ตเวิร์คก็ได้
แล้วแต่ละจุด ก็พล็อตด้วยเลขเว็คเตอร์ ซึ่งก็คือ embedding นั่นเอง
แล้วเส้นความสัมพันธ์ก็คือ weight
ว่าได้เท่าไหร่ ห่างจาก 0.5 มากน้อยแค่ไหน

เอา 10 อันดับที่ใกล้กับ 0.5 มากสุด
คือก็คือ ย้อนไปห้า เดินหน้าไปห้า ก็ได้เป็น
0.495 ถึง 0.505

ทำไมต้องเป็น 0.5
ก็คือระยะห่างระหว่าง 2 จุด
คือจุดว่า user คนนี้ให้ rating หนังเรื่องนี้เท่านี้ เต็ม 5.0
แล้ว user ของเราคนนี้จะให้เท่าไหร่

ยังไงก็ต้องมี user embedding ด้วย
เพราะมัน predict against person/identity

In [None]:
เสร็จก็เลยไปเพิ่มคอลัมน์ not_all_ratings['index'] เพื่อจะได้เรียกหนังเรื่องนึงได้ ตาม index ของมัน
แล้วก็ใส่ not_all_ratings['user'] ด้วย
แต่มันมีอยู่แล้วนี่หว่า??
แต่มันเริ่มต้นที่ 0-609 เลยต้องไป encode รอบนึงก่อน? ไรเงี้ยเหรอ?

เสร็จก็เอา 

In [19]:
import numpy as np

movie_id_unique = 'SELECT * FROM movies'
all_movies = pd.read_sql(movie_id_unique, engine)

#remove the input movies from all_movies
movies_not_watched = all_movies[~all_movies['movieId'].isin(watched_movie_id_list)]
movies_not_watched.loc[:,'fake_id'] = np.ones(len(movies_not_watched), dtype =int)

#get all_ratings from sqlite
query = 'SELECT "userId", ratings."movieId", movies.title, rating FROM ratings JOIN movies ON ratings."movieId" = movies."movieId";'
all_ratings = pd.read_sql(query, engine)

#remove the watched movies from all_ratings
not_all_ratings = all_ratings[~all_ratings['movieId'].isin(watched_movie_id_list)]

#remove movieId and ratings of the watched movies???
movieindex = not_all_ratings['movieId'].unique().tolist()
dl_movie2movie_encoded = {x: i for i, x in enumerate(movieindex)}
dl_movie_encoded2movie = {i: x for i, x in enumerate(movieindex)}

not_all_ratings.loc[:,"movie"] = not_all_ratings["movieId"].map(dl_movie2movie_encoded)
not_all_ratings.loc[:,"rating"] = not_all_ratings["rating"].values.astype(np.float32)

#map userId from ?? to not_all_ratings ?? what do i do with the users??

not_all_user_ids = not_all_ratings["userId"].unique().tolist()
dl_user2user_encoded = {x: i for i, x in enumerate(not_all_user_ids)}
dl_userencoded2user = {i: x for i, x in enumerate(not_all_user_ids)}

not_all_ratings.loc[:,"user"] = not_all_ratings["userId"].map(dl_user2user_encoded)

min_rating = min(not_all_ratings["rating"])
max_rating = max(not_all_ratings["rating"])
num_users = len(dl_user2user_encoded)
num_movies = len(dl_movie_encoded2movie)
print(
        "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
            num_users, num_movies, min_rating, max_rating
        )
    )

#define training data
df = not_all_ratings.sample(frac=1, random_state=42)
x = not_all_ratings[["user", "movie"]].values

    # Normalize the targets between 0 and 1. Makes it easy to train.
y = not_all_ratings["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
    # Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
        x[:train_indices],
        x[train_indices:],
        y[:train_indices],
        y[train_indices:],
    )

#inference
user_movie_array = movies_not_watched[['fake_id','index']]
max_movie_index = user_movie_array['index'].max()
that = user_movie_array.to_numpy()

EMBEDDING_SIZE = 50

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class RecommenderNet(keras.Model):
        def __init__(self, num_users, num_movies, embedding_size, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.num_users = num_users
            self.num_movies = num_movies
            self.embedding_size = embedding_size
            self.user_embedding = layers.Embedding(
                num_users,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            self.user_bias = layers.Embedding(num_users, 1)
            self.movie_embedding = layers.Embedding(
            #change this line from num_movies to max_movie_index+1
                max_movie_index+1,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            #and this line (input_dim)
            self.movie_bias = layers.Embedding(max_movie_index+1, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            user_bias = self.user_bias(inputs[:, 0])
            movie_vector = self.movie_embedding(inputs[:, 1])
            movie_bias = self.movie_bias(inputs[:, 1])
            dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
            # Add all the components (including bias)
            x = dot_user_movie + user_bias + movie_bias
            # The sigmoid activation forces the rating to between 0 and 1
            return tf.nn.sigmoid(x)


model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
    )
ratings = model.predict(that).flatten()
movies_not_watched.loc[:,'prediction'] = ratings
movies_not_watched.drop(columns='fake_id')

#top_ratings_indices = ratings.argsort()[-10:][::-1] #เอา index มาผิด
highest_score = ratings[ratings.argsort()[-10:]][::-1] #เอาค่ามาถึงจะถูก

#movies_not_watched.loc[movies_not_watched['index'].isin(top_ratings_indices)]
movies_not_watched.loc[movies_not_watched.loc[:,'prediction'].isin(highest_score)] ##wuuuhuuuuwww

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Number of users: 610, Number of Movies: 9721, Min rating: 0.5, Max rating: 5.0


Unnamed: 0,index,movieId,title,genres,fake_id,prediction
259,259,299,Priest (1994),Drama,1,0.536291
270,270,311,Relative Fear (1994),Horror|Thriller,1,0.534351
283,283,325,National Lampoon's Senior Trip (1995),Comedy,1,0.536092
285,285,327,Tank Girl (1995),Action|Comedy|Sci-Fi,1,0.535003
287,287,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi,1,0.534206
3531,3531,4826,"Big Red One, The (1980)",Action|Adventure|Drama|War,1,0.538014
3535,3535,4831,Can't Stop the Music (1980),Comedy|Musical,1,0.538257
3537,3537,4835,Coal Miner's Daughter (1980),Drama,1,0.534381
3546,3546,4850,Spriggan (Supurigan) (1998),Action|Animation|Sci-Fi,1,0.534627
3549,3549,4857,Fiddler on the Roof (1971),Drama|Musical,1,0.537654
