In [1]:
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import create_engine
import os
import pandas as pd

app = Flask(__name__)

engine = create_engine('sqlite:///recommender2.db', echo=False)

for f in os.listdir('data/movies/ml-latest-small'):
    if f[-4:] == '.csv':
        data = pd.read_csv(f'data/movies/ml-latest-small/{f}')
        data.to_sql(f[:-4], engine)
        print(f[0:-4])

links
ratings
movies
tags


In [3]:
watched_movie_id_list = ['70286', '109487', '589']


In [4]:
import numpy as np

movie_id_unique = 'SELECT * FROM movies'
all_movies = pd.read_sql(movie_id_unique, engine)

#remove the input movies from all_movies
movies_not_watched = all_movies[~all_movies['movieId'].isin(watched_movie_id_list)]
movies_not_watched.loc[:,'fake_id'] = np.ones(len(movies_not_watched), dtype =int)

#get all_ratings from sqlite
query = 'SELECT "userId", ratings."movieId", movies.title, rating FROM ratings JOIN movies ON ratings."movieId" = movies."movieId";'
all_ratings = pd.read_sql(query, engine)

#remove the watched movies from all_ratings
not_all_ratings = all_ratings[~all_ratings['movieId'].isin(watched_movie_id_list)]

#remove movieId and ratings of the watched movies???
movieindex = not_all_ratings['movieId'].unique().tolist()
dl_movie2movie_encoded = {x: i for i, x in enumerate(movieindex)}
dl_movie_encoded2movie = {i: x for i, x in enumerate(movieindex)}

not_all_ratings.loc[:,"movie"] = not_all_ratings["movieId"].map(dl_movie2movie_encoded)
not_all_ratings.loc[:,"rating"] = not_all_ratings["rating"].values.astype(np.float32)

#map userId from ?? to not_all_ratings ?? what do i do with the users??

not_all_user_ids = not_all_ratings["userId"].unique().tolist()
dl_user2user_encoded = {x: i for i, x in enumerate(not_all_user_ids)}
dl_userencoded2user = {i: x for i, x in enumerate(not_all_user_ids)}

not_all_ratings.loc[:,"user"] = not_all_ratings["userId"].map(dl_user2user_encoded)

min_rating = min(not_all_ratings["rating"])
max_rating = max(not_all_ratings["rating"])
num_users = len(dl_user2user_encoded)
num_movies = len(dl_movie_encoded2movie)
print(
        "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
            num_users, num_movies, min_rating, max_rating
        )
    )

#define training data
df = not_all_ratings.sample(frac=1, random_state=42)
x = not_all_ratings[["user", "movie"]].values

    # Normalize the targets between 0 and 1. Makes it easy to train.
y = not_all_ratings["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
    # Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
        x[:train_indices],
        x[train_indices:],
        y[:train_indices],
        y[train_indices:],
    )

#inference
user_movie_array = movies_not_watched[['fake_id','index']]
max_movie_index = user_movie_array['index'].max()
that = user_movie_array.to_numpy()

EMBEDDING_SIZE = 50

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class RecommenderNet(keras.Model):
        def __init__(self, num_users, num_movies, embedding_size, **kwargs):
            super(RecommenderNet, self).__init__(**kwargs)
            self.num_users = num_users
            self.num_movies = num_movies
            self.embedding_size = embedding_size
            self.user_embedding = layers.Embedding(
                num_users,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            self.user_bias = layers.Embedding(num_users, 1)
            self.movie_embedding = layers.Embedding(
            #change this line from num_movies to max_movie_index+1
                max_movie_index+1,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )
            #and this line (input_dim)
            self.movie_bias = layers.Embedding(max_movie_index+1, 1)

        def call(self, inputs):
            user_vector = self.user_embedding(inputs[:, 0])
            user_bias = self.user_bias(inputs[:, 0])
            movie_vector = self.movie_embedding(inputs[:, 1])
            movie_bias = self.movie_bias(inputs[:, 1])
            dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
            # Add all the components (including bias)
            x = dot_user_movie + user_bias + movie_bias
            # The sigmoid activation forces the rating to between 0 and 1
            return tf.nn.sigmoid(x)


model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
    )
ratings = model.predict(that).flatten()

top_ratings_indices = ratings.argsort()[-10:][::-1]
movies_not_watched.loc[movies_not_watched['index'].isin(top_ratings_indices)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Number of users: 610, Number of Movies: 9721, Min rating: 0.5, Max rating: 5.0


Unnamed: 0,index,movieId,title,genres,fake_id
3169,3169,4265,Driven (2001),Action|Thriller,1
3185,3185,4293,Summer Rental (1985),Comedy,1
3193,3193,4305,Angel Eyes (2001),Romance|Thriller,1
3952,3952,5569,"Last House on the Left, The (1972)",Crime|Horror|Thriller,1
3967,3967,5591,Monkey Trouble (1994),Children|Comedy,1
4821,4821,7184,This Property is Condemned (1966),Drama|Romance,1
6688,6688,58156,Semi-Pro (2008),Comedy,1
6703,6703,58347,Penelope (2006),Comedy|Fantasy|Romance,1
8106,8106,100553,Frozen Planet (2011),Documentary,1
8126,8126,101362,Olympus Has Fallen (2013),Action|Thriller,1


In [5]:
top_ratings_indices

array([8126, 8106, 3169, 3185, 3967, 3193, 4821, 6703, 6688, 3952])

In [None]:
อะ คือตรงนี้ ลิสท์ที่กลับมาจาก user input มันให้เลือกด้วยลิสท์ all_movies เพราะจะให้ไปเลือกจาก all_ratings ก็คงประหลาด
ถึงแม้ว่า movieId ใน all_ratings ก็มีเหมือนกัน แต่มันไม่ต่อกัน คือเลขอินเด็กซ์มันไม่สะท้อนถึง unique list
ที่มีจำนวนหนังทั้งหมด 9742 เรื่อง

มันเป็นการสร้างความสัมพันธ์ จะนึกเป็นก้อนคิวบ์สามมิติสองก้อน แล้วมีเส้นๆเชื่อมกันเป็นเน็ตเวิร์คก็ได้
แล้วแต่ละจุด ก็พล็อตด้วยเลขเว็คเตอร์ ซึ่งก็คือ embedding นั่นเอง
แล้วเส้นความสัมพันธ์ก็คือ weight
ว่าได้เท่าไหร่ ห่างจาก 0.5 มากน้อยแค่ไหน

เอา 10 อันดับที่ใกล้กับ 0.5 มากสุด
คือก็คือ ย้อนไปห้า เดินหน้าไปห้า ก็ได้เป็น
0.495 ถึง 0.505

ทำไมต้องเป็น 0.5
ก็คือระยะห่างระหว่าง 2 จุด
คือจุดว่า user คนนี้ให้ rating หนังเรื่องนี้เท่านี้ เต็ม 5.0
แล้ว user ของเราคนนี้จะให้เท่าไหร่

ยังไงก็ต้องมี user embedding ด้วย
เพราะมัน predict against person/identity

In [None]:
เสร็จก็เลยไปเพิ่มคอลัมน์ not_all_ratings['index'] เพื่อจะได้เรียกหนังเรื่องนึงได้ ตาม index ของมัน
แล้วก็ใส่ not_all_ratings['user'] ด้วย
แต่มันมีอยู่แล้วนี่หว่า??
แต่มันเริ่มต้นที่ 0-609 เลยต้องไป encode รอบนึงก่อน? ไรเงี้ยเหรอ?

เสร็จก็เอา 

In [10]:
not_all_ratings[80000:80020]

Unnamed: 0,userId,movieId,title,rating
80294,506,41571,Memoirs of a Geisha (2005),3.0
80295,506,45447,"Da Vinci Code, The (2006)",0.5
80296,506,45720,"Devil Wears Prada, The (2006)",3.0
80297,506,45880,Marie Antoinette (2006),3.0
80298,506,47629,The Queen (2006),4.0
80299,506,48997,Perfume: The Story of a Murderer (2006),3.5
80300,506,49286,"Holiday, The (2006)",1.0
80301,506,51834,Becoming Jane (2007),3.0
80302,506,52973,Knocked Up (2007),3.5
80303,506,52975,Hairspray (2007),3.0


In [14]:
not_all_ratings['movieId'] ไม่ซ้ำกันเลยซักกะบรรทัดหรือเปล่า ถึงแม้ว่าจะเป็นหนังเรื่องเดียวกัน
ไม่หนิ
สำหรับหนังเรื่อง the Davinci Code ทุกบรรทัดเป็น 45447

KeyError: "None of [Index(['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)',\n       'Seven (a.k.a. Se7en) (1995)', 'Usual Suspects, The (1995)',\n       'From Dusk Till Dawn (1996)', 'Bottle Rocket (1996)',\n       'Braveheart (1995)', 'Rob Roy (1995)', 'Canadian Bacon (1995)',\n       ...\n       'The Magnificent Seven (2016)', 'Blair Witch (2016)', '31 (2016)',\n       'Arrival (2016)', 'Rogue One: A Star Wars Story (2016)', 'Split (2017)',\n       'John Wick: Chapter Two (2017)', 'Get Out (2017)', 'Logan (2017)',\n       'The Fate of the Furious (2017)'],\n      dtype='object', length=100474)] are in the [index]"

In [15]:
not_all_ratings.loc[not_all_ratings['movieId'] == 45447]

Unnamed: 0,userId,movieId,title,rating
3434,21,45447,"Da Vinci Code, The (2006)",3.5
7828,52,45447,"Da Vinci Code, The (2006)",3.5
8932,62,45447,"Da Vinci Code, The (2006)",3.5
9943,64,45447,"Da Vinci Code, The (2006)",2.5
16592,105,45447,"Da Vinci Code, The (2006)",4.0
17536,111,45447,"Da Vinci Code, The (2006)",4.0
25535,177,45447,"Da Vinci Code, The (2006)",3.5
30362,212,45447,"Da Vinci Code, The (2006)",3.5
31976,219,45447,"Da Vinci Code, The (2006)",2.5
32760,222,45447,"Da Vinci Code, The (2006)",3.0


In [10]:
user_movie_ratings = pd.pivot_table(all_ratings, values='rating', index='userId', columns='movieId')
user_movie_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [11]:
user_movie_id_ratings_matrix = user_movie_ratings.fillna(0)
user_movie_id_ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
number_of_genres = 10
m = NMF(n_components=number_of_genres)
m.fit(user_movie_id_ratings_matrix)

NMF(n_components=10)