In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/00 eCommerce/

[Errno 2] No such file or directory: 'drive/MyDrive/00 eCommerce/'
/content/drive/MyDrive/00 eCommerce


In [4]:
data_path = 'ml-100k'

> # **eCommerce - Recommendation Systems**
**Table of contents**
*   Part A - Data Analysis
  *   Importing Libraries
  *   Reading and Exploring the data
    *   Data Overview
    *   Data pre-processing
    *   Data Visualization
*   Part B - Non-Personal Recommendation
  *   Modeling
  *   Evaluation
*   Part C - Personal Recommendation
  *   3 Turi Create
  *   4 Neural Collaborative Filtering
  *   5 DeepCTR


![](https://bobliu.io/assets/img/cards.509a5045.jpg)

# **Part A - Exploring the Data**

In [5]:
!pip install deepctr

Collecting deepctr
[?25l  Downloading https://files.pythonhosted.org/packages/e1/23/a0c89b3a1631f8017dde94ee096db6ba14dfe0c996df8d5a0bdfb795ca54/deepctr-0.8.5-py3-none-any.whl (116kB)
[K     |████████████████████████████████| 122kB 5.3MB/s 
Installing collected packages: deepctr
Successfully installed deepctr-0.8.5


### **1. Importing Various Modules**


In [6]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
from pylab import rcParams
 
# Load the TensorBoard notebook extension
%load_ext tensorboard

# configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

#dl libraraies
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input, Dense, Flatten, Dropout
from keras.layers.merge import Dot, multiply, concatenate
from keras.layers import merge
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from sklearn.metrics import mean_absolute_error
from keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau

from deepctr.models import WDL
from deepctr.feature_column import SparseFeat, get_feature_names

# specifically for deeplearning.
import random as rn
from IPython.display import SVG
 
# specifically for manipulating zipped images and getting numpy arrays of pixel values of images.
import cv2
import numpy as np
from tqdm import tqdm
import os       
from random import shuffle
from zipfile import ZipFile
from PIL import Image

## **2. Reading and Exploring the data**

### **2.1 Data Overview**

GroupLens Research has collected and made available rating data sets from the MovieLens web site (https://movielens.org). The data sets were collected over various periods of time, depending on the size of the set. Before using these data sets, please review their README files for the usage licenses and other details.


**MovieLens 100K Dataset**

MovieLens 100K movie ratings. Stable benchmark dataset. 100,000 ratings from 1000 users on 1700 movies. Released 4/1998.

* [README.txt](https://files.grouplens.org/datasets/movielens/ml-100k-README.txt)
* [ml-100k.zip](https://files.grouplens.org/datasets/movielens/ml-100k.zip) (size: 5 MB, checksum)
* [Index of unzipped files](https://files.grouplens.org/datasets/movielens/ml-100k/)

Permalink: https://grouplens.org/datasets/movielens/100k/

### **2.2 Data pre-processing**

In [7]:
# Column names for the tab-separated rating files (the files carry no header row).
names = ["user_id", "movie_id", "rating", "unix_timestamp"]

# u1.base is used as the training split, u1.test as the validation split.
ratings = pd.read_csv(f"{data_path}/u1.base", sep='\t', names=names, encoding='latin-1')
ratings_val = pd.read_csv(f"{data_path}/u1.test", sep='\t', names=names, encoding='latin-1')

FileNotFoundError: ignored

In [None]:
ratings

In [None]:
# Only the first three pipe-separated fields of u.item are read.
m_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv(
    f"{data_path}/u.item",
    sep='|',
    names=m_cols,
    usecols=[0, 1, 2],
    encoding='latin-1',
)

In [None]:
movies

In [None]:
# Attach movie metadata to every rating row of the training and validation splits.
movie_ratings = movies.merge(ratings, on='movie_id')
movie_ratings_val = movies.merge(ratings_val, on='movie_id')

In [None]:
movie_ratings

In [None]:
# Mean rating per movie, kept as a flat frame with columns movie_id and rating.
movie_stats = movie_ratings.groupby('movie_id', as_index=False).agg({'rating': 'mean'})

In [None]:
# Rank movies by mean rating and look up the three best-rated ones in the movie table.
ratings_sorted = movie_stats.sort_values(by='rating', ascending=False)
top3 = movies.merge(ratings_sorted.iloc[:3], on='movie_id')
print('TOP3 Titles: ' + str(top3.title))

In [None]:
# Column names for the pipe-separated user file (no header row).
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(f"{data_path}/u.user", sep='|', names=u_cols, encoding='latin-1')

### **2.3 Data Analysis**

In [None]:
# One row per rating, enriched with both movie and user attributes, for each split.
movielens = movie_ratings.merge(users, on='user_id')
movielens_val = movie_ratings_val.merge(users, on='user_id')
movielens

In [None]:
# Mean rating per movie (rows) split by the values of the gender column (columns).
ratings_by_gender = pd.pivot_table(
    movielens,
    values='rating',
    index=['movie_id'],
    columns='gender',
    aggfunc='mean',
)
ratings_by_gender

**TOP3 Male Titles:**

In [None]:
# Movies ordered by the 'M' column, best first; the first three are joined to their titles.
male_top_ratings = ratings_by_gender.sort_values(by='M', ascending=False)
top3_male = movies.merge(male_top_ratings.iloc[:3], on='movie_id')
top3_male['title'].tolist()

**TOP3 Female Titles**

In [None]:
# Movies ordered by the 'F' column, best first; the first three are joined to their titles.
female_top_ratings = ratings_by_gender.sort_values(by='F', ascending=False)
top3_female = movies.merge(female_top_ratings.iloc[:3], on='movie_id')
top3_female['title'].tolist()

In [None]:
# Per-movie gap between the 'M' and 'F' mean ratings; movies missing either value are dropped.
gender_gap = (male_top_ratings['M'] - female_top_ratings['F']).dropna()

# Store the absolute gap next to the per-gender means, then attach titles and rank by gap size.
ratings_by_gender['difference'] = gender_gap.abs()
diff = (
    pd.merge(ratings_by_gender, movies, on='movie_id')
    .sort_values(by='difference', ascending=False)
)

### **2.4 Data Visualization**

In [None]:
# A.1 Histogram - Average rating for movies
plt.hist(movie_stats['rating'], rwidth=0.7, orientation='horizontal',color='orange', bins=5)
plt.yticks(range(1,6), labels=range(1,6))
plt.title("Total Ratings")
plt.ylabel("Average Rating")
plt.xlabel("Amount of Movies")

In [None]:
# Distribution of per-movie mean ratings in the 'M' column (movies without a value are skipped).
fig, ax = plt.subplots()
ax.hist(male_top_ratings['M'].dropna(), bins=5, rwidth=0.7, orientation='horizontal', color='skyblue')
ax.set_yticks(range(1, 6))
ax.set_title("Male Rating")
ax.set_ylabel("Average Rating")
ax.set_xlabel("Amount of Movies")

In [None]:
# Distribution of per-movie mean ratings in the 'F' column (movies without a value are skipped).
fig, ax = plt.subplots()
ax.hist(female_top_ratings['F'].dropna(), bins=5, rwidth=0.7, orientation='horizontal', color='pink')
ax.set_yticks(range(1, 6))
ax.set_title("Female Rating")
ax.set_ylabel("Average Rating")
ax.set_xlabel("Amount of Movies")

In [None]:
# Overlay of the two per-gender distributions of mean movie ratings.
# Both histograms share the same bin edges (range 1-5) and are semi-transparent so that
# neither series hides the other; the legend identifies which colour is which.
fig, ax = plt.subplots()
shared_hist_style = dict(bins=5, range=(1, 5), rwidth=0.7, orientation='horizontal', alpha=0.6)
ax.hist(male_top_ratings['M'].dropna(), color='skyblue', label='Male', **shared_hist_style)
ax.hist(female_top_ratings['F'].dropna(), color='pink', label='Female', **shared_hist_style)
ax.set_yticks(range(1, 6))
# The axis shows average ratings (not a difference), so it is labelled accordingly.
ax.set_title("Average Rating Distribution by Gender")
ax.set_ylabel("Average Rating")
ax.set_xlabel("Amount of Movies")
ax.legend()

**TOP3 Rating Difference between Genders:**

In [None]:
# Three movies with the largest absolute gap between the male and female mean rating.
top3_diff = diff[['title','difference']].head(3)

# Styler.hide_index() was deprecated in favour of Styler.hide(axis="index") and is absent
# from newer pandas releases; use whichever method the installed version provides.
top3_styler = top3_diff.style
top3_styler.hide(axis="index") if hasattr(top3_styler, "hide") else top3_styler.hide_index()

TODO **1c**

In [None]:
###TODO###

In [None]:
###TODO###

**1d**

In [None]:
# Per-movie mean rating and number of ratings, combined into one frame keyed by movie_id.
ratings_per_movie = movie_ratings.groupby('movie_id', as_index=False)['rating']
movie_stats0 = ratings_per_movie.mean()
movie_stats1 = ratings_per_movie.count().rename(columns={'rating': 'num_of_ratings'})
movie_pop = movie_stats0.merge(movie_stats1, on='movie_id')

In [None]:
# Popularity = (mean rating x number of ratings) divided by (number of users x 5),
# i.e. the score a movie would reach if every user had rated it 5.
max_total_score = users.shape[0] * 5
movie_pop['popularity'] = (movie_pop['rating'] * movie_pop['num_of_ratings']) / max_total_score
sorted_by_pop = movie_pop.sort_values(by='popularity', ascending=False)

In [None]:
# Attach titles to the popularity ranking and show the columns of interest.
popularity_with_names = sorted_by_pop.merge(movies, on='movie_id', how='inner')
popularity_with_names.loc[:, ['movie_id', 'title', 'popularity', 'rating', 'num_of_ratings']]

**1e** - Data Sparsity and Average Ratings per User

In [None]:
# Density  = observed ratings / all possible (user, movie) pairs.
# Sparsity = 1 - density, i.e. the share of the user-movie matrix that is empty.
# `movielens` is built from the u1.base split only, so this describes the training data.
n_possible_pairs = users.shape[0] * movies.shape[0]
density = movielens.shape[0] / n_possible_pairs
sparsity = 1 - density

# Legacy name from the original cell; it has always held the density value.
sparity = density

print(f'Data Sparsity is {sparsity*100:0.2f}% (density: {density*100:0.2f}%)')

In [None]:
# Number of ratings contributed by each user, then the average of those counts.
user_stats = movielens.groupby('user_id', as_index=False).agg({'rating': 'count'})
apu = user_stats['rating'].mean()
print('Average Ratings per User is {:0.2f} Reviews'.format(apu))

# **Part B** - Non-Personalized Approach
<span style="font-size:1px;">
הודרך אודות הנחיות לאחר חיסון קורונה והמתנה של 15 דקות במתחם.
ת"ל לאחר החיסון הראשון:
כאב אזורי קל באזור ההזרקה
עייפות וחולשה כללית</span>




### **Modeling**

In [None]:
# Non-personalized baseline: predict every movie's rating as its mean rating.
# The means are taken from the TRAINING split (movie_ratings). Computing them from the
# validation split would leak the very ratings the baseline is scored on afterwards.
movie_val_pred = movie_ratings.groupby('movie_id', as_index=False)['rating'].mean()
movie_val_pred

In [None]:
# Pair each validation rating with the per-movie prediction. Both frames have a `rating`
# column, so the merge yields rating_x (from movielens_val) and rating_y (from movie_val_pred).
results = pd.merge(movielens_val, movie_val_pred, on='movie_id', how='inner')

### **Evaluation**

In [None]:
# rating_x is the observed validation rating (y_true); rating_y is the per-movie prediction
# (y_pred). Passing them in (y_true, y_pred) order matches the gender cells further down.
mae = mean_absolute_error(results.rating_x, results.rating_y)
# RMSE via sqrt(MSE): the `squared=False` shortcut is deprecated in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(results.rating_x, results.rating_y))

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both arguments are converted to NumPy arrays; the absolute error is divided
    element-wise by ``y_true`` before averaging and scaling by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return relative_error.mean() * 100
# MAPE divides by its first argument, so the observed rating (rating_x) must be passed as
# y_true; passing the prediction first would normalise the error by the prediction instead.
mape = mean_absolute_percentage_error(results.rating_x, results.rating_y)
npm = pd.DataFrame([['MAE', mae], ['RMSE', rmse], ['MAPE', mape]], columns = ['metric', 'result'])
npm

### **Gender Analysis**

In [None]:
# Rebind instead of mutating in place, so the frame displayed by earlier cells is not
# silently altered and re-running this cell is harmless.
# NOTE(review): a movie with no rating from one gender gets a predicted rating of 0 here;
# confirm that this is the intended fallback for the evaluation below.
ratings_by_gender = ratings_by_gender.fillna(0)

In [None]:
# Pair each validation rating with the per-gender mean ratings of its movie.
results = pd.merge(movielens_val, ratings_by_gender, on='movie_id', how='inner')

Male

In [None]:
# Score the male per-movie mean (column M) as a predictor of the observed validation rating.
# NOTE(review): `results` is not filtered by gender, so ratings from all users are scored
# against the male mean — confirm this is intended.
mae = mean_absolute_error(results.rating, results.M)
# RMSE via sqrt(MSE): the `squared=False` shortcut is deprecated in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(results.rating, results.M))

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both arguments are converted to NumPy arrays; the absolute error is divided
    element-wise by ``y_true`` before averaging and scaling by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return relative_error.mean() * 100
# Collect the three error metrics for the male-mean predictor into a small table.
mape = mean_absolute_percentage_error(results['rating'], results['M'])
male = pd.DataFrame({'metric': ['MAE', 'RMSE', 'MAPE'], 'result': [mae, rmse, mape]})
male

Female

In [None]:
# Score the female per-movie mean (column F) as a predictor of the observed validation rating.
# NOTE(review): `results` is not filtered by gender, so ratings from all users are scored
# against the female mean — confirm this is intended.
mae = mean_absolute_error(results.rating, results.F)
# RMSE via sqrt(MSE): the `squared=False` shortcut is deprecated in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(results.rating, results.F))

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both arguments are converted to NumPy arrays; the absolute error is divided
    element-wise by ``y_true`` before averaging and scaling by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return relative_error.mean() * 100
# Collect the three error metrics for the female-mean predictor into a small table.
mape = mean_absolute_percentage_error(results['rating'], results['F'])
female = pd.DataFrame({'metric': ['MAE', 'RMSE', 'MAPE'], 'result': [mae, rmse, mape]})
female

In [None]:
# Side-by-side comparison of the general, male-mean and female-mean predictors, one row per metric.
res = npm.merge(male, on='metric').merge(female, on='metric')
res.columns = ['metric', 'general', 'male', 'female']
res

The **male**-average predictor scores better than the female-average one, but the general (all-users) predictor is better than both. For all three metrics, a lower value means a better result.

# **Part C** - Personality-Based Approach

Recommender systems usually make use of either or both collaborative filtering and content-based filtering (also known as the personality-based approach), as well as other systems such as knowledge-based systems. Collaborative filtering approaches build a model from a user's past behavior (items previously purchased or selected and/or numerical ratings given to those items) as well as similar decisions made by other users. This model is then used to predict items (or ratings for items) that the user may have an interest in. Content-based filtering approaches utilize a series of discrete, pre-tagged characteristics of an item in order to recommend additional items with similar properties. Current recommender systems typically combine one or more approaches into a hybrid system.



## **4 Neural Collaborative Filtering**
![](https://miro.medium.com/max/1952/1*aP-Mx266ExwoWZPSdHtYpA.png)

Neural Collaborative Filtering (NCF) is a well known recommendation algorithm that generalizes the matrix factorization problem with multi-layer perceptron.

This notebook provides an example of how to utilize and evaluate NCF implementation in the reco_utils. We use a smaller dataset in this example to run NCF efficiently with GPU acceleration on a Data Science Virtual Machine.

In [None]:
def plot_model_loss(history):
  """Plot the training and validation loss curves of a fitted model.

  `history` must expose a `history` dict containing the keys 'loss' and
  'val_loss' (as the objects returned by `fit` in this notebook do). The
  training loss is drawn in green and the validation loss in blue. As a side
  effect the global matplotlib figure size is set to 10x5.
  """
  rcParams['figure.figsize'] = (10, 5)
  curves = (('loss', 'g'), ('val_loss', 'b'))
  for metric_key, line_format in curves:
    plt.plot(history.history[metric_key], line_format)
  plt.title('model loss')
  plt.xlabel('epoch')
  plt.ylabel('loss')
  plt.legend(['train', 'test'], loc='upper left')
  plt.grid(True)
  plt.show()

In [None]:
# Replace the raw ids with contiguous category codes (0..n-1) so they can index embeddings.
# This overwrites the id columns of `ratings`.
for id_column in ('user_id', 'movie_id'):
    ratings[id_column] = ratings[id_column].astype('category').cat.codes.values

In [None]:
# Width of every user/movie embedding vector (a hyperparameter to tune).
dim_embedddings = 30
# Number of distinct movies and users present in the training ratings.
num_movies = ratings.movie_id.nunique()
num_users = ratings.user_id.nunique()

#### 4.1 **Model 1**

In [None]:
# Movie branch: one integer id per sample, looked up in a trainable embedding table.
m_inputs = Input(shape=(1,), dtype='int32')
m = Embedding(input_dim=num_movies + 1, output_dim=dim_embedddings, name="movie")(m_inputs)

In [None]:
# User branch: one integer id per sample, looked up in a trainable embedding table.
u_inputs = Input(shape=(1,), dtype='int32')
u = Embedding(input_dim=num_users + 1, output_dim=dim_embedddings, name="user")(u_inputs)

**Matrix Factorization**

Here comes the main part!!!
Now we move on to the crux of the notebook, i.e. Matrix Factorization. In matrix factorization, we break a matrix into (usually) two smaller matrices, each with smaller dimensions. These matrices are often called 'Embeddings'. There are several variants of Matrix Factorization, such as 'Low Rank MF', 'Non-Negative MF' (NMF) and so on.

In [None]:
# Element-wise product of the movie and user embeddings, followed by dropout, flattening
# and a single linear output unit that produces the predicted rating.
o = multiply([m, u])
for layer in (Dropout(rate=0.5), Flatten(), Dense(units=1)):
    o = layer(o)

In [None]:
# Assemble Model 1 from the movie and user id inputs and the output tensor `o`,
# then print its layer summary.
rec_model = Model(inputs=[m_inputs, u_inputs], outputs=o)
rec_model.summary()

In [None]:
# Model 1 is trained with mean absolute error as the loss and also reports it as a metric.
rec_model.compile(loss='mae', optimizer='adam', metrics=["mae"])

In [None]:
# Keras takes `validation_split` from the LAST rows of the arrays, before any shuffling.
# `ratings` is still in file order, so if the file is grouped by user (NOTE(review): confirm)
# the held-out tail would consist of users the model never trained on. Shuffle the rows
# with a fixed seed first so the validation rows are a random, reproducible sample.
ratings_shuffled = ratings.sample(frac=1.0, random_state=42)
history = rec_model.fit(
    [ratings_shuffled.movie_id.values, ratings_shuffled.user_id.values],
    ratings_shuffled.rating.values,
    epochs=10,
    verbose=2,
    validation_split=0.1,
)

In [None]:
plot_model_loss(history)

#### 4.2 **Model 2**

In [None]:
bias = 1

# Model 2 gets its own inputs and embeddings. Reusing the `m`/`u` tensors of Model 1 would
# share the (already trained) embedding weights between both models, so Model 2 would not
# start from scratch and the two loss curves could not be compared.
m2_inputs = Input(shape=(1,), dtype='int32')
u2_inputs = Input(shape=(1,), dtype='int32')
m2 = Embedding(num_movies + 1, dim_embedddings, name="movie2")(m2_inputs)
u2 = Embedding(num_users + 1, dim_embedddings, name="user2")(u2_inputs)

# One scalar bias per movie and per user, concatenated to the embedding product.
m_bias = Embedding(num_movies + 1, bias, name="moviebias")(m2_inputs)
u_bias = Embedding(num_users + 1, bias, name="userbias")(u2_inputs)

o = multiply([m2, u2])
o = concatenate([o, m_bias, u_bias])
o = Dropout(0.5)(o)
o = Flatten()(o)
o = Dense(1)(o)

rec_model2 = Model(inputs=[m2_inputs, u2_inputs], outputs=o)
rec_model2.compile(loss='mse', optimizer='adam', metrics=["mae"])

# `validation_split` holds out the last rows before shuffling; shuffle with a fixed seed
# so the held-out 10% is a random sample rather than the tail of the file order.
ratings_shuffled = ratings.sample(frac=1.0, random_state=42)
history = rec_model2.fit(
    [ratings_shuffled.movie_id.values, ratings_shuffled.user_id.values],
    ratings_shuffled.rating.values,
    epochs=10,
    verbose=2,
    validation_split=0.1,
)

plot_model_loss(history)

#### 4.3 **Model 3**

In [None]:
from datetime import datetime

from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# `pd.datetime` is a deprecated alias that newer pandas no longer provides; use the
# standard-library datetime to timestamp the TensorBoard run directory.
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")

tensor_board = TensorBoard(
    log_dir=log_dir, histogram_freq=0, write_graph=True,
    write_images=True, update_freq='epoch', profile_batch=2,
    embeddings_freq=0, embeddings_metadata=None
)

# Lower the learning rate when val_loss plateaus, stop early when val_mae stops improving,
# and log every run to TensorBoard.
callbacks = [
             ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_mae', min_delta=1e-6, patience=15),
             tensor_board
             ]

bias = 1

# Model 3 gets its own inputs and embeddings. Reusing the `m`/`u` tensors of the earlier
# models would share their already trained embedding weights, so this model would not
# start from scratch and its loss curve could not be compared with theirs.
m3_inputs = Input(shape=(1,), dtype='int32')
u3_inputs = Input(shape=(1,), dtype='int32')
m3 = Embedding(num_movies + 1, dim_embedddings, name="movie3")(m3_inputs)
u3 = Embedding(num_users + 1, dim_embedddings, name="user3")(u3_inputs)

# One scalar bias per movie and per user, concatenated to the embedding product.
m_bias = Embedding(num_movies + 1, bias, name="moviebias")(m3_inputs)
u_bias = Embedding(num_users + 1, bias, name="userbias")(u3_inputs)

o = multiply([m3, u3])
o = concatenate([o, m_bias, u_bias])
o = Dropout(0.5)(o)
o = Flatten()(o)
o = Dense(1)(o)

rec_model3 = Model(inputs=[m3_inputs, u3_inputs], outputs=o)
# `learning_rate` is the current keyword; `lr` is a deprecated alias.
rec_model3.compile(loss='mse', optimizer=Adagrad(learning_rate=5e-3), metrics=["mae"])

# `validation_split` holds out the last rows before shuffling; shuffle with a fixed seed
# so the held-out 10% is a random sample rather than the tail of the file order.
ratings_shuffled = ratings.sample(frac=1.0, random_state=42)
history = rec_model3.fit(
    [ratings_shuffled.movie_id.values, ratings_shuffled.user_id.values],
    ratings_shuffled.rating.values,
    epochs=20,
    verbose=2,
    validation_split=0.1,
    callbacks=callbacks,
)

plot_model_loss(history)

In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs/fit

## **5 DeepCTR**
![](https://repository-images.githubusercontent.com/106080065/373f5f00-42e7-11ea-9860-a981b5f8915a)

In [None]:
# Stack the training and validation frames row-wise into a single table for DeepCTR.
data = pd.concat((movielens, movielens_val))
# Every feature below is treated as a categorical (sparse) input; `rating` is the target.
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip_code"]
target = ['rating']

In [None]:
# Map every sparse feature to integer labels, overwriting the column in `data`.
for column in sparse_features:
    data[column] = LabelEncoder().fit_transform(data[column])

# One SparseFeat per column, sized by its number of distinct values; the same feature
# columns feed both the linear (wide) part and the DNN (deep) part.
fixlen_feature_columns = [
    SparseFeat(column, vocabulary_size=data[column].nunique())
    for column in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Fixed seed so the 80/20 split, and every metric derived from it, is reproducible
# across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)
# DeepCTR expects a dict that maps each feature name to its array of values.
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

![](https://1.bp.blogspot.com/-Dw1mB9am1l8/V3MgtOzp3uI/AAAAAAAABGs/mP-3nZQCjWwdk6qCa5WraSpK8A7rSPj3ACLcB/s1600/image04.png)

**Wide & Deep Learning for Recommender Systems**
Generalized linear models with nonlinear feature transformations are widely used for large-scale regression and classification problems with sparse inputs. Memorization of feature interactions through a wide set of cross-product feature
transformations are effective and interpretable, while generalization requires more feature engineering effort. With less
feature engineering, deep neural networks can generalize better to unseen feature combinations through low-dimensional
dense embeddings learned for the sparse features. However,
deep neural networks with embeddings can over-generalize
and recommend less relevant items when the user-item interactions are sparse and high-rank. In this paper, we present
Wide & Deep learning—jointly trained wide linear models
and deep neural networks—to combine the benefits of memorization and generalization for recommender systems. We
productionized and evaluated the system on Google Play,
a commercial mobile app store with over one billion active
users and over one million apps. Online experiment results
show that Wide & Deep significantly increased app acquisitions compared with wide-only and deep-only models. We
have also open-sourced our implementation in TensorFlow.

In [None]:
# Wide & Deep model in regression mode, trained with MSE loss and reporting MAE.
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile(optimizer="adam", loss="mse", metrics=['mae'])
# 20% of the training rows are held out for validation; the callbacks list defined for
# Model 3 is reused here.
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=14,
    verbose=True,
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
%reload_ext tensorboard
%tensorboard --logdir logs/fit

In [None]:
# Predict the target for the held-out test rows with the trained WDL model.
pred_ans = model.predict(test_model_input, batch_size=256)

In [None]:
# Error metrics of the WDL predictions on the held-out test rows.
mae = mean_absolute_error(test[target].values, pred_ans)
mse_exact = mean_squared_error(test[target].values, pred_ans)
# Take the root of the unrounded MSE; rounding first would carry the rounding error into RMSE.
rmse = mse_exact ** 0.5
# The MSE itself is still reported rounded to four decimals.
mse = round(mse_exact, 4)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both arguments are converted to NumPy arrays; the absolute error is divided
    element-wise by ``y_true`` before averaging and scaling by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return relative_error.mean() * 100

# Collect all four error metrics of the WDL model into a small table.
mape = mean_absolute_percentage_error(test[target].values, pred_ans)
deepctr = pd.DataFrame(
    {'metric': ['MAE', 'RMSE', 'MSE', 'MAPE'], 'result': [mae, rmse, mse, mape]}
)
deepctr

## **3 Turi Create**
![](https://miro.medium.com/max/1984/0*790rZhXwAo-PrYRm.jpg)

In [None]:
!pip install turicreate
import turicreate as tc

In [None]:
# `ratings` has had its user_id/movie_id columns replaced by category codes for the Keras
# models, while `ratings_val` still holds the raw MovieLens ids. Build the training SFrame
# from `movie_ratings` (merged from the ratings before that re-encoding) so the training
# and validation data share one id space.
training_data = tc.SFrame(data=movie_ratings[["user_id", "movie_id", "rating", "unix_timestamp"]])
training_data

In [None]:
# Wrap the validation ratings (read from u1.test) in a Turi Create SFrame.
validation_data = tc.SFrame(data=ratings_val)
validation_data

#### 3.1 **Matrix Factorization method**

In [None]:
# Training frame without the timestamp column, for the factorization model.
# NOTE(review): assumes remove_column returns a new SFrame and leaves `training_data`
# unchanged — confirm for the installed turicreate version.
training_data_mf = training_data.remove_column('unix_timestamp')

In [None]:
# Validation frame without the timestamp column, for the factorization model.
# NOTE(review): assumes remove_column returns a new SFrame and leaves `validation_data`
# unchanged — confirm for the installed turicreate version.
validation_data_mf = validation_data.remove_column('unix_timestamp')

In [None]:
# Fit a ranking factorization recommender on the timestamp-free training frame, with
# user_id / movie_id as the user and item columns and rating as the target.
mf_rank_model = tc.ranking_factorization_recommender.create(training_data_mf,'user_id','movie_id',target="rating")

In [None]:
# Evaluate the factorization model on the timestamp-free validation frame.
pred_mf = mf_rank_model.evaluate(validation_data_mf)

#### 3.2 **Item similarity method**

In [None]:
# Fit an item-similarity recommender with cosine similarity on `training_data`.
# NOTE(review): `training_data` (not `training_data_mf`) is passed here, so it may still
# contain the unix_timestamp column — confirm that this is intended.
similarity_model = tc.item_similarity_recommender.create(training_data, 'user_id', 'movie_id', target="rating", similarity_type='cosine')

In [None]:
# Evaluate the item-similarity model on the validation frame.
pred_sim = similarity_model.evaluate(validation_data)

#### 3.3 **Item Content method**

In [None]:
# content_model = tc.item_content_recommender.create(df, item_id='movie_id', user_id='user_id')

In [None]:
# pred_cont = content_model.evaluate(validation_data)

#### 3.4 **Evaluate models**

In [None]:
# Compare both Turi Create models on the timestamp-free validation frame using RMSE.
tc.recommender.util.compare_models(validation_data_mf, [similarity_model, mf_rank_model], model_names=["item2item", "mf rank"], metric='rmse')