In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dr = 'data_small/ml-latest-small/'
userfile = 'small_users.csv'
moviefile = 'small_movies.csv'

titlefile = 'small_title_tokens.pkl'
tagfile = 'small_tag_tokens.pkl'

ratingsfile = 'ratings.csv'

concat = lambda file: os.path.join(dr, file)

with open(concat(userfile), 'rb') as f:
    users = pd.read_csv(f, index_col='userId')
with open(concat(moviefile), 'rb') as f:
    movies = pd.read_csv(f, index_col='movieId')
with open(concat(titlefile), 'rb') as f:
    title_tokens = pickle.load(f)
with open(concat(tagfile), 'rb') as f:
    tag_tokens = pickle.load(f)
with open(concat(ratingsfile), 'rb') as f:
    ratings = pd.read_csv(f)

In [3]:
users.head(3)

Unnamed: 0_level_0,Other,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,negative_5,negative_6,negative_7,negative_8,negative_9,negative_10,negative_11,negative_12,negative_13,negative_14
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.129125,0.121951,0.041607,0.060258,0.119082,0.064562,0.0,0.097561,0.067432,...,1266,1344,615,1455,1124,1645,694,1535,755,1436
2,0.0,0.15493,0.042254,0.0,0.0,0.098592,0.140845,0.042254,0.239437,0.0,...,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267
3,0.0,0.130841,0.102804,0.037383,0.046729,0.084112,0.018692,0.0,0.149533,0.037383,...,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267


In [4]:
movies.head(3)

Unnamed: 0_level_0,title,Other,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"[7335, 6338, 909, 909, 909, 909, 909, 909, 909]",0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1995,6.84186
2,"[2174, 909, 909, 909, 909, 909, 909, 909, 909]",0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,5.863636
3,"[3369, 6699, 300, 909, 909, 909, 909, 909, 909]",0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1995,5.519231


In [5]:
import ast
movies['title'] = movies['title'].apply(lambda x: ast.literal_eval(x))

In [6]:
# split ratings into train and test
from sklearn.model_selection import train_test_split
target = ratings.pop('rating')
train, test, y_train, y_test = train_test_split(ratings, target, test_size=0.2, random_state=42, stratify=target)

In [7]:
from torch.utils.data import Dataset, DataLoader

class MovieLensDataset(Dataset):
    def __init__(self, users, movies, ratings, target):
        self.users = users
        self.movies = movies
        self.ratings = ratings
        self.target = target
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        user = self.users.loc[self.ratings.iloc[idx]['userId']]
        movie = self.movies.loc[self.ratings.iloc[idx]['movieId']]
        rating = self.target.iloc[idx]
        return {
            'user': user,
            'movie': movie,
            'rating': rating
        }
    
train_dataset = MovieLensDataset(users, movies, train, y_train)
test_dataset = MovieLensDataset(users, movies, test, y_test)
train_dataset[0]

{'user': Other             0.000644
 Action            0.121983
 Adventure         0.071773
 Animation         0.026231
 Children          0.026714
                   ...     
 negative_10      83.000000
 negative_11      39.000000
 negative_12     228.000000
 negative_13    1193.000000
 negative_14     357.000000
 Name: 599, Length: 83, dtype: float64,
 'movie': title          [4443, 909, 909, 909, 909, 909, 909, 909, 909]
 Other                                                       0
 Action                                                      0
 Adventure                                                   0
 Animation                                                   0
 Children                                                    0
 Comedy                                                      1
 Crime                                                       0
 Documentary                                                 0
 Drama                                                       0
 Fant

Let's turn data into convenient format for training. We will use the following format:
```
{
    'user': {
        'token_features': list,
        'numeric_features': list
    },
    'movie': {
        'title': list,
        'numeric_features': list
    }
},
rating
```

After this we'll build simple neural network to predict rating.

**User embedder**

FFNN with 2 hidden layers for both numeric and embedded token features.

**Movie embedder**

FFNN with 2 hidden layers for both numeric and pooled title features.

In [10]:
import torch

# features with prefix positive, negative or neutral
user_token_positive = users.columns[users.columns.str.startswith('positive')]
user_token_negative = users.columns[users.columns.str.startswith('negative')]
user_token_neutral = users.columns[users.columns.str.startswith('neutral')]
user_numeric_features = users.columns[~users.columns.str.startswith(('positive', 'negative', 'neutral'))]

movie_numeric_features = movies.columns.to_list()
movie_numeric_features.remove('title')

def collate_fn(batch):
    user = {
        'numeric': torch.tensor([x['user'][user_numeric_features].values for x in batch], dtype=torch.float),
        'positive': torch.tensor([x['user'][user_token_positive].values for x in batch], dtype=torch.long),
        'negative': torch.tensor([x['user'][user_token_negative].values for x in batch], dtype=torch.long),
        'neutral': torch.tensor([x['user'][user_token_neutral].values for x in batch], dtype=torch.long)
    }
    movie = {
        'numeric': torch.tensor([x['movie'][movie_numeric_features].values for x in batch], dtype=torch.float),
        'title': torch.tensor([x['movie']['title'] for x in batch], dtype=torch.long)
    }
    rating = torch.tensor([x['rating']for x in batch], dtype=torch.long)
    
    return {'user': user, 'movie': movie}, rating


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
for x, y in train_loader:
    print(y.shape)
    print("x keys:", x.keys(), "x['user'] type:", type(x['user']), "x['user']['numeric']", type(x['user']['numeric']))
    break

torch.Size([32])
x keys: dict_keys(['user', 'movie']) x['user'] type: <class 'dict'> x['user']['numeric'] <class 'torch.Tensor'>


In [9]:
n_titles = len(title_tokens)
n_tags = len(tag_tokens)
print('len title tokens:', n_titles, 'len tag tokens:', n_tags)

len title tokens: 7469 len tag tokens: 1710


In [17]:
from torch import nn
import torch.nn.functional as F

def get_dense_layers(n_features, n_hidden, n_out):
    return nn.Sequential(
        nn.BatchNorm1d(n_features),
        nn.Linear(n_features, n_hidden),
        nn.ReLU(),
        nn.BatchNorm1d(n_hidden),
        nn.Linear(n_hidden, n_hidden),
        nn.ReLU(),
        nn.Linear(n_hidden, n_out)
    )



class MovieEmbedder(nn.Module):
    def __init__(self, n_tokens, n_features, hid_size, emb_size):
        """
        :@param n_tokens: number of tokens in the vocabulary
        :@param n_features: number of numeric features
        :@param hid_size: size of the hidden layer
        :@param emb_size: size of the embedding
        """
        super().__init__()
        # title encoder
        self.emb = nn.Embedding(n_tokens, hid_size)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.emb_lin = get_dense_layers(hid_size, hid_size, emb_size)

        # feature encoder
        self.num_lin = get_dense_layers(n_features, hid_size, emb_size)

    def forward(self, x):
        title = x['title']
        emb = self.emb(title).transpose(1, 2)
        emb = self.pool(emb).squeeze()
        emb = self.emb_lin(emb)

        features = x['numeric']
        features = self.num_lin(features)

        return torch.cat([emb, features], dim=1)
    

title_len = len(train_dataset[0]['movie']['title'])
n_movie = {
    'n_tokens': len(title_tokens),
    'n_features': len(movie_numeric_features),
    'hid_size': 32,
    'emb_size': 16
}

movie_emb = MovieEmbedder(**n_movie)
movie_out = movie_emb(x['movie'])


IndexError: index out of range in self

In [16]:

    
def get_sentiment_embedder(n_tokens, n_input, hidden_size, n_output):
    print(f"Input [B, {n_input}] -> Embedding [B, {n_input}, {hidden_size}]" \
          "-> Conv1d [B, {n_input}, {n_input}] -> ReLU [B, {n_input}, {n_input}] -> AdaptiveAvgPool1d [B, {n_input}, 1]" \
          "-> Flatten [B, {n_input}] -> Dense [B, {n_output}]")
    return nn.ParameterDict(
        {
            'emb': nn.Embedding(n_tokens, hidden_size),
            'lin': nn.Sequential(
                nn.BatchNorm2d(n_input),
                nn.Conv1d(n_input, n_input, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.AdaptiveAvgPool1d(1),
                nn.Flatten(),
                get_dense_layers(n_input, hidden_size, n_output)
            )
        }
    )

class UserEmbedder(nn.Module):
    def __init__(self, n_tokens, n_sentiment, n_numeric, n_numeric_out, n_numeric_hidden, n_sentiment_hidden, n_sentiment_emb):
        """
        :@param n_tokens: number of tokens in the vocabulary
        :@param n_sentiment: dict with number of tokens for each sentiment (positive, negative, neutral)
        :@param n_numeric: number of numeric features
        :@param n_numeric_out: number of output features
        :@param n_numeric_hidden: number of hidden features
        :@param n_sentiment_hidden: number of hidden features for sentiment
        :@param n_sentiment_emb: number of embedding features for sentiment
        """
        assert isinstance(n_sentiment, dict), "sentiment_tokens must be a dict"
        assert ['positive', 'negative', 'neutral'] == list(n_sentiment.keys()),\
                     "sentiment_tokens must have keys 'positive', 'negative' and 'neutral'"
        super().__init__()
        
        # tag encoder
        self.emb = nn.ParameterDict({
            'positive': get_sentiment_embedder(n_tokens, n_sentiment['positive'], n_sentiment_hidden, n_sentiment_emb),
            'neutral': get_sentiment_embedder(n_tokens, n_sentiment['neutral'], n_sentiment_hidden, n_sentiment_emb),
            'negative': get_sentiment_embedder(n_tokens, n_sentiment['negative'], n_sentiment_hidden, n_sentiment_emb)
        })
        
        self.numeric = get_dense_layers(n_numeric, n_numeric_hidden, n_numeric_out)

    def forward(self, x):
        positive, negative, neutral = x['positive'], x['negative'], x['neutral']
        print("before emb:", positive.shape)
        emb_positive = self.emb['positive']['emb'](positive)
        print("after emb:", emb_positive.shape)
        emb_positive = self.emb['positive']['lin'](emb_positive)
        print("after lin:", emb_positive.shape)

        emb_negative = self.emb['negative']['emb'](negative)
        emb_negative = self.emb['negative']['lin'](emb_negative)

        emb_neutral = self.emb['neutral']['emb'](neutral)
        emb_neutral = self.emb['neutral']['lin'](emb_neutral)

        numeric = x['numeric']
        numeric = self.numeric(numeric)

        return torch.cat([emb_positive, emb_negative, emb_neutral, numeric], dim=1)
            





    
# 'n_sentiment', 'n_numeric', 'n_numeric_out', 'n_numeric_hidden', 'n_sentiment_hidden', 'n_sentiment_emb'
n_user = {
    'n_tokens': len(tag_tokens),
    'n_sentiment': {'positive': len(user_token_positive), 'negative': len(user_token_negative), 'neutral': len(user_token_neutral)},
    'n_numeric': len(user_numeric_features),
    'n_numeric_out': 32,
    'n_numeric_hidden': 32,
    'n_sentiment_hidden': 32,
    'n_sentiment_emb': 16
}



user_emb = UserEmbedder(**n_user)
user_out = user_emb(x['user'])


        


Input [B, 15] -> Embedding [B, 15, 32]-> Conv1d [B, {n_input}, {n_input}] -> ReLU [B, {n_input}, {n_input}] -> AdaptiveAvgPool1d [B, {n_input}, 1]-> Flatten [B, {n_input}] -> Dense [B, {n_output}]
Input [B, 15] -> Embedding [B, 15, 32]-> Conv1d [B, {n_input}, {n_input}] -> ReLU [B, {n_input}, {n_input}] -> AdaptiveAvgPool1d [B, {n_input}, 1]-> Flatten [B, {n_input}] -> Dense [B, {n_output}]
Input [B, 15] -> Embedding [B, 15, 32]-> Conv1d [B, {n_input}, {n_input}] -> ReLU [B, {n_input}, {n_input}] -> AdaptiveAvgPool1d [B, {n_input}, 1]-> Flatten [B, {n_input}] -> Dense [B, {n_output}]
before emb: torch.Size([32, 15])
after emb: torch.Size([32, 15, 32])


ValueError: expected 4D input (got 3D input)

In [None]:
class Recommender(nn.Module):
    def __init__(self, n_user, n_movie, n_hidden, n_classes):
        super().__init__()
        assert isinstance(n_user, dict), "user_tokens must be a dict"
        assert list(n_user.keys()) == ['n_tokens', 'n_sentiment', 'n_numeric', 'n_numeric_out', 'n_numeric_hidden', 'n_sentiment_hidden', 'n_sentiment_emb'],\
            "user_tokens must have keys 'n_tokens', 'n_sentiment', 'n_numeric', 'n_numeric_out', 'n_numeric_hidden', 'n_sentiment_hidden', 'n_sentiment_emb'"

        assert isinstance(n_movie, dict), "movie_tokens must be a dict"
        assert list(n_movie.keys()) == ['n_tokens', 'n_features', 'hid_size', 'emb_size'],\
            "movie_tokens must have keys 'n_tokens', 'n_features', 'hid_size', 'emb_size'"

        self.user_emb = UserEmbedder(**n_user)
        self.movie_emb = MovieEmbedder(**n_movie)

        n_out = n_user['n_numeric_out'] + n_user['n_sentiment_emb'] * 3 + n_movie['emb_size'] * 2
        self.lin = get_dense_layers(n_out, n_hidden, n_classes)

    def forward(self, x):
        user = self.user_emb(x['user'])
        movie = self.movie_emb(x['movie'])
        x = torch.cat([user, movie], dim=1)
        x = self.lin(x)
        return x

model = Recommender(n_user, n_movie, 16, 10)
out = model(x)

In [12]:
# split ratings into train and test
from sklearn.model_selection import train_test_split
target = ratings.pop('rating')
train, test, y_train, y_test = train_test_split(ratings, target, test_size=0.2, random_state=42, stratify=target)

In [13]:
from torch.utils.data import Dataset, DataLoader

class MovieLensDataset(Dataset):
    def __init__(self, users, movies, ratings, target):
        self.users = users
        self.movies = movies
        self.ratings = ratings
        self.target = target
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        user = self.users.loc[self.ratings.iloc[idx]['userId']]
        movie = self.movies.loc[self.ratings.iloc[idx]['movieId']]
        rating = self.target.iloc[idx]
        return {
            'user': user,
            'movie': movie,
            'rating': rating
        }
    
train_dataset = MovieLensDataset(users, movies, train, y_train)
test_dataset = MovieLensDataset(users, movies, test, y_test)
train_dataset[0].keys()

dict_keys(['user', 'movie', 'rating'])

In [15]:
import torch

def collate_fn(batch):
    # users = torch.stack([x[0] for x in batch])
    # movies = torch.stack([x[0] for x in batch])
    # ratings = torch.stack([x[2] for x in batch])
    users = [x['user'] for x in batch]
    movies = [x['movie'] for x in batch]
    ratings = torch.tensor([x['rating'] for x in batch], dtype=torch.long)
    return [users, movies], ratings

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
for x, y in train_loader:
    print(y.shape)
    print(type(x))
    break

torch.Size([32])
<class 'list'>




In [20]:
users.head(2)

Unnamed: 0_level_0,Other,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,negative_5,negative_6,negative_7,negative_8,negative_9,negative_10,negative_11,negative_12,negative_13,negative_14
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.129125,0.121951,0.041607,0.060258,0.119082,0.064562,0.0,0.097561,0.067432,...,9008,7418,7418,7418,1394,4857,7418,7418,7663,1744
2,0.0,0.15493,0.042254,0.0,0.0,0.098592,0.140845,0.042254,0.239437,0.0,...,7418,7418,7418,7418,7418,7418,7418,7418,7418,7418


In [None]:
from torch import nn
import torch.nn.functional as F

class MovieEmbedder(nn.Module):
    def __init__(self, n_tokens, hid_size=50, emb_size=20, n_features=10, feature_size=10):
        super().__init__()
        # title encoder
        self.emb = nn.Embedding(n_tokens, hid_size)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(hid_size, emb_size)

        # feature encoder
        self.fc1 = nn.Linear(n_features, hid_size)
        self.fc2 = nn.Linear(hid_size, feature_size)

    def forward(self, x):
        title = x['title']
        emb = self.emb(title).transpose(1, 2)
        emb = self.pool(emb).squeeze()
        emb = self.fc(F.relu(emb))

        features = x['features']
        features = self.fc1(features)
        features = self.fc2(F.relu(features))

        return torch.cat([emb, features], dim=1)
    

class UserEmbedder(nn.Module):
    def __init__(self, n_tokens, hid_size=50, emb_size=10, token_features=[
        ('positive', 10),
        ('negative', 10),
        ('neutral', 10)
    ], n_features=10, feature_size=10):
        super().__init__()
        self.token_features = token_features
        # title encoder
        self.emb = nn.ParameterDict({
            'positive': nn.Embedding(n_tokens, hid_size),
            'neutral': nn.Embedding(n_tokens, hid_size),
            'negative': nn.Embedding(n_tokens, hid_size)
        })
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.encode = nn.ModuleDict({
            'positive': nn.Sequential(
                nn.Linear(hid_size, emb_size)
            ),
            'neutral': nn.Sequential(
                nn.Linear(hid_size, emb_size)
            ),
            'negative': nn.Sequential(
                nn.Linear(hid_size, emb_size)
            )
        })

        self.feature_encoder = nn.Sequential(
            nn.Linear(n_features, hid_size),
            nn.ReLU(inplace=True),
            nn.Linear(hid_size, feature_size)
        )

    def forward(self, x):
        embs = []
        for group, n in self.token_features:
            emb = self.emb[group](x[group]).transpose(1, 2)
            emb = self.pool(emb).squeeze()
            emb = self.encode[group](emb)
            embs.append(emb)
        
        features = self.feature_encoder(x['features'])
        final = torch.cat(embs + [features], dim=1)
        return final
            




class Recommender(nn.Module):
    def __init__(self, n_tokens, user_titles=['positive', 'negative', 'neutral'], emb_size=25, n_classes=10):
        super().__init__()
        # User encoder
        self.user_emb = UserEmbedder(n_tokens)

        # Movie encoder
        self.movie_emb = MovieEmbedder(n_tokens)

        # Concatenation and relu with dropout and linear
        self.fc1 = nn.Linear(emb_size * 2, 100)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(100, n_classes)

    def forward(self, x):
        user = self.user_emb(x['user'])
        movie = self.movie_emb(x['movie'])
        x = torch.cat([user, movie], dim=1)
        x = self.fc1(F.relu(x))
        x = self.dropout(x)
        x = self.fc2(F.relu(x))
        return x
        
