In [1]:
import torch
from torch import nn
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

### TMDB dataset

In [2]:
TMDB_movie = pd.read_csv("/home/ramie/Desktop/Content_popularity_prediction/data/TMDB_movie_dataset_v11.csv")
TMDB_rating = pd.read_csv("/home/ramie/Desktop/Content_popularity_prediction/data/TMDB.csv")
TMDB_rating.columns = ["userId", "id", "rating", "timestamp"]
TMDB_rating


Unnamed: 0,userId,id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [3]:
merged_df = pd.merge(TMDB_movie, TMDB_rating, on='id', how='inner')

dataset = merged_df[["userId", "id", "rating", "timestamp", "popularity","genres"]]
dataset.head(5)

Unnamed: 0,userId,id,rating,timestamp,popularity,genres
0,311,155,4.5,1062016035,130.643,"Drama, Action, Crime, Thriller"
1,467,155,3.0,939064184,130.643,"Drama, Action, Crime, Thriller"
2,472,155,3.0,974245114,130.643,"Drama, Action, Crime, Thriller"
3,516,155,4.0,844688604,130.643,"Drama, Action, Crime, Thriller"
4,547,155,2.0,1022680052,130.643,"Drama, Action, Crime, Thriller"


Multilabel Encoder

In [4]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

dataset = dataset.dropna(subset = ['genres'])
dataset = dataset.reset_index()

dff = dataset.copy()
genre_l = dff['genres'].apply(lambda x: x.split(','))
genre_l = pd.DataFrame(genre_l)

genre_l['genres'] = genre_l['genres'].apply(lambda x :[ y.strip().lower().replace(' ','') for y in x] )

MLB = MultiLabelBinarizer()

genre_encoded = MLB.fit_transform(genre_l['genres'])

genre_encoded_df = pd.DataFrame(genre_encoded, columns=MLB.classes_)
genre_encoded_df=genre_encoded_df.reset_index()
#mod_df = dff.drop(['genres'],axis=1)
mod_df = dff.drop('index',axis=1)
mod_df=mod_df.reset_index()


dataset = pd.concat([mod_df,genre_encoded_df],axis=1).drop('index',axis=1)

dataset = dataset.sort_values(by=['userId','timestamp'], ascending=True).reset_index()
dataset = dataset.drop('index', axis=1)

dataset.head(5)

Unnamed: 0,userId,id,rating,timestamp,popularity,genres,action,adventure,animation,comedy,...,history,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,war,western
0,1,2294,2.0,1260759108,15.781,Comedy,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,2455,2.5,1260759113,9.413,"Comedy, Crime, Mystery",0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,1,1371,2.5,1260759135,38.467,Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,2105,4.0,1260759139,43.03,"Comedy, Romance",0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,1,1263,2.0,1260759151,1.477,Comedy,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Data preparing

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

columns_remove = ['userId','rating', 'timestamp', 'popularity','genres']
columns_target = ["popularity"]
#user_150 = dataset[dataset['userId']==1]

X = dataset.drop(columns_remove, axis=1).values
y = dataset['popularity'].values

In [6]:
X_tensor = torch.tensor(X, dtype=torch.float32).view(-1, 5, 4)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

### LSTM

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Get the output from the last time step
        return out

In [None]:
model = LSTMModel(input_size=X.shape[1], hidden_size=64, output_size=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
epochs = 50
for epoch in range(epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.unsqueeze(1))  # Add a time step dimension
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')



In [None]:
model.eval()
with torch.no_grad():
    test_outputs = model(X_tensor.unsqueeze(1))  # Add a time step dimension
    print(test_outputs)   

### CNN

In [7]:
import torch
import torch.nn as nn

X_tensor = torch.tensor(X, dtype=torch.float32).view(-1, 5, 4)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=5, out_channels=16, kernel_size=3)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(16, 64)  
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x=self.relu(x)
        return x

# Instantiate the model
model = CNNModel()
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
model.state_dict()['conv1.weight'].flatten().tolist()

[-0.03678722679615021,
 -0.21043390035629272,
 -0.24307739734649658,
 -0.1445058286190033,
 0.11992589384317398,
 -0.14495788514614105,
 0.00877449382096529,
 0.08450651168823242,
 -0.13473887741565704,
 0.21349367499351501,
 -0.18500860035419464,
 0.1632033884525299,
 0.14454956352710724,
 -0.2090136557817459,
 -0.15094681084156036,
 0.12587572634220123,
 0.0038669160567224026,
 -0.00508859334513545,
 0.21823227405548096,
 -0.03020908124744892,
 0.06211255490779877,
 0.03352119401097298,
 0.1110948845744133,
 0.14601963758468628,
 -0.0029424475505948067,
 0.21727637946605682,
 -0.027779854834079742,
 -0.21609237790107727,
 -0.13598354160785675,
 -0.23977692425251007,
 -0.14513526856899261,
 -0.03876359388232231,
 -0.044036462903022766,
 -0.2022310346364975,
 -0.22766049206256866,
 0.11316891759634018,
 0.19006413221359253,
 0.0292626041918993,
 0.2219301462173462,
 0.0922548770904541,
 0.13578538596630096,
 0.13671092689037323,
 -0.13222914934158325,
 -0.003796830540522933,
 0.1264055

In [8]:
epochs = 50
for epoch in range(epochs):
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 11.943387031555176
Epoch 2, Loss: 14.499452590942383
Epoch 3, Loss: 14.973130226135254
Epoch 4, Loss: 12.265000343322754
Epoch 5, Loss: 10.385112762451172
Epoch 6, Loss: 17.040306091308594
Epoch 7, Loss: 15.615387916564941
Epoch 8, Loss: 12.970855712890625
Epoch 9, Loss: 15.05125904083252
Epoch 10, Loss: 17.068418502807617
Epoch 11, Loss: 14.275951385498047
Epoch 12, Loss: 13.603226661682129
Epoch 13, Loss: 14.661144256591797
Epoch 14, Loss: 16.3986759185791
Epoch 15, Loss: 15.893856048583984
Epoch 16, Loss: 12.566644668579102
Epoch 17, Loss: 15.27790355682373
Epoch 18, Loss: 13.969049453735352
Epoch 19, Loss: 13.707160949707031
Epoch 20, Loss: 14.016097068786621
Epoch 21, Loss: 12.657710075378418
Epoch 22, Loss: 15.435323715209961
Epoch 23, Loss: 16.511064529418945
Epoch 24, Loss: 15.668145179748535
Epoch 25, Loss: 11.789725303649902
Epoch 26, Loss: 12.905048370361328
Epoch 27, Loss: 15.481194496154785
Epoch 28, Loss: 15.294709205627441
Epoch 29, Loss: 17.89690208435058

In [9]:
model.eval()
with torch.no_grad():
    test_outputs = model(X_tensor)  # Add a time step dimension
    print(test_outputs)   

tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])
