In [2]:
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torchtext.vocab as vocab
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset



### TMDB dataset

In [5]:
# Directory holding the TMDB CSV exports. Kept in a single constant so the
# notebook can be pointed at another copy of the data by editing one line
# instead of every read_csv call.
DATA_DIR = Path("/home/ramie/Desktop/Content_popularity_prediction/data")

# Movie metadata table; joined with the ratings on `id` in the next cell.
TMDB_movie = pd.read_csv(DATA_DIR / "TMDB_movie_dataset_v11.csv")

# Per-user ratings. The columns are renamed positionally so that the movie key
# is called `id`, the same name used as the join key for the metadata table.
TMDB_rating = pd.read_csv(DATA_DIR / "TMDB.csv")
TMDB_rating.columns = ["userId", "id", "rating", "timestamp"]
TMDB_rating

Unnamed: 0,userId,id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [6]:
# Inner join on the shared movie key: only movies present in both the metadata
# table and the ratings table are kept.
merged_df = TMDB_movie.merge(TMDB_rating, how='inner', on='id')

# Narrow the joined frame to the rating fields plus the two movie attributes
# used by the following cells (popularity and genres).
kept_columns = ["userId", "id", "rating", "timestamp", "popularity","genres"]
dataset = merged_df[kept_columns]
dataset

Unnamed: 0,userId,id,rating,timestamp,popularity,genres
0,311,155,4.5,1062016035,130.643,"Drama, Action, Crime, Thriller"
1,467,155,3.0,939064184,130.643,"Drama, Action, Crime, Thriller"
2,472,155,3.0,974245114,130.643,"Drama, Action, Crime, Thriller"
3,516,155,4.0,844688604,130.643,"Drama, Action, Crime, Thriller"
4,547,155,2.0,1022680052,130.643,"Drama, Action, Crime, Thriller"
...,...,...,...,...,...,...
60679,52,51094,3.5,1231770833,0.600,
60680,380,51088,4.0,1199154037,0.600,
60681,468,50259,4.0,1296199327,0.601,
60682,48,52319,4.0,1319745882,0.603,


In [7]:
# Preprocessing: discard rows without a genres string, then renumber the rows.
# reset_index() parks the old row labels in an 'index' column, which is removed
# again at the end of this cell.
dataset = dataset.dropna(subset=['genres']).reset_index()

# Split every comma-separated genres string into its raw pieces.
dff = dataset.copy()
genre_l = pd.DataFrame(dff['genres'].map(lambda text: text.split(',')))


def _clean_genres(pieces):
    """Trim, lower-case and remove inner spaces from each genre piece."""
    return [piece.strip().lower().replace(' ', '') for piece in pieces]


# e.g. "Drama, Action" -> ['drama', 'action']
genre_l['genres'] = genre_l['genres'].map(_clean_genres)

# Swap the raw strings for the token lists and drop the helper 'index' column.
dataset = dataset.assign(genres=genre_l['genres']).drop(columns='index')

In [8]:
# Inspect the frame after preprocessing: `genres` now holds a list of
# normalised genre tokens per row instead of one comma-separated string.
dataset

Unnamed: 0,userId,id,rating,timestamp,popularity,genres
0,311,155,4.5,1062016035,130.643,"[drama, action, crime, thriller]"
1,467,155,3.0,939064184,130.643,"[drama, action, crime, thriller]"
2,472,155,3.0,974245114,130.643,"[drama, action, crime, thriller]"
3,516,155,4.0,844688604,130.643,"[drama, action, crime, thriller]"
4,547,155,2.0,1022680052,130.643,"[drama, action, crime, thriller]"
...,...,...,...,...,...,...
59385,624,54286,3.5,1198524298,1.576,"[action, adventure, tvmovie]"
59386,664,54286,3.5,1365264866,1.576,"[action, adventure, tvmovie]"
59387,117,50685,4.0,1320640798,1.400,[documentary]
59388,380,50685,3.5,1234671662,1.400,[documentary]


In [10]:
def load_word_embedding(file_path):
    """Load a whitespace-separated text embedding file into a dictionary.

    Every non-blank line is read as a token followed by its vector
    components, e.g. ``drama 0.1 -0.2 0.3``.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each token (str) to its vector (list of float). When a
        token appears on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component after the token is not a valid float.
    """
    embedding_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            # Blank or whitespace-only lines (typically a trailing newline at
            # the end of the file) contain no token; indexing parts[0] on them
            # would raise IndexError, so they are skipped.
            if not parts:
                continue
            word, *components = parts
            embedding_dict[word] = [float(val) for val in components]
    return embedding_dict

# Load the custom word embedding from the file
# NOTE(review): this is an absolute path inside one user's home directory; it
# has to be edited before the notebook can run on another machine.
custom_embedding_path = '/home/ramie/Desktop/Content_popularity_prediction/custom_embedding.txt'
# Result is the token -> list-of-floats dictionary built by load_word_embedding.
custom_embedding = load_word_embedding(custom_embedding_path)

In [11]:
# Map genre to vector
genre_to_vector = dict(custom_embedding)

# Length of one embedding vector, read from any entry (0 if the embedding is
# empty). Needed to build the zero vector used below.
embedding_dim = len(next(iter(genre_to_vector.values()), []))

# Create vectors for each ID movie based on genres: the element-wise sum of the
# embedding vectors of those genres of the movie that exist in the embedding.
id_vectors = {}
for id_val, genres in zip(dataset['id'], dataset['genres']):
    # Genres without a (non-empty) embedding entry are ignored.
    matched_vectors = [
        genre_to_vector[genre] for genre in genres if genre_to_vector.get(genre)
    ]

    if matched_vectors:
        # Sum component-wise, once per movie.
        id_vectors[id_val] = [sum(components) for components in zip(*matched_vectors)]
    else:
        # None of this movie's genres is in the embedding. Previously the sum
        # left over from the preceding movie was stored here (or a NameError
        # was raised when this happened on the very first row). The sum over
        # zero vectors is the zero vector, so store that instead.
        id_vectors[id_val] = [0.0] * embedding_dim

In [12]:
# Index the per-movie vectors by the string form of the movie id.
id_to_vector = {}
for movie_id in dataset['id']:
    id_to_vector[str(movie_id)] = id_vectors.get(movie_id)

# Attach each row's vector as a single list-valued column ...
dataset['semantic_vector'] = [
    id_to_vector.get(str(movie_id)) for movie_id in dataset['id']
]

# ... then fan that list out into one numeric column per vector component and
# put those columns next to the original ones in place of the list column.
vector_columns = dataset['semantic_vector'].apply(pd.Series)
dataset = pd.concat(
    [dataset.drop(columns='semantic_vector'), vector_columns],
    axis=1,
)

In [13]:
# Inspect the frame after the semantic vector has been expanded into one
# column per vector component.
dataset

Unnamed: 0,userId,id,rating,timestamp,popularity,genres,0,1,2,3,...,40,41,42,43,44,45,46,47,48,49
0,311,155,4.5,1062016035,130.643,"[drama, action, crime, thriller]",0.14179,-0.464721,-2.91328,-0.284499,...,-1.333670,2.117820,0.868977,-1.764093,-0.73620,0.436932,2.080406,-0.174683,1.04835,2.30700
1,467,155,3.0,939064184,130.643,"[drama, action, crime, thriller]",0.14179,-0.464721,-2.91328,-0.284499,...,-1.333670,2.117820,0.868977,-1.764093,-0.73620,0.436932,2.080406,-0.174683,1.04835,2.30700
2,472,155,3.0,974245114,130.643,"[drama, action, crime, thriller]",0.14179,-0.464721,-2.91328,-0.284499,...,-1.333670,2.117820,0.868977,-1.764093,-0.73620,0.436932,2.080406,-0.174683,1.04835,2.30700
3,516,155,4.0,844688604,130.643,"[drama, action, crime, thriller]",0.14179,-0.464721,-2.91328,-0.284499,...,-1.333670,2.117820,0.868977,-1.764093,-0.73620,0.436932,2.080406,-0.174683,1.04835,2.30700
4,547,155,2.0,1022680052,130.643,"[drama, action, crime, thriller]",0.14179,-0.464721,-2.91328,-0.284499,...,-1.333670,2.117820,0.868977,-1.764093,-0.73620,0.436932,2.080406,-0.174683,1.04835,2.30700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59385,624,54286,3.5,1198524298,1.576,"[action, adventure, tvmovie]",0.77385,0.129980,-1.34822,0.457441,...,-0.137444,0.100108,0.069247,-0.149290,-0.03886,0.145943,1.166909,-0.076480,0.25599,1.00080
59386,664,54286,3.5,1365264866,1.576,"[action, adventure, tvmovie]",0.77385,0.129980,-1.34822,0.457441,...,-0.137444,0.100108,0.069247,-0.149290,-0.03886,0.145943,1.166909,-0.076480,0.25599,1.00080
59387,117,50685,4.0,1320640798,1.400,[documentary],0.14329,0.826770,-0.62576,0.214360,...,0.311680,0.498470,-0.377190,-0.820940,-0.53088,-0.167510,0.259490,0.049949,-0.38922,0.48096
59388,380,50685,3.5,1234671662,1.400,[documentary],0.14329,0.826770,-0.62576,0.214360,...,0.311680,0.498470,-0.377190,-0.820940,-0.53088,-0.167510,0.259490,0.049949,-0.38922,0.48096


Data preparation

In [None]:
# torch, nn, optim, DataLoader and TensorDataset come from the import cell at
# the top of the notebook, so this cell no longer carries its own imports.

# Columns that are not model inputs; every column left after dropping them
# becomes a feature.
# NOTE(review): 'id' is not listed, so the raw movie id is fed to the model as
# a numeric feature next to the embedding columns - confirm this is intended.
columns_remove = ['userId','rating', 'timestamp', 'popularity','genres']
columns_target = ["popularity"]

# Feature matrix and regression target as NumPy arrays.
X = dataset.drop(columns_remove, axis=1).values
y = dataset['popularity'].values

# float32 tensors; the target is reshaped to a column so it matches the
# (batch, 1) shape of the model output used by the loss.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

# Use a dedicated name for the tensor dataset. Rebinding `dataset` here would
# replace the DataFrame, so re-running this cell (or any earlier cell) would
# fail because `dataset` would no longer be a DataFrame.
tensor_dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(tensor_dataset, batch_size=32, shuffle=True)

In [1]:
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are indexed as (batch, time step, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear projection of the LSTM output at the final time step."""
        sequence_output, _state = self.lstm(x)
        # Keep only the last time step of every sequence in the batch.
        final_step = sequence_output[:, -1]
        return self.fc(final_step)

NameError: name 'nn' is not defined

In [None]:
# One input feature per column of the prepared matrix X, 64 hidden units and a
# single regression output.
n_features = X.shape[1]
model = LSTMModel(n_features, 64, 1)

# Mean-squared-error objective, optimised with Adam at a learning rate of 0.001.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Training loop
epochs = 50
for epoch in range(epochs):
    # Put the model in training mode on every epoch, so that running the
    # evaluation cell (which calls model.eval()) before this one has no effect
    # on training.
    model.train()

    # Accumulate the loss over the whole epoch. Reporting only the last
    # mini-batch (as before) gives a noisy number that depends on which samples
    # the shuffle happened to put last.
    running_loss = 0.0
    samples_seen = 0

    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.unsqueeze(1))  # Add a time step dimension
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # criterion is a batch mean, so weight it by the batch size: the final
        # batch of an epoch may be smaller than the others.
        batch_size = inputs.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Guard against an empty loader instead of failing on an undefined `loss`.
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

In [None]:
# Switch the model to evaluation mode and run one forward pass over the full
# feature tensor without tracking gradients.
model.eval()
with torch.no_grad():
    # NOTE(review): X_tensor is the same tensor the training DataLoader was
    # built from, so despite the name these are predictions on the training
    # data, not on a held-out test set.
    test_outputs = model(X_tensor.unsqueeze(1))  # Add a time step dimension
    print(test_outputs)   