In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [51]:
# Load data
dataPath = 'data/x_train_alpha/'
X_train = pd.read_pickle(dataPath +
                         'x_train_alpha(0.02).pkl').values.astype(float)

In [52]:
# Standardize data
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train)

In [53]:
# Define autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoded_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 128), nn.ReLU(),
                                     nn.Linear(128, 64), nn.ReLU(),
                                     nn.Linear(64, encoded_dim))
        self.decoder = nn.Sequential(nn.Linear(encoded_dim, 64), nn.ReLU(),
                                     nn.Linear(64, 128), nn.ReLU(),
                                     nn.Linear(128, input_dim), nn.Sigmoid())

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [54]:
# Train autoencoder
input_dim = X_train.shape[1]
encoded_dim = 10
autoencoder = Autoencoder(input_dim, encoded_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    inputs = torch.from_numpy(X_train).float()
    encoded, decoded = autoencoder(inputs)
    loss = criterion(decoded, inputs)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs,
                                                   loss.item()))



Epoch [10/100], Loss: 1.2203
Epoch [20/100], Loss: 1.1205
Epoch [30/100], Loss: 1.0045
Epoch [40/100], Loss: 0.9986
Epoch [50/100], Loss: 0.9890
Epoch [60/100], Loss: 0.9620
Epoch [70/100], Loss: 0.9386
Epoch [80/100], Loss: 0.9300
Epoch [90/100], Loss: 0.9272
Epoch [100/100], Loss: 0.9259


In [55]:
torch.save(autoencoder.encoder, 'models/autoencoder.pt')

In [56]:
# Load the saved encoder model
# encoder = torch.load('models/autoencoder.pt')
# with torch.no_grad():
#     encoded_X_train = encoder(torch.from_numpy(X_train)).numpy()

In [57]:
# Extract encoded features
encoded_features, _ = autoencoder(torch.from_numpy(X_train).float())
encoded_features = encoded_features.detach().numpy()

In [58]:
# Save encoded features to file
encoded_features_df = pd.DataFrame(encoded_features)
encoded_features_df.to_pickle('data/encoded_features/encoded_features.pkl')

In [59]:
# Cluster users using encoded features
kmeans = KMeans(n_clusters=5, random_state=0, n_init=10).fit(encoded_features)
cluster_labels = kmeans.labels_

In [60]:
# Print cluster sizes
for i in range(5):
    print(f"Cluster {i}: {np.sum(cluster_labels == i)} users")

Cluster 0: 292 users
Cluster 1: 187 users
Cluster 2: 13 users
Cluster 3: 351 users
Cluster 4: 100 users


In [61]:
# Evaluate the clustering performance using a clustering metric
from sklearn.metrics import silhouette_score

silhouette = silhouette_score(encoded_features, cluster_labels)
print("Silhouette Score: {:.2f}".format(silhouette))

Silhouette Score: 0.42


In [62]:
# load the encoded features
X_encoded = pd.read_pickle("data/encoded_features/encoded_features.pkl")

In [63]:
# perform clustering using KMeans
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(X_encoded)

In [64]:
# assume we want to get top-10 recommendations for user with UID 123
user_id = 1
recommendation_count = 10

In [65]:
# find the cluster assignment for the user
user_index = X_encoded.index.get_loc(user_id)
user_cluster = kmeans.labels_[user_index]

# get the indices of all users in the same cluster
cluster_indices = np.where(kmeans.labels_ == user_cluster)[0]

In [66]:
# load the ratings data
cluster_ratings = pd.read_csv("datasets/ml-100k/ua.base", delimiter='\t', names=['UID', 'MID', 'rate', 'timestamp'], usecols=[0, 1, 2])


In [67]:
# get the ratings of all movies by users in the same cluster
cluster_ratings = cluster_ratings.loc[cluster_ratings['UID'].isin(X_encoded.iloc[cluster_indices].index)][['MID', 'rate']]

# compute the average rating for each movie
avg_ratings = cluster_ratings.groupby(['MID']).mean()

In [68]:
# sort the movies by their average rating and get the top-10 recommendations
top_n_recommendations = avg_ratings.sort_values(by='rate', ascending=False).head(recommendation_count)


In [69]:
top_n_recommendations

Unnamed: 0_level_0,rate
MID,Unnamed: 1_level_1
851,5.0
1656,5.0
1293,5.0
1295,5.0
1612,5.0
626,5.0
1599,5.0
1592,5.0
814,5.0
1275,5.0


In [70]:
# load the u.item file
item_df = pd.read_csv('datasets/ml-100k/u.item', sep='|', encoding='latin-1', header=None, usecols=[0,1], names=['MID', 'name'])


In [71]:
# Recommendations top-n
for mid in top_n_recommendations.index.values:
    movie_name = item_df.loc[item_df['MID'] == mid]['name'].values[0]
    print(movie_name)

Two or Three Things I Know About Her (1966)
Little City (1998)
Star Kid (1997)
Kicked in the Head (1997)
Leading Man, The (1996)
So Dear to My Heart (1949)
Someone Else's America (1995)
Magic Hour, The (1998)
Great Day in Harlem, A (1994)
Killer (Bulletproof Heart) (1994)


In [72]:
# Movies user rated about 4-5
data = pd.read_csv(
    "datasets/ml-100k/ua.base",
    sep="\t",
    header=None,
    names=["UID", "MID", "rate", "timestamp"],
)
data["UID"] = data["UID"].astype(int)
data["rate"] = data["rate"].astype(int)
data = data[data["rate"] > 3]
data = data[data["UID"] == user_id]
data = data.sort_values(by="rate", ascending=False)
data = data.drop_duplicates(subset=["MID"], keep="first")
data = data.drop(["UID", "rate", "timestamp"], axis=1)

for mid in data["MID"]:
    movie_name = item_df.loc[item_df["MID"] == mid]["name"].values[0]
    print(movie_name)

Toy Story (1995)
Aliens (1986)
Raiders of the Lost Ark (1981)
Princess Bride, The (1987)
Empire Strikes Back, The (1980)
Cinema Paradiso (1988)
Wrong Trousers, The (1993)
Monty Python and the Holy Grail (1974)
Manon of the Spring (Manon des sources) (1986)
Jean de Florette (1986)
Monty Python's Life of Brian (1979)
Sleeper (1973)
Swingers (1996)
Big Night (1996)
Bound (1996)
Godfather, The (1972)
Lone Star (1996)
Maya Lin: A Strong Clear Vision (1994)
Haunted World of Edward D. Wood Jr., The (1995)
Wallace & Gromit: The Best of Aardman Animation (1996)
Horseman on the Roof, The (Hussard sur le toit, Le) (1995)
Truth About Cats & Dogs, The (1996)
Mystery Science Theater 3000: The Movie (1996)
Kids in the Hall: Brain Candy (1996)
Fargo (1996)
Brazil (1985)
Good, The Bad and The Ugly, The (1966)
Welcome to the Dollhouse (1995)
12 Angry Men (1957)
Full Monty, The (1997)
Chasing Amy (1997)
Contact (1997)
Pillow Book, The (1995)
Chasing Amy (1997)
Kolya (1996)
Mars Attacks! (1996)
Star Trek: