In [None]:
import numpy as np
import scipy.sparse as sp
import torch

# Load the sparse user-tag matrix from the .npz file
file_path = '/content/uTm.npz'
R_cpu = sp.load_npz(file_path).astype(np.float32)  # Convert to float32

# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert to PyTorch sparse tensor
R_coo = R_cpu.tocoo()
# Merge repeated (row, col) coordinates so the list of observed entries used
# for training agrees with the dense matrix built below.
R_coo.sum_duplicates()
values = R_coo.data
indices = np.vstack((R_coo.row, R_coo.col))

i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = R_coo.shape

R_gpu_sparse = torch.sparse_coo_tensor(i, v, torch.Size(shape)).to(device)
R_gpu = R_gpu_sparse.to_dense()  # Dense copy, required by the SVD below

# Observed entries, moved to the device once instead of being rebuilt from
# numpy slices for every batch of every iteration.
obs_rows = i[0].to(device)
obs_cols = i[1].to(device)
obs_vals = v.to(device)
num_observed = obs_vals.shape[0]

# Shape of the matrix
num_users, num_tags = R_gpu.shape

# Hyperparameters
latent_features = 500  # Reduced number of latent features
alpha = 0.01  # Learning rate
lambda_reg = 0.1  # Regularization parameter
iterations = 50

# Initialize U and V using SVD.
# torch.svd returns (U, S, V) with R = U @ diag(S) @ V.T: the third value is
# V itself, NOT its transpose, so it must not be transposed again. Keeping V
# as (num_tags, rank) is what the row lookups V[cols] and U @ V.t() expect.
# (torch.svd is deprecated in favour of torch.linalg.svd, which returns V^H.)
U_svd, S_svd, V_svd = torch.svd(R_gpu)
rank = min(latent_features, S_svd.shape[0])
# Split the singular values evenly between the two factors so that
# U @ V.T starts as the rank-`rank` truncated SVD of R.
sqrt_S = torch.sqrt(S_svd[:rank])
S = torch.diag(S_svd[:rank]).clone().detach()
U = (U_svd[:, :rank] * sqrt_S).clone().detach().requires_grad_(True)
V = (V_svd[:, :rank] * sqrt_S).clone().detach().requires_grad_(True)

# Define optimizer (weight_decay supplies the L2 regularization)
optimizer = torch.optim.Adam([U, V], lr=alpha, weight_decay=lambda_reg)

# Reduce batch size for memory efficiency
batch_size = 1000

# Training loop: gradients are accumulated over all batches of observed
# entries, then a single optimizer step is taken per iteration.
for it in range(iterations):
    optimizer.zero_grad()

    # Compute prediction and error in batches
    for start in range(0, num_observed, batch_size):
        end = min(start + batch_size, num_observed)
        rows = obs_rows[start:end]
        cols = obs_cols[start:end]

        # One prediction per observed entry: the dot product of the user's
        # row of U with the tag's row of V. (A matrix product here would give
        # a batch x batch grid of unrelated user/tag pairs.)
        prediction = (U[rows] * V[cols]).sum(dim=1)
        batch_error = obs_vals[start:end] - prediction

        # Accumulate gradients for this batch
        batch_error.pow(2).sum().backward()

    optimizer.step()

    if it % 10 == 0:
        with torch.no_grad():
            # Relative squared error measured on the observed entries only
            prediction = torch.mm(U, V.t())
            error = R_gpu - prediction
            error_sq = torch.pow(error[obs_rows, obs_cols], 2)
            relative_error = error_sq.sum() / torch.pow(obs_vals, 2).sum()
            print(f"Iteration {it}: relative error = {relative_error.item() * 100:.2f}%")

# Save the resulting matrices (detached so plain CPU tensors are stored)
torch.save(U.detach().cpu(), 'U_matrix_final1.pt')
torch.save(V.detach().cpu(), 'V_matrix_final1.pt')

print("\nMatrix factorization with regularization completed.")
print("U matrix shape:", U.shape)
print("V matrix shape:", V.t().shape)


  batch_indices = torch.LongTensor([R_coo.row[start:end], R_coo.col[start:end]]).to('cuda')


Iteration 0: relative error = 100.00%
Iteration 10: relative error = 99.67%


In [None]:
# Tag vocabulary. Every literal must be followed by a comma: Python silently
# concatenates adjacent string literals, which would fuse two tags into one
# bogus entry.
tags_list = [
    "boolean", "boolean-operations", "boolean-logic", "boolean-expression", "boolean-algebra", "boolean-polynomials",
    "proof", "induction", "proof-of-correctness",
    "probability", "probability-theory","abstract-data-type","arrays", "static-array", "array-comparison", "array-reverse", "array-address", "array-pointer","dynamic-arrays","dynamic-allocation", "array-merge",
    "sparse-array", "arrayofarrays", "multidimensional-array", "sparse-matrix", "matrix-multiplication","array-multisort", "variable-length-array",
    "linked-list", "list", "singly-linked-list", "doubly-linked-list","string", "string-operations", "bitstring", "substring", "string-concatenation", "string-substitution",
    "string-comparison", "string-matching","algorithm","array-algorithms","quicksort", "mergesort", "radix-sort", "linear-search", "bucket-sort", "ternary-search",
    "counting-sort", "lexicographic-ordering", "combinatorics", "binary-search", "bubble-sort", "selection-sort",
    "sorting", "insertion-sort", "stable-sort","data-structures","stack", "queue","queueing",
    "enqueue","heap","heapalloc","heap-memory","heap-table","heapsort","binary-heap","fibonacci-heap", "priority-queue", "circular-queue", "fifo", "stack-memory", "binomial-heap", "lifo",
    "stack-allocation", "min-heap", "max-heap","minmax-heap", "heap-size","heaps-algorithm","tree", "treenode", "treepath", "tree-search", "tree-balancing", "tree-rotation", "tree-traversal",
    "breadth-first-search", "depth-first-search", "best-first-search", "treesort", "binary-tree", "ternary-tree",
    "multiway-tree", "binary-search-tree", "balanced-binary-search-tree", "ternary-search-tree", "avl-tree", "b-tree",
    "b-plus-tree", "red-black-tree", "red-black-tree-insertion", "splay-tree", "hashtree", "n-ary-tree", "huffman-tree",
    "huffman-code","string-algorithm", "longest-substring", "lcs", "greedy", "optimization", "mathematical-optimization",
    "linear-programming","knuth-morris-pratt", "rabin-karp","graph", "graph-theory", "strongly-connected-graph", "adjacency-list", "adjacency-matrix", "directed-graph",
    "undirected-graph", "digraphs", "subgraph", "minimum-spanning-tree", "minimum-spanning-forest",
    "directed-acyclic-graphs", "graph-traversal", "a-star", "Dijkstra", "kruskals-algorithm", "prims-algorithm",
    "hungarian-algorithm", "simplex", "simplex-algorithm", "strassen", "clique", "clique-problem",
    "graph-algorithm", "tarjans-algorithm", "shortest-path", "planar-graph", "floyd-warshall",
    "traveling-salesman", "path-finding", "bipartite", "hamiltonian-cycle", "topological-sort", "graph-coloring",
    "spanning-tree", "bellman-ford", "kosaraju-algorithm", "euler-path", "graph-query", "dependency-graph",
    "vertex-cover", "hamiltonian-path", "cyclic-graph","hash", "hashmap", "hash-function", "hashset", "double-hashing", "hashtable", "hashcode", "hash-collision",
    "consistent-hashing","dynamic-programming", "knapsack-problem", "code-complexity", "big-o", "tail-recursion", "partitioning",
    "time-complexity","space-complexity","memory-management", "network-flow", "branch-and-bound", "max-flow", "master-theorem", "recursive-backtracking",
    "fibonacci", "towers-of-hanoi", "recursion", "divide-and-conquer", "asymptotic-complexity",
    "greatest-common-divisor", "partition-problem", "ford-fulkerson", "edmonds-karp", "set", "set-cover",
    # "set-cover" already appears on the line above; the repeated literal that
    # used to end this line had no trailing comma and was fused with
    # "disjoint-union". Dropping it keeps every other tag at its old index.
    "amortized-analysis", "set-intersection", "set-union", "set-theory", "set-difference", "set-operations",
    "disjoint-union", "disjoint-sets", "union-find","complexity-theory", "recursive-datastructures", "p-np", "np", "np-hard", "np-complete", "sat",
    "satisfiability", "2-satisfiability", "stable-marriage", "heuristics", "artificial-intelligence", "automata",
    "automata-theory", "finite-automata", "non-deterministic", "subset-sum", "minimax", "turing-machines",
    "turing-complete", "halting-problem", "backtracking", "formal-languages", "stability", "computability",
    "probing", "linear-probing", "quadratic-probing"]

# Number of distinct tags; equals len(tags_list) when there are no duplicates
len(set(tags_list))


In [None]:
import numpy as np
import torch

def load_matrices(U_path, V_path):
    """
    Load two saved factor tensors and return them as numpy arrays.

    Parameters:
    U_path (str): Path to the saved U tensor (a file written by torch.save)
    V_path (str): Path to the saved V tensor (a file written by torch.save)

    Returns:
    tuple: (U, V) as numpy arrays
    """
    # map_location='cpu' keeps the load working on CPU-only hosts even when
    # the file was written from a CUDA tensor; .numpy() needs a CPU tensor.
    # detach() drops autograd tracking, which .numpy() also rejects.
    U = torch.load(U_path, map_location='cpu').detach().numpy()
    V = torch.load(V_path, map_location='cpu').detach().numpy()
    return U, V

def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Uses the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)); tanh saturates at
    +/-1, so very large negative inputs no longer overflow inside exp().

    Parameters:
    x (float or numpy array): Input score(s)

    Returns:
    float or numpy array: Values in [0, 1], same shape as x
    """
    return 0.5 * (1.0 + np.tanh(0.5 * x))

def predict_tags_binary(U, V, known_tags_indices, num_tags, k=5):
    """
    Predict the rest of the tags given a few known tags in a binary matrix.

    For every known tag, the k users with the highest reconstructed score for
    that tag are selected. The reconstructed tag scores of the selected users
    are averaged (with the known tags pinned to 1) and passed through a
    sigmoid.

    Parameters:
    U (numpy array): User latent feature matrix (shape: num_users x num_latent_features)
    V (numpy array): Tag latent feature matrix (shape: num_tags x num_latent_features)
    known_tags_indices (list): Indices of the known tags (rows of V)
    num_tags (int): Total number of tags (unused; kept for backward compatibility)
    k (int): Number of top-scoring users to take per known tag

    Returns:
    numpy array: One value per tag (row of V); known tags come out as sigmoid(1)

    Raises:
    ValueError: If known_tags_indices is empty or k < 1 (no users could be
        selected, so the average would be undefined)
    """
    known = np.asarray(list(known_tags_indices), dtype=np.intp)
    if known.size == 0:
        raise ValueError("known_tags_indices must contain at least one tag index")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Scores of every user for the known tags only: (num_users, num_known).
    known_scores = np.dot(U, V[known].T)

    # Rank users on the reconstructed scores BEFORE the known columns are
    # pinned to 1; ranking a column of identical ones would pick arbitrary
    # users. A stable sort keeps the selection deterministic under ties.
    top_users = np.argsort(-known_scores, axis=0, kind="stable")[:k]
    selected_users = np.unique(top_users)  # Remove duplicates

    # Reconstruct all tag scores, but only for the selected users.
    predicted_probs = np.dot(U[selected_users], V.T)

    # Fix known tags to 1 for the selected users
    predicted_probs[:, known] = 1

    # Calculate average scores across the selected users
    avg_predicted_tags = np.mean(predicted_probs, axis=0)

    # Apply sigmoid to the average to get the final probabilities
    final_probabilities = sigmoid(avg_predicted_tags)

    return final_probabilities

# Example usage
U_path = '/content/U_matrix_final1.pt'
V_path = '/content/V_matrix_final1.pt'

# Load the matrices
U, V = load_matrices(U_path, V_path)

# zip() below silently stops at the shorter sequence, so a vocabulary that
# does not match the tag factor matrix would mislabel or drop scores.
if V.shape[0] != len(tags_list):
    print(f"Warning: V has {V.shape[0]} rows but tags_list has {len(tags_list)} tags")

known_tags = ['oop', 'sorting', 'math', 'big-o', 'arrays']  # Example known tags

# Map each tag to the index of its first occurrence (what list.index returns),
# and report tags missing from tags_list instead of failing with a ValueError.
tag_to_index = {}
for position, name in enumerate(tags_list):
    tag_to_index.setdefault(name, position)
missing_tags = [tag for tag in known_tags if tag not in tag_to_index]
if missing_tags:
    print(f"Skipping known tags not present in tags_list: {missing_tags}")
known_tags_indices = [tag_to_index[tag] for tag in known_tags if tag in tag_to_index]

# Predict the tags for all users
predicted_tags = predict_tags_binary(U, V, known_tags_indices, len(tags_list), k=10)

# Sort the predicted tags by probability in descending order
sorted_tags = sorted(zip(tags_list, predicted_tags), key=lambda x: x[1], reverse=True)

# Display the sorted predicted probabilities
for tag, prob in sorted_tags:
    print(f"{tag}: {prob:.4f}")


arrays: 0.7311
sorting: 0.7311
oop: 0.7311
math: 0.7311
big-o: 0.7311
shapely: 0.7246
linked-list: 0.7242
c-strings: 0.7239
singly-linked-list: 0.7229
python: 0.7228
linear-programming: 0.7227
list: 0.7222
insertion-sort: 0.7218
recursion: 0.7218
selection-sort: 0.7217
data-structures: 0.7215
stack: 0.7215
probability: 0.7211
binary-tree: 0.7211
heap-memory: 0.7210
substring: 0.7209
c: 0.7207
partitioning: 0.7206
space-complexity: 0.7206
tree-balancing: 0.7205
counting-sort: 0.7205
java: 0.7204
game-theory: 0.7204
abstract-data-type: 0.7203
nested-lists: 0.7203
graph-algorithm: 0.7201
kruskals-algorithm: 0.7200
linear-probing: 0.7199
string: 0.7198
adjacency-matrix: 0.7198
dynamic-allocation: 0.7198
hashmap: 0.7198
proof-of-correctness: 0.7197
heap: 0.7197
binary-heap: 0.7196
network-flow: 0.7196
hash-function: 0.7196
time-complexity: 0.7195
knapsack-problem: 0.7194
disjoint-sets: 0.7194
linkedhashmap: 0.7193
hungarian-algorithm: 0.7193
graph-coloring: 0.7193
arrayofarrays: 0.7193
stri