In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

2+2

In [1]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [4]:
import faiss
dir(faiss.StandardGpuResources())

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__swig_destroy__',
 '__weakref__',
 'getDefaultStream',
 'getMemoryInfo',
 'getResources',
 'getTempMemoryAvailable',
 'noTempMemory',
 'revertDefaultStream',
 'setDefaultNullStreamAllDevices',
 'setDefaultStream',
 'setLogMemoryAllocations',
 'setPinnedMemory',
 'setTempMemory',
 'syncDefaultStreamCurrentDevice',
 'this',
 'thisown']

In [7]:
import os;os.listdir('/kaggle/input/embeddings-2-feb-a/')

['customer_embeddings.npz', 'article_embeddings.npz']

In [10]:
import numpy as np
import faiss
import pandas as pd

# --- Step 1: Load Embedding Data (with allow_pickle=True) ---
article_data = np.load("/kaggle/input/embeddings-2-feb-a//article_embeddings.npz", allow_pickle=True)
article_ids = article_data['article_ids']              # e.g., shape (num_articles,)
article_embeddings = article_data['article_embeddings']  # e.g., shape (num_articles, embedding_dim)

customer_data = np.load("/kaggle/input/embeddings-2-feb-a//customer_embeddings.npz", allow_pickle=True)
customer_ids = customer_data['customer_ids']             # e.g., shape (num_customers,)
customer_embeddings = customer_data['customer_embeddings']  # e.g., shape (num_customers, embedding_dim)

# Ensure the embeddings are in float32 (FAISS requires float32)
article_embeddings = article_embeddings.astype(np.float32)
customer_embeddings = customer_embeddings.astype(np.float32)

# --- Step 2: Build FAISS GPU Index ---
embedding_dim = article_embeddings.shape[1]  # e.g., 32
k = 12  # Number of top articles to retrieve per customer

# Create FAISS GPU resources and a CPU index (using inner product for cosine similarity when vectors are normalized)
gpu_res = faiss.StandardGpuResources()
index_cpu = faiss.IndexFlatIP(embedding_dim)
index_cpu.add(article_embeddings)  # Add article embeddings to the index

# Transfer the index to the GPU (device 0)
gpu_index = faiss.index_cpu_to_gpu(gpu_res, 0, index_cpu)


In [11]:
gpu_index

<faiss.swigfaiss.GpuIndexFlat; proxy of <Swig Object of type 'faiss::gpu::GpuIndexFlat *' at 0x79d31d42eb20> >

In [12]:
import torch

if torch.cuda.is_available():
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Number of GPUs available: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


In [13]:
print(faiss.StandardGpuResources())

<faiss.swigfaiss.StandardGpuResources; proxy of <Swig Object of type 'faiss::gpu::StandardGpuResources *' at 0x79d334f33360> >


In [14]:
num_gpus = faiss.get_num_gpus()
print(f"Number of available GPUs: {num_gpus}")

Number of available GPUs: 2


In [15]:
distances, indices = gpu_index.search(customer_embeddings, k)

In [17]:
distances.shape

(1371980, 12)

In [18]:

recommended_article_ids = article_ids[indices]  # shape: (num_customers, k)

In [19]:
recommended_article_ids

array([[759871025, 759871002, 759871034, ..., 477507010, 754413002,
        754413001],
       [759871025, 759871002, 759871034, ..., 477507010, 754413002,
        754413001],
       [759871025, 759871002, 759871034, ..., 477507010, 754413002,
        754413001],
       ...,
       [477507001, 477507010, 523490003, ..., 305931001, 146730001,
        436261001],
       [477507001, 477507010, 523490003, ..., 436261001, 146730001,
        305931001],
       [477507009, 754413002, 754413001, ..., 727948005, 854826001,
        854826002]])

In [31]:

# Map the indices back to actual article IDs.
recommended_article_ids = article_ids[indices]  # shape: (num_customers, k)

# --- Step 4: Create a Submission DataFrame and Save to CSV ---
# Create a DataFrame where each row has a customer_id and a list of recommended article_ids.
submission_df = pd.DataFrame({
    'customer_id': customer_ids,
    'prediction': [' '.join('0' + str(x) for x in rec.tolist()) for rec in recommended_article_ids]
})

# Save the submission to a CSV file
submission_df.to_csv("submission-Feb-2-C.csv", index=False)
print("Recommendations saved to faiss_recommendations.csv")

Recommendations saved to faiss_recommendations.csv


In [32]:
submission_df.prediction[0]

'0759871025 0759871002 0759871034 0864562001 0868063004 0868063001 0733749001 0733749010 0477507001 0477507010 0754413002 0754413001'

In [None]:
0706016001
759871025