## 3_cos_rank.ipynb

This notebook performs cosine similarity calculations and ranking for document chunks. The workflow includes:
1. Loading precomputed embeddings for queries and document chunks.
2. Calculating cosine similarity in manageable chunks to optimize memory usage.
3. Ranking documents for each query based on similarity scores.
4. Saving the ranked results for submission.

### Output
- Intermediate `.npy` files for similarity scores in `chunk_m3_cos/` directory.
- A submission file (`m3_128_submission.csv`) containing ranked document IDs for each query.


In [1]:
import pickle
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

The document is divided into several parts

In [2]:
# Load query vector
with open('../pkl/m3_chunk_512/m3_query_embedding.pkl', 'rb') as f:
    query_embeddings_dict = pickle.load(f)
query_ids = list(query_embeddings_dict.keys())
query_embeddings = np.array([query_embeddings_dict[qid] for qid in query_ids])

In [3]:
# doc embeddings range is the number of documents
doc_file_paths = [f'../pkl/m3_chunk_512/m3_chunk_512_embedding_{i}.pkl' for i in range(1,26)]

In [7]:
chunk_size = 100  # Process 100 lines at a time to reduce memory usage
# Process each document file individually
for i, file_path in enumerate(tqdm(doc_file_paths, desc="Processing documents")):
    print('doc_file:',file_path)
    with open(file_path, 'rb') as f:
        doc_embeddings_dict = pickle.load(f)

    doc_ids = list(doc_embeddings_dict.keys())
    doc_embeddings = np.array([doc_embeddings_dict[doc_id] for doc_id in doc_ids])
    del doc_embeddings_dict
    # Calculate similarity in chunks and save
    for start in tqdm(range(0, query_embeddings.shape[0], chunk_size), desc=f"Calculating chunks for document {i+1}", leave=False):
        
        end = start + chunk_size
        print("start:",start,"end:",end)
        query_chunk = query_embeddings[start:end]
        # Calculate the similarity of the current block
        chunk_similarity_scores = cosine_similarity(query_chunk, doc_embeddings)
        print()
        # Append save to file
        with open(f'./chunk_doc_512/chunk_m3_cos/m3_similarity_scores_part{i}_chunk{start}.npy', 'ab') as f:
            np.save(f, chunk_similarity_scores)

        del chunk_similarity_scores

    del doc_embeddings

Processing documents:   0%|          | 0/25 [00:00<?, ?it/s]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_1.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:   4%|▍         | 1/25 [01:34<37:42, 94.28s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_2.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:   8%|▊         | 2/25 [03:14<37:22, 97.50s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_3.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  12%|█▏        | 3/25 [04:50<35:38, 97.20s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_4.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  16%|█▌        | 4/25 [06:27<33:55, 96.92s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_5.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  20%|██        | 5/25 [08:20<34:15, 102.75s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_6.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  24%|██▍       | 6/25 [09:58<31:58, 100.99s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_7.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  28%|██▊       | 7/25 [11:39<30:20, 101.11s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_8.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  32%|███▏      | 8/25 [16:49<47:30, 167.65s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_9.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  36%|███▌      | 9/25 [18:20<38:19, 143.70s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_10.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  40%|████      | 10/25 [19:53<31:59, 127.99s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_11.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  44%|████▍     | 11/25 [21:27<27:27, 117.71s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_12.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  48%|████▊     | 12/25 [23:00<23:51, 110.13s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_13.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  52%|█████▏    | 13/25 [24:33<20:59, 104.96s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_14.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  56%|█████▌    | 14/25 [26:09<18:43, 102.09s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_15.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  60%|██████    | 15/25 [27:42<16:35, 99.59s/it] 

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_16.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  64%|██████▍   | 16/25 [29:15<14:36, 97.36s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_17.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  68%|██████▊   | 17/25 [30:49<12:51, 96.38s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_18.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  72%|███████▏  | 18/25 [33:19<13:08, 112.65s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_19.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  76%|███████▌  | 19/25 [34:53<10:41, 106.96s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_20.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  80%|████████  | 20/25 [36:29<08:39, 103.81s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_21.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  84%|████████▍ | 21/25 [38:23<07:06, 106.71s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_22.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  88%|████████▊ | 22/25 [40:10<05:20, 106.72s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_23.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  92%|█████████▏| 23/25 [41:46<03:27, 103.63s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_24.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents:  96%|█████████▌| 24/25 [43:22<01:41, 101.36s/it]

doc_file: ../pkl/m3_chunk_512/m3_chunk_512_embedding_25.pkl




start: 0 end: 100





start: 100 end: 200





start: 200 end: 300





start: 300 end: 400





start: 400 end: 500





start: 500 end: 600





start: 600 end: 700





start: 700 end: 800





start: 800 end: 900





start: 900 end: 1000





start: 1000 end: 1100





start: 1100 end: 1200





start: 1200 end: 1300





start: 1300 end: 1400





start: 1400 end: 1500





start: 1500 end: 1600





start: 1600 end: 1700





start: 1700 end: 1800





start: 1800 end: 1900





start: 1900 end: 2000



Processing documents: 100%|██████████| 25/25 [45:00<00:00, 108.02s/it]


In [8]:
print('finish')

finish


In [9]:
# Merge the generated 2000/size * n files into n files according to the query
# Merge function
def merge_parts_for_query_chunk(chunk_size):
    for start in range(0, 2000, chunk_size):
        # Use a loop to load all part files
        parts = [np.load(f'./chunk_doc_512/chunk_m3_cos/m3_similarity_scores_part{part_idx}_chunk{start}.npy') for part_idx in range(20)]
        
        # Splice in column direction (axis=1)
        merged_chunk = np.concatenate(parts, axis=1)
        print(merged_chunk.shape)
        # Save the merged result
        np.save(f'./chunk_doc_512/chunk_m3_cos/merged_similarity_scores_chunk{start}.npy', merged_chunk)
        # Delete spliced temporary data
        del merged_chunk

# Implementation of the merger
merge_parts_for_query_chunk(chunk_size=chunk_size)

(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)
(100, 10931284)


Rank

In [10]:
doc_list = [f'../pkl/m3_chunk_512/m3_chunk_512_embedding_{i}.pkl' for i in range(1,26)]
doc_ids = []
for doc in doc_list:
    with open(doc, 'rb') as f:
        doc_embeddings_dict = pickle.load(f)
    # Extract the query ID and corresponding vector
    doc_ids += list(doc_embeddings_dict.keys())
del doc_embeddings_dict

In [11]:
%%time
# Get the 10 most relevant documents for each query
top_k = 10
chunk_size = 100 # how many query's per small document
submission = []
for i in range(2000 // chunk_size):
    # Here's the similarity file open
    with open(f'./chunk_doc_512/chunk_m3_cos/merged_similarity_scores_chunk{i*chunk_size}.npy','rb') as f:
        similarity_scores = np.load(f)
        # Returns an indexed array of input arrays sorted in ascending order.
        retrieved_docs = np.argsort(-similarity_scores, axis=1)[:, :20]  # Sort the similarity from largest to smallest and take the index of the top 20 documents
        # Get the corresponding docid
    top_k_doc_ids = [[doc_ids[idx] for idx in retrieved_docs[i]] for i in range(len(retrieved_docs))]
    # Process two-dimensional arrays, preserving the portion of each string before _chunk
    top_k_doc_ids = [[doc_id.split('_chunk')[0] for doc_id in sublist] for sublist in top_k_doc_ids]
    print(top_k_doc_ids[0])
    id = 0 + chunk_size * i

    # For each query
    for i in range(chunk_size):
        top_docs = top_k_doc_ids[i] # Get the first 20 of that enquiry
        unique_docs = []
        seen_docs = set()  # Used to track added document IDs

        for doc in top_docs:
            # If the document ID has not been added before, add to the list of unique documents
            if doc not in seen_docs:
                unique_docs.append(doc)
                seen_docs.add(doc)
            # If 10 unique documents have been collected, it can be stopped
            if len(unique_docs) == top_k:
                break
        # If there are fewer than 10 unique documents, they can be filled with empty strings
        while len(unique_docs) < top_k:
            unique_docs.append('')

        # Ensure that there are 10 document IDs in the predicted_docs column, separated by spaces
        submission.append({
            'id': id,
            'docids': str(' '.join(unique_docs).split())
        })
        id = id + 1

['doc-en-0', 'doc-en-794977', 'doc-en-310297', 'doc-en-0', 'doc-en-14117', 'doc-en-794977', 'doc-en-0', 'doc-en-0', 'doc-en-0', 'doc-en-829005', 'doc-en-665739', 'doc-fr-8346', 'doc-en-831056', 'doc-en-792602', 'doc-en-790703', 'doc-en-0', 'doc-en-827465', 'doc-en-753726', 'doc-en-824371', 'doc-en-14117']
['doc-en-281459', 'doc-en-432773', 'doc-en-2068', 'doc-en-594627', 'doc-en-15306', 'doc-en-806311', 'doc-en-284632', 'doc-en-13362', 'doc-en-466795', 'doc-en-99621', 'doc-en-359514', 'doc-en-661699', 'doc-es-15198', 'doc-en-605634', 'doc-en-820772', 'doc-en-466795', 'doc-en-791654', 'doc-en-752985', 'doc-en-597315', 'doc-en-641481']
['doc-en-754418', 'doc-en-748368', 'doc-fr-11366', 'doc-fr-9402', 'doc-fr-8089', 'doc-en-296401', 'doc-en-2191', 'doc-fr-14192', 'doc-fr-13329', 'doc-en-741733', 'doc-en-807393', 'doc-fr-12976', 'doc-en-296401', 'doc-fr-10840', 'doc-fr-9067', 'doc-en-395359', 'doc-fr-9148', 'doc-en-796493', 'doc-it-3318', 'doc-en-58020']
['doc-en-7438', 'doc-en-7438', 'doc

In [12]:
print(len(submission))

2000


In [14]:
import pandas as pd
# Saved as submission.csv
submission_df = pd.DataFrame(submission)
submission_df.to_csv('m3_128_submission.csv', index=False)