In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sample/search.npy
/kaggle/input/sample/sample 1.csv


## Data Loading and Preprocessing

In [2]:
# Load the sample email export from the Kaggle input directory (note the space in the file name).
emails = pd.read_csv('/kaggle/input/sample/sample 1.csv')
emails.head()  # preview the first rows

Unnamed: 0,Subject,From,To,Date,Body
0,Project Update,alice@orgA.com,bob@orgB.com,2024-01-02,"Hi Bob, I wanted to give you a quick update on..."
1,Re: Project Update,bob@orgB.com,alice@orgA.com,2024-01-03,"Hey Alice, Thanks for the update! I'm really l..."
2,Confidential: Design Specs,alice@orgA.com,bob@orgB.com,2024-01-04,"Bob, I've attached some preliminary design spe..."
3,HR Policy Changes,hr@orgA.com,all@orgA.com,2024-01-05,"Dear Team, We have updated our HR policies reg..."
4,Quick Question About Specs,bob@orgB.com,alice@orgA.com,2024-01-06,"Hi Alice, I had a question about the specs you..."


In [3]:
# Convert the frame into one dict per row, keyed by column name.
emails_formatted = emails.to_dict(orient='records') # list of dicts
emails_formatted

[{'Subject': 'Project Update',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-02',
  'Body': "Hi Bob, I wanted to give you a quick update on our current project. Things are progressing well, and we're nearing the final stages of development. I think you'll be impressed with what we have in store. Let's catch up soon. Best, Alice"},
 {'Subject': 'Re: Project Update',
  'From': 'bob@orgB.com',
  'To': 'alice@orgA.com',
  'Date': '2024-01-03',
  'Body': "Hey Alice, Thanks for the update! I'm really looking forward to seeing the final product. Let's schedule a meeting next week to discuss potential collaborations. Cheers, Bob"},
 {'Subject': 'Confidential: Design Specs',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-04',
  'Body': "Bob, I've attached some preliminary design specs for the upcoming project. Please keep this under wraps as we are still finalizing details internally. Your insights would be invaluable, but let's keep this between

### Utility Functions

In [4]:
def strip_email(email):
    """Trim leading/trailing whitespace from every string value of an email record.

    The record is modified in place and the very same dict is returned;
    values that are not strings are left untouched.
    """
    for field, value in email.items():
        if isinstance(value, str):
            # Only existing keys are reassigned, so the dict size never changes mid-iteration.
            email[field] = value.strip()
    return email
    
def index_emails(emails):
    """Clean every email record in ``emails`` with ``strip_email``.

    Each position of ``emails`` is reassigned to the value returned by
    ``strip_email`` for that position, and the same container is returned.
    """
    for position in range(len(emails)):
        emails[position] = strip_email(emails[position])

    return emails

In [5]:
# Strip surrounding whitespace from every string field (index_emails updates the records in place).
emails_formatted = index_emails(emails_formatted)


## Semantic Search using FAISS

In [6]:
!pip install faiss-cpu sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.9.0.post1 sentence-transformers-3.3.1


In [7]:
# One searchable text per email: the subject and the body separated by a single space.
documents = ['{} {}'.format(email['Subject'], email['Body']) for email in emails_formatted]

In [8]:
# Imports and embedding model for the semantic-search cells that follow:
# `faiss` builds the index and `model` encodes the documents and the query.
# These lines were commented out, which left `model` and `faiss` undefined
# when the notebook is run top to bottom on a fresh kernel.
import faiss
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Encode every "subject + body" document with the currently loaded model.
embeddings = model.encode(documents)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Create a FAISS index
dimension = embeddings.shape[1]  # Dimensionality of the embeddings
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity search

# Add embeddings to the index
# NOTE(review): the search cell uses FAISS row ids to index `emails_formatted`, which assumes
# model.encode returned the embeddings in the same order as `documents` — confirm.
index.add(embeddings)

In [12]:
# Define your search query
query = "Sharing secret information"
# Encode the query with the same model as the documents; cast to float32 before searching.
query_embedding = model.encode([query]).astype('float32')

# Search the index for the top k similar documents
k = 5  # Number of nearest neighbors to retrieve
D, I = index.search(query_embedding, k)  # D: distances, I: indices of nearest neighbors

# Retrieve results based on indices.
# When the index holds fewer than k vectors FAISS pads the id row with -1; without the
# filter a -1 would silently pick the *last* email through Python's negative indexing.
results = [emails_formatted[i] for i in I[0] if i >= 0]
results

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'Subject': 'Confidential Insights Needed',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-18',
  'Body': "Hey Bob, I'm looking for some confidential insights regarding competitor strategies in light of recent market shifts. Your perspective would be invaluable here—let's keep this off the record, okay? Best, alice"},
 {'Subject': 'Urgent: Need Your Thoughts',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-22',
  'Body': 'Bob, I need your thoughts on some sensitive topics regarding our upcoming launch strategy—specifically how we can position ourselves against competitors without drawing too much attention. Can we chat later today? alice'},
 {'Subject': 'Insider Info Needed ASAP',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-10',
  'Body': "Bob, I need your expertise on something urgent. Can we discuss the latest market trends? I believe they could impact our upcoming launch significantly. Let me know when you'r

In [88]:
# Sanity check: number of emails kept from the semantic search.
len(results)

5

## Intersection between Elastic Search and Semantic Search

In [13]:
# SECURITY NOTE: allow_pickle=True makes np.load unpickle Python objects, which can run
# arbitrary code embedded in the file. Only load search.npy from a trusted source.
elastic_array = np.load('/kaggle/input/sample/search.npy', allow_pickle=True)

# `results` is already a list of email dicts, so it is copied and tabulated directly;
# the previous np.array(...).tolist() round-trip produced the same list.
semantic_results = list(results)
semantic_df = pd.DataFrame(semantic_results)
semantic_df

Unnamed: 0,Subject,From,To,Date,Body
0,Confidential Insights Needed,alice@orgA.com,bob@orgB.com,2024-01-18,"Hey Bob, I'm looking for some confidential ins..."
1,Urgent: Need Your Thoughts,alice@orgA.com,bob@orgB.com,2024-01-22,"Bob, I need your thoughts on some sensitive to..."
2,Insider Info Needed ASAP,alice@orgA.com,bob@orgB.com,2024-01-10,"Bob, I need your expertise on something urgent..."
3,Final Review Needed Before Launch,alice@orgA.com,bob@orgB.com,2024-01-26,"Hey Bob, We're in the final review stages befo..."
4,Confidential: Design Specs,alice@orgA.com,bob@orgB.com,2024-01-04,"Bob, I've attached some preliminary design spe..."


In [14]:
# Each entry of elastic_array keeps its email record under the '_source' key;
# unwrap those records and tabulate them.
elastic_results = list(map(lambda hit: hit['_source'], elastic_array))
elastic_df = pd.DataFrame(data=elastic_results)
elastic_df

Unnamed: 0,Subject,From,To,Date,Body
0,Confidential Insights Needed,alice@orgA.com,bob@orgB.com,2024-01-18,"Hey Bob, I'm looking for some confidential ins..."
1,Confidential: Design Specs,alice@orgA.com,bob@orgB.com,2024-01-04,"Bob, I've attached some preliminary design spe..."
2,Project Feedback Request,carol@orgA.com,team@orgA.com,2024-01-20,"Hi Team, As we wrap up this phase of the proje..."
3,Re: Quick Question About Specs,alice@orgA.com,bob@orgB.com,2024-01-07,"Hey Bob, Absolutely! There are a few features ..."
4,Final Review Needed Before Launch,alice@orgA.com,bob@orgB.com,2024-01-26,"Hey Bob, We're in the final review stages befo..."
5,Market Analysis Request,alice@orgA.com,bob@orgB.com,2024-01-14,"Bob, Could you send me that market analysis re..."
6,Re: Confidential Insights Needed,bob@orgB.com,alice@orgA.com,2024-01-19,"Of course, Alice! I'll compile some thoughts a..."
7,Project Update,alice@orgA.com,bob@orgB.com,2024-01-02,"Hi Bob, I wanted to give you a quick update on..."
8,Re: Meeting Reminder: Project Sync-Up,dave@orgA.com,carol@orgA.com,2024-01-09,"Hi Carol, Looking forward to the meeting! I've..."
9,Re: Final Review Needed Before Launch,bob@orgB.com,alice@orgA.com,2024-01-27,"Of course, Alice! I'll review everything and p..."


In [15]:
# Inner join on every email field: only emails present in BOTH result sets are kept.
hybrid_df = semantic_df.merge(
    elastic_df,
    how='inner',
    on=['Subject', 'From', 'To', 'Date', 'Body'],
)


In [16]:
# Show the emails found by both searches as a list of dicts (one per row).
hybrid_df.to_dict(orient='records')

[{'Subject': 'Confidential Insights Needed',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-18',
  'Body': "Hey Bob, I'm looking for some confidential insights regarding competitor strategies in light of recent market shifts. Your perspective would be invaluable here—let's keep this off the record, okay? Best, alice"},
 {'Subject': 'Final Review Needed Before Launch',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-26',
  'Body': "Hey Bob, We're in the final review stages before launch and need your keen eye on a few details—especially regarding competitive positioning and market entry tactics. Can you help? Let's keep it discreet as always! alice"},
 {'Subject': 'Confidential: Design Specs',
  'From': 'alice@orgA.com',
  'To': 'bob@orgB.com',
  'Date': '2024-01-04',
  'Body': "Bob, I've attached some preliminary design specs for the upcoming project. Please keep this under wraps as we are still finalizing details internally. Your insights

## Stella

In [9]:
# NOTE: The default dimension is 1024. If you need another dimension, clone the model and modify `modules.json` to replace `2_Dense_1024` with another dimension, e.g. `2_Dense_256` or `2_Dense_8192`.
# NOTE(review): trust_remote_code=True runs Python files downloaded together with the model;
# the download log of this cell recommends pinning a revision — do so once one has been vetted.
# NOTE(review): .cuda() presumably needs a GPU-enabled session — confirm the accelerator setting.
import faiss
from sentence_transformers import SentenceTransformer
# Rebinds the global `model`, the same name the encode/search cells call `.encode` on.
model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True).cuda()
# NOTE(review): doc_embeddings is not read by any other cell visible here — verify before removing.
doc_embeddings = model.encode(documents)


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/174k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

modeling_qwen.py:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_1.5B_v5:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_1.5B_v5:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]