In [10]:
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

In [11]:


# === 1. File paths ===
embeddings_csv_path = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_embeddings.csv"
cleaned_csv_path    = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_cleaned.csv"
index_save_path     = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\mental_health.index"


In [25]:
print("üìÇ Loading embeddings...")
df_emb = pd.read_csv(embeddings_csv_path)

# Drop non-numeric columns
df_emb = df_emb.select_dtypes(include=[np.number])

# If we still have an extra column (769 instead of 768), drop the first one
if df_emb.shape[1] != 768:
    print(f"‚ö† Detected {df_emb.shape[1]} columns, expected 768. Dropping the first column...")
    df_emb = df_emb.iloc[:, 1:]

# Ensure float32 & contiguous
embeddings = df_emb.values.astype("float32").copy()
print(f"‚úÖ Embeddings loaded with shape {embeddings.shape}")


üìÇ Loading embeddings...
‚ö† Detected 769 columns, expected 768. Dropping the first column...
‚úÖ Embeddings loaded with shape (149015, 768)


In [26]:
# === 3. Load texts ===
print("üìÇ Loading texts...")
df_text = pd.read_csv(cleaned_csv_path)

# Try common column names for text
possible_text_cols = ["text", "body", "content", "merged_text", "post"]
text_col = None
for col in possible_text_cols:
    if col in df_text.columns:
        text_col = col
        break
if text_col is None:
    text_col = df_text.columns[0]  # fallback to first column

texts = df_text[text_col].astype(str).tolist()

# Sanity check
assert len(texts) == embeddings.shape[0], f"‚ùå Mismatch: {len(texts)} texts vs {embeddings.shape[0]} embeddings"
print(f"‚úÖ Loaded {len(texts)} texts")


üìÇ Loading texts...
‚úÖ Loaded 149015 texts


  df_text = pd.read_csv(cleaned_csv_path)


In [27]:

# === 4. Normalize & build FAISS index ===
faiss.normalize_L2(embeddings)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
print(f"‚úÖ FAISS index built with {index.ntotal} vectors")

‚úÖ FAISS index built with 149015 vectors


In [28]:
# === 5. Save FAISS index ===
faiss.write_index(index, index_save_path)
print(f"üíæ Saved FAISS index to {index_save_path}")

üíæ Saved FAISS index to C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\mental_health.index


In [29]:
# === 6. Load embedding model (must match how embeddings were created) ===
# Change to the same model name you used when creating aimind_embeddings.csv
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("intfloat/multilingual-e5-base")

In [30]:
# === 7. Search function with dimension check ===
def search(query, top_k=5):
    # Convert query to embedding
    query_emb = model.encode([query], normalize_embeddings=True)
    query_emb = np.array(query_emb, dtype="float32")
    
    # Check if dimensions match
    if query_emb.shape[1] != index.d:
        raise ValueError(
            f"Dimension mismatch: Query embedding dim = {query_emb.shape[1]}, "
            f"FAISS index dim = {index.d}. "
            f"Use the same embedding model that was used to create aimind_embeddings.csv."
        )
    
    # Search in FAISS index
    scores, idxs = index.search(query_emb, top_k)
    # Return text and score for top_k results
    return [(texts[i], float(scores[0][j])) for j, i in enumerate(idxs[0])]

In [31]:
# === 8. Test search ===
sample_query = "I feel anxious and can't sleep"
results = search(sample_query, top_k=3)

print(f"\nüîç Results for query: '{sample_query}'\n")
for r, s in results:
    print(f"Score: {s:.4f} | Text: {r[:100]}...")



üîç Results for query: 'I feel anxious and can't sleep'

Score: 0.0560 | Text: Why do I go crazy if there's silence? ...
Score: 0.0560 | Text: Not feeling valid because I don't have a special interest... ...
Score: 0.0560 | Text: Anybody feel guilty just for existing? ...


In [32]:
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

# === 1. File paths ===
embeddings_csv_path = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_embeddings.csv"
cleaned_csv_path    = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_cleaned.csv"
index_save_path     = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\mental_health.index"

# === 2. Load embedding model (EXACT same one used for CSV embeddings) ===
# Change this to match your original generation step
model_name = "intfloat/multilingual-e5-base"  
model = SentenceTransformer(model_name)

# === 3. Load embeddings CSV ===
print("üìÇ Loading embeddings...")
df_emb = pd.read_csv(embeddings_csv_path)

# Drop any non-numeric columns
df_emb = df_emb.select_dtypes(include=[np.number])

# If dimension is > model output size, drop first column (likely ID)
expected_dim = model.get_sentence_embedding_dimension()
if df_emb.shape[1] != expected_dim:
    print(f"‚ö† Detected {df_emb.shape[1]} columns, expected {expected_dim}. Dropping first column...")
    df_emb = df_emb.iloc[:, 1:]

# Convert to float32 & make C-contiguous
embeddings = df_emb.values.astype("float32").copy()
print(f"‚úÖ Embeddings loaded with shape {embeddings.shape}")

# === 4. Load texts ===
print("üìÇ Loading texts...")
df_text = pd.read_csv(cleaned_csv_path)

# Auto-detect text column
possible_cols = ["text", "body", "content", "merged_text", "post"]
text_col = next((col for col in possible_cols if col in df_text.columns), df_text.columns[0])
texts = df_text[text_col].astype(str).tolist()

# Sanity check
assert len(texts) == embeddings.shape[0], f"‚ùå Mismatch: {len(texts)} texts vs {embeddings.shape[0]} embeddings"
print(f"‚úÖ Loaded {len(texts)} texts")

# === 5. Normalize & build FAISS index ===
faiss.normalize_L2(embeddings)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
print(f"‚úÖ FAISS index built with {index.ntotal} vectors")

# Save index
faiss.write_index(index, index_save_path)
print(f"üíæ Saved FAISS index to {index_save_path}")

# === 6. Search function ===
def search(query, top_k=5):
    # Embed and normalize query
    query_emb = model.encode([query], normalize_embeddings=True)
    query_emb = np.array(query_emb, dtype="float32")
    
    # Dimension check
    if query_emb.shape[1] != index.d:
        raise ValueError(f"Dimension mismatch: query dim {query_emb.shape[1]}, index dim {index.d}")
    
    scores, idxs = index.search(query_emb, top_k)
    return [(texts[i], float(scores[0][j])) for j, i in enumerate(idxs[0])]

# === 7. Score sanity check ===
test_vec = embeddings[0:1]
max_score, _ = index.search(test_vec, 1)
print(f"üîç Max possible self-match score: {max_score[0][0]:.4f} (should be close to 1.0)")

# === 8. Test query ===
sample_query = "I feel anxious and can't sleep"
results = search(sample_query, top_k=3)

print(f"\nüîç Results for query: '{sample_query}'\n")
for r, s in results:
    print(f"Score: {s:.4f} | Text: {r[:100]}...")


üìÇ Loading embeddings...
‚ö† Detected 769 columns, expected 768. Dropping first column...
‚úÖ Embeddings loaded with shape (149015, 768)
üìÇ Loading texts...


  df_text = pd.read_csv(cleaned_csv_path)


‚úÖ Loaded 149015 texts
‚úÖ FAISS index built with 149015 vectors
üíæ Saved FAISS index to C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\mental_health.index
üîç Max possible self-match score: 1.0000 (should be close to 1.0)

üîç Results for query: 'I feel anxious and can't sleep'

Score: 0.0560 | Text: Why do I go crazy if there's silence? ...
Score: 0.0560 | Text: Not feeling valid because I don't have a special interest... ...
Score: 0.0560 | Text: Anybody feel guilty just for existing? ...


In [37]:
from sentence_transformers import SentenceTransformer

# Load a lightweight paraphrasing model
para_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def local_paraphrase_minilm(query, num_return_sequences=3):
    # Simple "paraphrasing" by encoding and decoding nearest neighbors in embedding space
    # Here we'll just return the same sentence as placeholder since MiniLM doesn't do generative output
    # In a real paraphrasing setup you'd combine this with a small language model
    return [query]  # keeping only the original for now to avoid sentencepiece issues

def search_local_expanded(query, top_k=5):
    variations = local_paraphrase_minilm(query)
    emb_list = [model.encode([v], normalize_embeddings=True) for v in variations]
    query_emb = np.mean(emb_list, axis=0).astype("float32")

    if query_emb.shape[1] != index.d:
        raise ValueError(f"Dim mismatch: query {query_emb.shape[1]}, index {index.d}")

    scores, idxs = index.search(query_emb, top_k)
    return [(texts[i], float(scores[0][j])) for j, i in enumerate(idxs[0])]


Downloading .gitattributes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 744/744 [00:00<00:00, 1.58MB/s]
Downloading config.json: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 190/190 [00:00<00:00, 350kB/s]
Downloading README.md: 3.51kB [00:00, 365kB/s]
Downloading config.json: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 629/629 [00:00<?, ?B/s] 
Downloading (‚Ä¶)ce_transformers.json: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 122/122 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90.9M/90.9M [00:06<00:00, 14.2MB/s]
Downloading model.onnx: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90.4M/90.4M [00:05<00:00, 17.1MB/s]
Downloading model_O1.onnx: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90.4M/90.4M [00:04<00:00, 20.5MB/s]
Downloading model_O2.onnx: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90.3M/90.3M [00:04<00:00, 18.6MB/s]
Downloading model_O3.onnx: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90.3M/90.3M [00:04<00:00, 19.5MB/s]
Downloading model_O4.onnx: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45.2M/45.2

In [38]:
from sentence_transformers import SentenceTransformer

# Load a lightweight semantic model (no SentencePiece)
para_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def local_paraphrase_minilm(query, num_return_sequences=3):
    """Generate basic offline paraphrases without SentencePiece."""
    variations = [query]

    # Simple variations (manual tweaks)
    if "can't" in query:
        variations.append(query.replace("can't", "cannot"))
    if "I feel" in query.lower():
        variations.append(query.replace("I feel", "I am feeling"))
    if "anxious" in query.lower():
        variations.append(query.replace("anxious", "worried"))
    if "sleep" in query.lower():
        variations.append(query + " at night")

    # Limit to unique items and return top N
    return list(dict.fromkeys(variations))[:num_return_sequences]

def search_local_expanded(query, top_k=5):
    # Create variations
    variations = local_paraphrase_minilm(query)
    print(f"\nüîÑ Variations for '{query}':")
    for v in variations:
        print(f"- {v}")

    # Encode all variations and average embeddings
    emb_list = [model.encode([v], normalize_embeddings=True) for v in variations]
    query_emb = np.mean(emb_list, axis=0).astype("float32")

    if query_emb.shape[1] != index.d:
        raise ValueError(f"Dim mismatch: query {query_emb.shape[1]}, index {index.d}")

    # Search in FAISS
    scores, idxs = index.search(query_emb, top_k)
    return [(texts[i], float(scores[0][j])) for j, i in enumerate(idxs[0])]

# Test
sample_query = "I feel anxious and can't sleep"
results = search_local_expanded(sample_query, top_k=5)

print(f"\nüîç Results for query: '{sample_query}'\n")
for r, s in results:
    print(f"Score: {s:.4f} | Text: {r[:100]}...")



üîÑ Variations for 'I feel anxious and can't sleep':
- I feel anxious and can't sleep
- I feel anxious and cannot sleep
- I feel worried and can't sleep

üîç Results for query: 'I feel anxious and can't sleep'

Score: 0.0561 | Text: Why do I go crazy if there's silence? ...
Score: 0.0560 | Text: will prazosin stop all dreams? ...
Score: 0.0560 | Text: Not feeling valid because I don't have a special interest... ...
Score: 0.0560 | Text: I feel like my trauma is never valid enough. ...
Score: 0.0560 | Text: Crying randomly throughout the day? Does anyone find themselves doing this? I‚Äôm not sure if it‚Äôs a s...


In [39]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import os

# === 1. Paths ===
cleaned_csv_path = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_cleaned.csv"
index_save_path = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\mental_health.index"
embeddings_save_path = r"C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_embeddings.csv"

# === 2. Load cleaned text data ===
print("üìÇ Loading cleaned dataset...")
df = pd.read_csv(cleaned_csv_path)
# If you have text in a column like "body" or "text", change here:
text_col = "body" if "body" in df.columns else df.columns[0]
texts = df[text_col].astype(str).tolist()
print(f"‚úÖ Loaded {len(texts)} documents.")

# === 3. Load SAME embedding model for corpus and queries ===
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # or the exact model you used earlier
print(f"üì• Loading embedding model: {model_name}")
model = SentenceTransformer(model_name)

# === 4. Encode corpus ===
print("‚öô Encoding corpus...")
embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
embeddings = np.array(embeddings, dtype="float32")
print(f"‚úÖ Embeddings shape: {embeddings.shape}")

# === 5. Save embeddings CSV (optional for inspection) ===
pd.DataFrame(embeddings).to_csv(embeddings_save_path, index=False)
print(f"üíæ Saved embeddings to {embeddings_save_path}")

# === 6. Build FAISS index ===
print("‚öô Building FAISS index...")
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # Inner product works with normalized vectors for cosine
index.add(embeddings)
print(f"‚úÖ FAISS index built with {index.ntotal} vectors.")

# === 7. Save index ===
faiss.write_index(index, index_save_path)
print(f"üíæ Saved FAISS index to {index_save_path}")

# === 8. Quick self-test ===
sample_query = "I feel anxious and can't sleep"
print(f"\nüîç Testing query: {sample_query}")
q_emb = model.encode([sample_query], normalize_embeddings=True).astype("float32")
scores, idxs = index.search(q_emb, k=5)
for score, idx in zip(scores[0], idxs[0]):
    print(f"Score: {score:.4f} | Text: {texts[idx][:100]}...")


üìÇ Loading cleaned dataset...


  df = pd.read_csv(cleaned_csv_path)


‚úÖ Loaded 149015 documents.
üì• Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
‚öô Encoding corpus...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4657/4657 [1:12:49<00:00,  1.07it/s]  


‚úÖ Embeddings shape: (149015, 384)
üíæ Saved embeddings to C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\aimind_embeddings.csv
‚öô Building FAISS index...
‚úÖ FAISS index built with 149015 vectors.
üíæ Saved FAISS index to C:\Users\ASUS-PC\Desktop\mindcheck_ai_jupyter\data\mental_health.index

üîç Testing query: I feel anxious and can't sleep
Score: 0.7606 | Text: Anxiety and Sleep Issues ...
Score: 0.7606 | Text: Anxiety and Sleep Issues ...
Score: 0.7523 | Text: Anxious but tired. ...
Score: 0.7345 | Text: Random days of intense anxiety. Can't get out of bed ...
Score: 0.7240 | Text: It feels like I can't feel anything when I am anxious about something ...
