In [1]:
%pip install pandas numpy matplotlib sentence-transformers faiss-cpu langdetect scikit-learn

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\kotag\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
# Read the training CSV, keep a reproducible 1% sample of its rows, and drop
# every sampled row that has a missing value in any column.
df = (
    pd.read_csv('data_train.csv', encoding='latin1')
    .sample(frac=0.01, random_state=42)
    .dropna()
)

# Only the first question of each row is indexed further down.
questions = df['question1'].values

In [3]:
# Peek at the first five sampled questions to sanity-check the load.
questions[:5]

array(['How do I play Pokémon GO in Korea?',
       'What are some of the best side dishes for crab cakes?',
       'Which is more advisable and better material for a crash test in automobiles, ductile or brittle?',
       'How do I improve logical programming skills?',
       'How close we are to see 3rd world war?'], dtype=object)

In [4]:
# Placeholder cell: sentence-transformers replaces TensorFlow Hub, so there is
# nothing left to set up here.
# NOTE(review): this cell only prints a status line and does no work; it can
# be deleted without affecting the rest of the notebook.
print("Dependencies installed successfully")

Dependencies installed successfully


In [5]:
# Placeholder cell: the notebook uses sentence-transformers instead, so no
# extra setup happens here.
# NOTE(review): this cell only prints a status line and does no work; it can
# be deleted without affecting the rest of the notebook.
print("All dependencies ready")

All dependencies ready


In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
from langdetect import detect
import warnings
warnings.filterwarnings('ignore')

class Multilingual_Encoder:
    """
    Thin wrapper around a sentence-transformers model that turns text into
    embedding vectors.

    The wrapped model is exposed as ``self.model`` so callers can query it
    directly (for example for its embedding dimension).
    """
    def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        """
        Load the sentence-transformers model called ``model_name``.

        Options noted by the original author:
        - 'paraphrase-multilingual-MiniLM-L12-v2' (lightweight, 384-dim)
        - 'distiluse-base-multilingual-cased-v2' (larger, 512-dim)
        - 'xlm-r-distilroberta-base' (multilingual)

        Prints the model name before loading and the embedding dimension
        once the model is ready.
        """
        print(f"Loading model: {model_name}")
        self.model = SentenceTransformer(model_name)
        print(f"Model loaded. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

    def encode(self, text):
        """
        Encode a single text or a batch of texts.

        Args:
            text: one text, or a ``list`` / NumPy array of texts.

        Returns:
            A 2-D array with one row per text for a list or array input,
            otherwise the 1-D embedding of the single text.
        """
        # A NumPy array of strings (e.g. a DataFrame column's ``.values``) is a
        # batch, not a single text. Without this conversion it would be wrapped
        # in a one-element list and only one row would come back.
        if isinstance(text, np.ndarray):
            text = text.tolist()
        if isinstance(text, list):
            return self.model.encode(text, convert_to_numpy=True)
        return self.model.encode([text], convert_to_numpy=True)[0]

    def encode_batch(self, texts, batch_size=32):
        """
        Encode many texts in one call.

        Args:
            texts: sequence of texts to embed.
            batch_size: number of texts handed to the model per forward pass.

        Returns:
            Whatever the model returns with ``convert_to_numpy=True``
            (one embedding row per input text).
        """
        return self.model.encode(texts, batch_size=batch_size, convert_to_numpy=True)

def detect_language(text):
    """
    Detect the language of ``text`` with langdetect.

    Args:
        text: the text to classify.

    Returns:
        The language code reported by ``langdetect.detect`` (e.g. "en"), or
        the string "unknown" when the input is empty, not a string, or
        detection fails for any reason.

    NOTE(review): the langdetect README says detection is non-deterministic
    unless ``DetectorFactory.seed`` is set, so short or ambiguous inputs may
    yield different codes between runs.
    """
    # Nothing to classify: skip the detector instead of relying on it to raise.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # Best-effort helper: any detector failure maps to "unknown".
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        return "unknown"





In [7]:
import faiss
from sklearn.preprocessing import normalize

class Multilingual_FAISS:
    """
    Flat FAISS inner-product index over L2-normalised embeddings.

    Because both stored and query vectors are normalised to unit length, the
    inner product returned by the index is their cosine similarity.

    Attributes:
        dimensions: embedding size the index was created with.
        index: the underlying ``faiss.IndexFlatIP``.
        vectors: maps insertion position -> (text, normalised vector).
        counter: number of entries added so far (also the next position).
        texts: the stored texts in insertion order.
    """
    def __init__(self, dimensions):
        """Create an empty index for embeddings of size ``dimensions``."""
        self.dimensions = dimensions
        self.index = faiss.IndexFlatIP(dimensions)  # Inner product (cosine similarity)
        self.vectors = {}
        self.counter = 0
        self.texts = []

    def add(self, text, v):
        """
        Add one text and its embedding to the index.

        Args:
            text: the text the embedding belongs to.
            v: embedding as a NumPy array; it is reshaped to a single row.
        """
        # Same path as the batch insert, so both share one implementation.
        self.add_batch([text], v.reshape(1, -1))

    def add_batch(self, texts, vectors):
        """
        Add multiple texts and embeddings at once (faster than one by one).

        Args:
            texts: sequence of texts.
            vectors: 2-D array-like with one embedding row per text.

        Raises:
            ValueError: if ``texts`` and ``vectors`` differ in length. The
                index is left untouched in that case.
        """
        # The index accepts every row of ``vectors`` while the lookup table is
        # filled per text. Unequal lengths would leave index positions with no
        # text (or texts shifted against their vectors), so fail up front.
        if len(texts) != len(vectors):
            raise ValueError(
                f"texts and vectors must have the same length "
                f"(got {len(texts)} texts and {len(vectors)} vectors)"
            )
        if len(texts) == 0:
            return  # nothing to add

        # Unit-length rows make the inner product equal to cosine similarity;
        # FAISS expects float32 input.
        vectors_normalized = normalize(vectors, norm='l2').astype('float32')

        self.index.add(vectors_normalized)
        for text, vec in zip(texts, vectors_normalized):
            self.vectors[self.counter] = (text, vec)
            self.texts.append(text)
            self.counter += 1

    def search(self, v, k=5, threshold=0.3):
        """
        Search for the stored texts most similar to a query embedding.

        Prints a ranked table of the hits and returns them.

        Args:
            v: query embedding vector (NumPy array)
            k: maximum number of neighbours requested from the index
            threshold: minimum cosine similarity for a hit to be kept;
                it is compared on the percentage scale (threshold * 100)

        Returns:
            List of ``(text, similarity_percent)`` tuples in index order,
            containing only hits at or above the threshold.
        """
        # Normalize the query the same way the stored vectors were normalized
        v_normalized = normalize(v.reshape(1, -1), norm='l2').astype('float32')

        distances, item_idx = self.index.search(v_normalized, k)

        results = []
        print(f"\n{'Rank':<5} {'Similarity':<12} {'Result':<100}")
        print("-" * 120)

        for rank, (d, i) in enumerate(zip(distances[0], item_idx[0]), 1):
            # The index pads with -1 when it holds fewer than k entries.
            if i == -1:
                break

            # Convert cosine similarity to percentage
            similarity = float(d) * 100

            if similarity >= threshold * 100:
                # The index returns NumPy integers; the table is keyed by int.
                text, _ = self.vectors[int(i)]
                results.append((text, similarity))
                print(f"{rank:<5} {similarity:>10.2f}% {text[:95]:<100}")

        return results


In [8]:
# Build the search stack: encoder -> question embeddings -> FAISS index.
MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
ENCODE_BATCH_SIZE = 64

print("Initializing multilingual encoder...")
encoder = Multilingual_Encoder(MODEL_NAME)

# The index must be created with the same width as the encoder's embeddings.
dimension = encoder.model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {dimension}")

faiss_index = Multilingual_FAISS(dimension)

# One batched encode call instead of encoding the questions one by one.
print(f"\nEncoding {len(questions)} questions...")
question_embeddings = encoder.encode_batch(questions, batch_size=ENCODE_BATCH_SIZE)

# Likewise, insert every embedding with a single batched add.
print("Building FAISS index...")
faiss_index.add_batch(questions, question_embeddings)
print(f"Index built with {faiss_index.counter} questions")


Initializing multilingual encoder...
Loading model: paraphrase-multilingual-MiniLM-L12-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded. Embedding dimension: 384
Embedding dimension: 384

Encoding 4043 questions...
Building FAISS index...
Index built with 4043 questions
Building FAISS index...
Index built with 4043 questions


In [9]:
def run_language_test(number, language, query, k=5, threshold=0.2):
    """
    Run one labelled search against the notebook's index and print the result.

    Prints a banner, the query and its detected language, then lets
    ``faiss_index.search`` print the ranked hits.

    Args:
        number: test number shown in the banner.
        language: language label shown in the banner (e.g. "English").
        query: the query text.
        k: number of neighbours to request from the index.
        threshold: minimum similarity passed on to the index search.

    Returns:
        Tuple ``(detected_language, query_vector, results)``.
    """
    print("\n" + "="*120)
    print(f"TEST {number}: {language} Query")
    print("="*120)
    detected = detect_language(query)
    print(f"Query: {query}")
    print(f"Detected Language: {detected}")

    query_vec = encoder.encode(query)
    return detected, query_vec, faiss_index.search(query_vec, k=k, threshold=threshold)


# Test 1: English query
query1 = "How to learn machine learning?"
lang1, query_vec1, results1 = run_language_test(1, "English", query1)

# Test 2: Spanish query
query2 = "¿Cómo aprender programación?"
lang2, query_vec2, results2 = run_language_test(2, "Spanish", query2)

# Test 3: Hindi query
query3 = "प्रोग्रामिंग कैसे सीखें"
lang3, query_vec3, results3 = run_language_test(3, "Hindi", query3)

# Display the last result list as the cell's output value.
results3



TEST 1: English Query
Query: How to learn machine learning?
Detected Language: en

Rank  Similarity   Result                                                                                              
------------------------------------------------------------------------------------------------------------------------
1          99.32% How can I learn machine learning?                                                                   
2          73.94% What are the best books about machine learning?                                                     
3          70.24% What are some good Machine Learning books for a beginner?                                           
4          51.63% What is the basic difference between inferential statistics and machine learning?                   
5          50.47% How do I improve my learning and understanding capabilities?                                        

TEST 2: Spanish Query
Query: ¿Cómo aprender programación?
Detected Language: es

[('From where can I learn programming?', 81.83436393737793),
 ('How to learn coding?', 80.07534742355347),
 ('What is best way to start learning programming?', 73.65648746490479),
 ('What is the best and most efficient way to learn and master a programming language?',
  71.57421708106995),
 ('Fundamentally, what is programming?', 64.77685570716858)]

In [12]:

# Test 3: Hindi query
# NOTE(review): this cell repeats the Hindi check that the combined test cell
# already runs and re-assigns query3 / lang3 / query_vec3; consider removing it.
separator = "="*120
query3 = "प्रोग्रामिंग कैसे सीखें"
lang3 = detect_language(query3)

print("\n" + separator)
print("TEST 3: Hindi Query")
print(separator)
print(f"Query: {query3}")
print(f"Detected Language: {lang3}")

# The search prints the ranked table; its return value is the cell output.
query_vec3 = encoder.encode(query3)
faiss_index.search(query_vec3, k=5, threshold=0.2)



TEST 3: Hindi Query
Query: प्रोग्रामिंग कैसे सीखें
Detected Language: hi

Rank  Similarity   Result                                                                                              
------------------------------------------------------------------------------------------------------------------------
1          81.83% From where can I learn programming?                                                                 
2          80.08% How to learn coding?                                                                                
3          73.66% What is best way to start learning programming?                                                     
4          71.57% What is the best and most efficient way to learn and master a programming language?                 
5          64.78% Fundamentally, what is programming?                                                                 


[('From where can I learn programming?', 81.83436393737793),
 ('How to learn coding?', 80.07534742355347),
 ('What is best way to start learning programming?', 73.65648746490479),
 ('What is the best and most efficient way to learn and master a programming language?',
  71.57421708106995),
 ('Fundamentally, what is programming?', 64.77685570716858)]

In [10]:
# Analyze dataset languages
from collections import Counter

print("\n" + "="*120)
print("DATASET LANGUAGE ANALYSIS")
print("="*120)
print(f"Total questions in dataset: {len(questions)}")

# Detect the language of a small prefix only: detection runs once per
# question, so the check is capped at the first 100 questions.
sample_questions = questions[:min(100, len(questions))]
languages_detected = [detect_language(q) for q in sample_questions]

lang_counts = Counter(languages_detected)
# Report the real sample size; the dataset may hold fewer than 100 questions.
print(f"\nLanguages found in first {len(sample_questions)} questions:")
for lang, count in lang_counts.most_common():
    print(f"  {lang}: {count}")



DATASET LANGUAGE ANALYSIS
Total questions in dataset: 4043

Languages found in first 100 questions:
  en: 98
  es: 1
  pt: 1

Languages found in first 100 questions:
  en: 98
  es: 1
  pt: 1


In [11]:
# Interactive search function
def multilingual_search(query, top_k=5, threshold=0.2):
    """
    Run a semantic search for ``query`` against the notebook's FAISS index.

    The query and its detected language are printed first; the index search
    then prints its own ranked table.

    Args:
        query: Search query in any language
        top_k: Number of results to request from the index
        threshold: Minimum similarity score passed to the index search

    Returns:
        Whatever ``faiss_index.search`` returns for the encoded query
        (a list of ``(text, similarity_score)`` tuples).
    """
    language = detect_language(query)

    # Header lines shown above the result table.
    print(f"\nQuery: {query}")
    print(f"Detected Language: {language}")
    print("-" * 120)

    embedding = encoder.encode(query)
    return faiss_index.search(embedding, k=top_k, threshold=threshold)

# Example usage
# results = multilingual_search("What is Python?", top_k=5)
