In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import gensim.downloader 
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
# download glove vectors
glove_vectors=gensim.downloader.load('glove-twitter-25')



2025-08-23 17:29:42,235 : INFO : glove-twitter-25 downloaded
2025-08-23 17:29:42,237 : INFO : loading projection weights from /home/prajna/gensim-data/glove-twitter-25/glove-twitter-25.gz
2025-08-23 17:30:08,228 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (1193514, 25) matrix of type float32 from /home/prajna/gensim-data/glove-twitter-25/glove-twitter-25.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-08-23T17:30:08.228678', 'gensim': '4.3.3', 'python': '3.12.3 (main, Apr 15 2024, 18:25:56) [Clang 17.0.6 ]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [5]:
import gzip
import io
def load_vectors(path):
    try:

        with gzip.open(path, "rb") as gz_file:
            with io.TextIOWrapper(gz_file, encoding='utf-8') as text_file:
                for line in text_file:
                    print(line.strip())
    except FileNotFoundError:
        logging.error(f"Error: file not found at {path}")
    except Exception as e:
        logging.error(f"An exception occured: {str(e)}")



In [21]:
# check vocabulary of the model
def check_vocabulary(wv):
    for index, word in enumerate(wv.index_to_key):
        if index==10:
            break
        print(f"word #{index}/{len(wv.index_to_key)} is {word}")

In [22]:
check_vocabulary(glove_vectors)

word #0/1193514 is <user>
word #1/1193514 is .
word #2/1193514 is :
word #3/1193514 is rt
word #4/1193514 is ,
word #5/1193514 is <repeat>
word #6/1193514 is <hashtag>
word #7/1193514 is <number>
word #8/1193514 is <url>
word #9/1193514 is !


In [None]:
# 25 dimensional vector
glove_vectors.get_vector('king')


array([-0.74501 , -0.11992 ,  0.37329 ,  0.36847 , -0.4472  , -0.2288  ,
        0.70118 ,  0.82872 ,  0.39486 , -0.58347 ,  0.41488 ,  0.37074 ,
       -3.6906  , -0.20101 ,  0.11472 , -0.34661 ,  0.36208 ,  0.095679,
       -0.01765 ,  0.68498 , -0.049013,  0.54049 , -0.21005 , -0.65397 ,
        0.64556 ], dtype=float32)

In [29]:
from collections import defaultdict
from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

2025-08-23 18:10:27,876 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2025-08-23 18:10:27,878 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2025-08-23 18:10:27,878 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2025-08-23T18:10:27.878943', 'gensim': '4.3.3', 'python': '3.12.3 (main, Apr 15 2024, 18:25:56) [Clang 17.0.6 ]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'created'}


In [34]:
# vectorize and add new vectors
def vector_add(a,b):
    return [x+y for x,y in zip(a,b)]

In [3]:
# iterate through each word and add to the embeddings.
from gensim import utils
from gensim.test.utils import common_texts, common_corpus, common_dictionary, datapath

print(common_texts) #a list of list of individual tokens. This is the format 



2025-08-23 21:52:27,964 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2025-08-23 21:52:27,967 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2025-08-23 21:52:27,969 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2025-08-23T21:52:27.969271', 'gensim': '4.3.3', 'python': '3.12.3 (main, Apr 15 2024, 18:25:56) [Clang 17.0.6 ]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'created'}


[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [4]:
# preprocess the .txt files
with open('tech4dev_blogs/A sprint from the hills.txt') as f:
    for n,l in enumerate(f):
        if n<15:
            print([l])

['Title: A sprint from the hills\n']
['Author: Kurund Jalmi\n']
['Date: November 2021\n']
['Category: Tech4Dev project Updates\n']
['URL: https://projecttech4dev.org/a-sprint-from-the-hills/\n']
['\n']
['The article is written by the Plio team(Aman, Karn, Deepansh). Originally posed\n']
['here\n']
['.\n']
['You know how so many people have been taking full advantage of remote working by working from very exotic-looking places? Yeah, we haven’t been like those people.\n']
['Until last week. We got a chance to travel to Tehri (Uttarakhand, India) where multiple organisations working at the intersection of technology and social impact had assembled for a week-long sprint.\n']
['New Tehri\xa0is a modern town developed at the hilltop, spread over an altitude from 1550-1950 metres above sea level overlooking the gigantic Tehri lake and\xa0the Tehri dam. Every morning, we woke up to an amalgamation of the lake, mountains, clouds, chirping birds, a troop of rafters and the clingiest of dogs in

In [5]:
# build tokenized text from file
def build_texts(file_path):
    with open(file_path) as f:
        for line in f:
            yield utils.simple_preprocess(line, deacc=True, min_len=5)

In [7]:
train_texts=list(build_texts('t4d_corpus_no_empty_lines.txt'))


In [8]:
len(train_texts)

52673

In [10]:
train_texts_cleaned=[word_list for word_list in train_texts if not (isinstance(word_list,list) and len(word_list)==0)]

In [11]:
len(train_texts_cleaned)


43472

In [93]:
with open('t4d_corpora.txt', 'w') as file:
    file.write(train_texts_cleaned)

TypeError: write() argument must be str, not list

In [12]:
train_texts_cleaned[11:12]

[['improving', 'visual', 'designs', 'other', 'parts', 'product']]

In [13]:
# train the word2vwc model
import gensim.models
model=gensim.models.Word2Vec(sentences=train_texts_cleaned,vector_size=300, workers=4, sg=1, hs=0, compute_loss=True, seed=42)

2025-08-23 22:05:14,039 : INFO : collecting all words and their counts
2025-08-23 22:05:14,042 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-08-23 22:05:14,083 : INFO : PROGRESS: at sentence #10000, processed 89105 words, keeping 9207 word types
2025-08-23 22:05:14,137 : INFO : PROGRESS: at sentence #20000, processed 175539 words, keeping 12402 word types
2025-08-23 22:05:14,176 : INFO : PROGRESS: at sentence #30000, processed 264102 words, keeping 12855 word types
2025-08-23 22:05:14,224 : INFO : PROGRESS: at sentence #40000, processed 351754 words, keeping 12855 word types
2025-08-23 22:05:14,244 : INFO : collected 12876 word types from a corpus of 380940 raw words and 43472 sentences
2025-08-23 22:05:14,245 : INFO : Creating a fresh vocabulary
2025-08-23 22:05:14,287 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 6270 unique words (48.70% of original 12876, drops 6606)', 'datetime': '2025-08-23T22:05:14.286885', 'gensim': '

In [17]:
wv=model.wv

In [20]:
print(wv.most_similar(positive=['dalgo'], topn=5))

[('transformation', 0.6316876411437988), ('uncategorized', 0.6107181310653687), ('glific', 0.6060605049133301), ('visualization', 0.5742401480674744), ('airbyte', 0.5701705813407898)]


In [24]:
for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

word #0/6270 is their
word #1/6270 is project
word #2/6270 is about
word #3/6270 is glific
word #4/6270 is which
word #5/6270 is sprint
word #6/6270 is program
word #7/6270 is these
word #8/6270 is platform
word #9/6270 is impact


In [31]:
model.save('model/gensim-t4d-word2Vec.model')

2025-08-23 22:22:06,799 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'model/gensim-t4d-word2Vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-08-23T22:22:06.799524', 'gensim': '4.3.3', 'python': '3.12.3 (main, Apr 15 2024, 18:25:56) [Clang 17.0.6 ]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'saving'}
2025-08-23 22:22:06,804 : INFO : not storing attribute cum_table
2025-08-23 22:22:06,870 : INFO : saved model/gensim-t4d-word2Vec.model


In [46]:
model=gensim.models.Word2Vec.load('model/gensim-t4d-word2Vec.model')

2025-08-23 22:40:18,742 : INFO : loading Word2Vec object from model/gensim-t4d-word2Vec.model
2025-08-23 22:40:18,771 : INFO : loading wv recursively from model/gensim-t4d-word2Vec.model.wv.* with mmap=None
2025-08-23 22:40:18,773 : INFO : setting ignored attribute cum_table to None
2025-08-23 22:40:18,860 : INFO : Word2Vec lifecycle event {'fname': 'model/gensim-t4d-word2Vec.model', 'datetime': '2025-08-23T22:40:18.860889', 'gensim': '4.3.3', 'python': '3.12.3 (main, Apr 15 2024, 18:25:56) [Clang 17.0.6 ]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


In [47]:
import numpy as np

# convert to numpy array for fast search
vectors=model.wv.vectors
vocab_list=list(model.wv.key_to_index.keys())
# Save optimized format
np.savez_compressed("search_embeddings.npz", 
                   vectors=vectors, 
                   vocab=vocab_list)

In [48]:
class HybridSearchEngine:
    def __init__(self, model_path):
        """Initialize with both Gensim and manual methods available"""
        self.model = gensim.models.Word2Vec.load(model_path)
        self.wv = self.model.wv  # KeyedVectors object
        
        # For manual methods
        self.vectors = self.wv.vectors
        self.vocab = list(self.wv.key_to_index.keys())
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        
        # Pre-compute normalized vectors
        norms = np.linalg.norm(self.vectors, axis=1, keepdims=True)
        self.norm_vectors = self.vectors / norms
        
        print(f"Initialized hybrid search with {len(self.vocab)} words")
    
    def gensim_search(self, word, top_k=10):
        """Use Gensim's built-in most_similar method"""
        try:
            return self.wv.most_similar(word, topn=top_k)
        except KeyError:
            return f"'{word}' not in vocabulary"
    def manual_search(self, word, top_k=10):
        """Use manual similarity computation"""
        if word not in self.word_to_idx:
            return f"'{word}' not in vocabulary"
        
        word_idx = self.word_to_idx[word]
        query_vector = self.norm_vectors[word_idx]
        
        similarities = np.dot(self.norm_vectors, query_vector)
        top_indices = np.argsort(similarities)[::-1][1:top_k+1]
        
        return [(self.vocab[i], similarities[i]) for i in top_indices]

In [49]:
search_engine=HybridSearchEngine("model/gensim-t4d-word2Vec.model")


2025-08-23 22:56:54,556 : INFO : loading Word2Vec object from model/gensim-t4d-word2Vec.model
2025-08-23 22:56:54,604 : INFO : loading wv recursively from model/gensim-t4d-word2Vec.model.wv.* with mmap=None
2025-08-23 22:56:54,606 : INFO : setting ignored attribute cum_table to None
2025-08-23 22:56:54,683 : INFO : Word2Vec lifecycle event {'fname': 'model/gensim-t4d-word2Vec.model', 'datetime': '2025-08-23T22:56:54.683593', 'gensim': '4.3.3', 'python': '3.12.3 (main, Apr 15 2024, 18:25:56) [Clang 17.0.6 ]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}


Initialized hybrid search with 6270 words


In [53]:
import time

In [92]:
start_time=time.time()
gensim_similar=search_engine.gensim_search("gilfic")
gensim_time=time.time()-start_time
print(f"✅ most_similar(): {gensim_time:.4f}s")
print(f"   Results: {gensim_similar[:10]}...")

✅ most_similar(): 0.0005s
   Results: 'gilfic' n...


In [85]:
start_time=time.time()
manual_similar=search_engine.manual_search("Cascading")
manual_time=time.time()-start_time
print(f"✅ most_similar(): {manual_time:.4f}s")
print(f"   Results: {manual_similar[:10]}...")

✅ most_similar(): 0.0002s
   Results: 'Cascading...


In [51]:
manual_results=search_engine.manual_search("dalgo")
manual_results


[('transformation', 0.63168764),
 ('uncategorized', 0.61071813),
 ('glific', 0.6060605),
 ('visualization', 0.57424015),
 ('airbyte', 0.5701705),
 ('sprints', 0.5670184),
 ('developer', 0.55133516),
 ('fractional', 0.54705876),
 ('functional', 0.5466279),
 ('connector', 0.5440986)]

In [42]:
t4d_w2v.raw_vocab


defaultdict(int, {})