In [1]:
# !pip install gensim

## Our Objective
Train **Word2Vec** and **GloVe** embeddings on a custom corpus (Wikibooks and Quora), and compare their ability to capture semantic relationships through multiple downstream tasks (such as word similarity and analogy tasks).

In [3]:
## preprocessing

# Clean text: lowercase, remove punctuation, stopwords, etc.
# Tokenize and prepare the corpus for embedding training.

In [4]:
import nltk
from nltk.tokenize import word_tokenize

import pandas as pd 
import gensim
from gensim.models import Word2Vec



import sqlite3
from gensim.utils import simple_preprocess

In [5]:
import logging

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


In [6]:
# Open the Wikibooks SQLite file. The connection is left open on purpose:
# `cursor` is reused by the following cell.
conn = sqlite3.connect('/kaggle/input/wikibooks-dataset/wikibooks.sqlite')
cursor = conn.cursor()

# Read every row (all columns) of table `en` into memory.
# Cursor.execute() returns the cursor itself, so the fetch can be chained.
raw_eng_text = cursor.execute("SELECT * from en").fetchall()

In [7]:
# Recover the column names of table `en`. Each row returned by
# PRAGMA table_info describes one column; field 1 of that row is its name.
cursor.execute("PRAGMA table_info(en);")
column_names = [column[1] for column in cursor.fetchall()]

# Create a pandas DataFrame from the fetched data
df_eng_text = pd.DataFrame(raw_eng_text, columns=column_names)

# Only the last expression of a cell is displayed, so the longest-document
# length is printed explicitly instead of being computed and thrown away.
print("Longest body_text (characters):", df_eng_text.body_text.str.len().max())
len(df_eng_text)

86736

In [8]:
# Keep only the article text; note this is a Series, not a DataFrame.
sub_df = df_eng_text.body_text

In [9]:
# Preview the body-text Series (the notebook display elides the middle rows).
sub_df 

0        Front Page: Radiation Oncology | RTOG Trials |...
1        Băuturi/Beverages[edit | edit source]\nTea : C...
2        Karrigell is an open Source Python web framewo...
3        setupUnitPanel[edit | edit source]\nHelper fun...
4        Contents\n\n1 The Concept\n2 The System\n3 The...
                               ...                        
86731    Previous: Self Help\n\nIndex\n\nNext: Variable...
86732    ← Contributing\n\nCalculus\n\nAlgebra →\n\n\nP...
86733    There are 11 castles in Somerset.\n\n\n\n\nNam...
86734    Contents\n\n1 CULTURAL STUDIES AND IDENTITY\n\...
86735    Sardine is a nutritious oily fish.\n沙丁鱼是一种有营养的...
Name: body_text, Length: 86736, dtype: object

In [10]:
# Load the question-pairs CSV; its question column(s) are merged into the
# training corpus in the next cell.
# NOTE(review): presumably the Quora data mentioned in the objective - confirm.
df = pd.read_csv('/kaggle/input/question-pairs-dataset/questions.csv')

In [11]:
# Raw corpus: both sides of every question pair, followed by the Wikibooks
# body text. The original concatenated `question1` twice, which only
# duplicated half of the pairs and left the other half out of the corpus.
# NOTE(review): assumes the CSV has a `question2` column - confirm.
texts = [*df['question1'].values.tolist(), *df['question2'].values.tolist(), *sub_df.values.tolist()]

In [12]:
# Tokenize every document, keep purely alphanumeric tokens (drops punctuation)
# and lowercase them. Lowercasing matters: all later vocabulary lookups
# ('invest', 'king', 'paris', 'france', ...) are lowercase, so capitalised
# corpus tokens such as 'Paris' would otherwise never match.
# str() is kept so non-string entries do not crash the tokenizer
# (presumably missing values - confirm).
# NOTE: this cell overwrites `texts`; re-run the previous cell before re-running it.
texts = [
    [word.lower() for word in word_tokenize(str(sentence)) if word.isalnum()]
    for sentence in texts
]

In [None]:
# Hyperparameters shared by both architectures, so that the only difference
# between the two models is the training algorithm (`sg`).
w2v_params = dict(vector_size=100, window=5, min_count=1, alpha=0.03, min_alpha=0.0007, epochs=100)

# Passing the corpus to the constructor already builds the vocabulary and runs
# all `epochs` training passes. The former extra `.train(..., epochs=100)`
# calls therefore trained each model a second time (200 epochs in total, with
# the learning rate restarted), so they have been removed.
cbow_model = Word2Vec(texts, sg=0, **w2v_params)      # sg=0 -> CBOW
skipgram_model = Word2Vec(texts, sg=1, **w2v_params)  # sg=1 -> Skip-gram



In [None]:
# Display the trained CBOW model's repr as a quick sanity check.

cbow_model

In [None]:
# Word-vector lookups for each trained model.
word_vectors_cbow = cbow_model.wv
word_vectors_skipgram = skipgram_model.wv

# Score the same probe pair under both architectures for a side-by-side view.
similarity_cbow = word_vectors_cbow.similarity('invest', 'market')
print(f"Similarity between 'invest', 'market': {similarity_cbow} with CBOW")

similarity_skip = word_vectors_skipgram.similarity('invest', 'market')
print(f"Similarity between 'invest', 'market': {similarity_skip} with Skip-Gram")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def similarity_score(word1, word2, model, is_glove=False):
    """Return the cosine similarity between the embeddings of two words.

    Parameters
    ----------
    word1, word2 : str
        The words to compare.
    model
        With ``is_glove=False``: an object whose ``wv`` attribute maps a word
        to its vector. With ``is_glove=True``: an object exposing
        ``dictionary`` (word -> row index) and ``word_vectors`` (indexable by
        that row index).
    is_glove : bool, default False
        Selects which of the two lookup schemes above is used.

    Returns
    -------
    float or None
        The cosine similarity, or ``None`` when either word is not in the
        model's vocabulary.
    """
    try:
        if is_glove:
            vec1 = model.word_vectors[model.dictionary[word1]]
            vec2 = model.word_vectors[model.dictionary[word2]]
        else:
            vec1 = model.wv[word1]
            vec2 = model.wv[word2]
    except KeyError:
        # Out-of-vocabulary word. Only this case maps to None; a bare `except`
        # here used to hide real bugs (wrong model type, missing import, ...)
        # behind the same None result.
        return None
    # Computed outside the try block so that failures in the similarity
    # computation itself are never mistaken for a missing word.
    return cosine_similarity([vec1], [vec2])[0][0]

# Probe pairs for a qualitative comparison of the two models.
words_to_test = [('king', 'queen'), ('apple', 'banana'), ('paris', 'france')]

for w1, w2 in words_to_test:
    print(f"\n{w1} - {w2}")
    # Same pair, each model in turn; similarity_score yields None for OOV words.
    for label, embedding_model in (("CBOW:", cbow_model), ("Skip-gram:", skipgram_model)):
        print(label, similarity_score(w1, w2, embedding_model))
