[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/VitcSS/Word_Embedding/blob/master/deliver.ipynb)

In [2]:
import gensim.downloader as api
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.test.utils import datapath
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
try :
    import umap
except:
    !pip install umap
finally:
    import umap
import matplotlib.pyplot as plt

In [8]:

def train_word2vec_model(dataset, model_save_path):
    """Download a gensim-data corpus by name, train Word2Vec on it and save the model.

    NOTE(review): another function with this exact name is defined further down
    in this cell and replaces this one as soon as the cell has run. Merge the
    two or rename one of them.

    Args:
        dataset: Corpus name passed to ``gensim.downloader.load``.
        model_save_path: File path passed to ``model.save``.

    Raises:
        TypeError: If a corpus record is neither a string nor a list/tuple of
            tokens.
    """
    # Download the dataset
    corpus = api.load(dataset)

    # Tokenize the text. Records may be raw strings or already tokenized lists
    # (gensim-data's text8 yields token lists); calling .lower() directly on a
    # list raises AttributeError, so each shape is handled explicitly.
    tokenized_text = []
    for record in corpus:
        if isinstance(record, str):
            tokenized_text.append(word_tokenize(record.lower()))
        elif isinstance(record, (list, tuple)):
            tokenized_text.append([str(token).lower() for token in record])
        else:
            raise TypeError(
                "Unsupported corpus record type: " + type(record).__name__
            )

    # Train the Word2Vec model
    model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=5, workers=4)

    # Save the trained model
    model.save(model_save_path)

def download_text8():
    """Load the "text8" corpus through the gensim downloader and return it."""
    dataset_name = "text8"
    text8 = api.load(dataset_name)
    return text8

def download_wikipedia_dump():
    """Load the English Wikipedia corpus through the gensim downloader.

    gensim-data registers its English Wikipedia corpus under the name
    "wiki-english-20171001". The previous name, "enwiki-latest-pages-articles",
    is the Wikimedia dump file name and is not a gensim-data corpus, so
    ``api.load`` rejected it.

    NOTE(review): records of this corpus are presumably dicts with "title",
    "section_titles" and "section_texts" keys rather than plain sentences.
    Confirm against the gensim-data catalogue and extract the text before
    tokenizing.

    Returns:
        Whatever ``api.load`` returns for the corpus.
    """
    # Note: This is a large file, and the download may take some time
    return api.load("wiki-english-20171001")

def train_word2vec_model(corpus, model_save_path):
    """Train a Word2Vec model on an in-memory corpus and save it to disk.

    Args:
        corpus: Iterable of records. A record may be a raw string (lower-cased
            and split on whitespace) or a list/tuple of tokens (each token
            lower-cased). Any other record type is stringified and split, as
            before.
        model_save_path: File path passed to ``model.save``.
    """
    tokenized_text = []
    for sentence in corpus:
        if isinstance(sentence, str):
            tokens = sentence.lower().split()
        elif isinstance(sentence, (list, tuple)):
            # Already tokenized. Stringifying the list would tokenize its repr
            # and yield tokens such as "['word'," with brackets, quotes and commas.
            tokens = [str(token).lower() for token in sentence]
        else:
            # Unknown record shape: keep the previous best-effort behaviour.
            tokens = str(sentence).lower().split()
        tokenized_text.append(tokens)

    # Train the Word2Vec model
    model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=5, workers=4)

    # Save the trained model
    model.save(model_save_path)

def evaluate_word_similarity(model_path, similarity_dataset):
    """Print the Spearman correlation of a saved model on a word-similarity set.

    Args:
        model_path: Path of a model previously written with ``model.save``.
        similarity_dataset: File name resolved through gensim's ``datapath``.
    """
    # Load the trained model
    model = Word2Vec.load(model_path)

    # evaluate_word_pairs returns (pearson, spearman, oov_ratio). The Spearman
    # result is the second element, and its first field is the correlation
    # coefficient. Index 0 of the outer tuple is the Pearson result.
    _pearson, spearman, _oov_ratio = model.wv.evaluate_word_pairs(datapath(similarity_dataset))
    print("Spearman correlation for Word Similarity Task:", spearman[0])

def evaluate_word_analogy(model_path, analogy_dataset):
    """Print the analogy-task score of a saved Word2Vec model.

    Args:
        model_path: Path of a model previously written with ``model.save``.
        analogy_dataset: File name resolved through gensim's ``datapath``.
    """
    keyed_vectors = Word2Vec.load(model_path).wv
    questions_file = datapath(analogy_dataset)

    # The first element of the evaluation result is the value that gets reported.
    results = keyed_vectors.evaluate_word_analogies(questions_file)
    accuracy = results[0]
    print("Accuracy on Word Analogy Task:", accuracy)

def visualize_word_embeddings(model_path, max_words=500):
    """Project word vectors to 2-D with UMAP and show a labelled scatter plot.

    Projecting and annotating an entire vocabulary produces an unreadable
    figure and is very slow, so only the first ``max_words`` entries of
    ``model.wv.index_to_key`` are used.

    Args:
        model_path: Path of a model previously written with ``model.save``.
        max_words: Maximum number of vocabulary entries to project and label.
            Pass ``None`` to use the whole vocabulary (the previous behaviour).

    Raises:
        ValueError: If ``max_words`` is negative.
    """
    if max_words is not None and max_words < 0:
        raise ValueError("max_words must be non-negative or None")

    # Load the trained Word2Vec model
    model = Word2Vec.load(model_path)

    # Select the words to display. NOTE(review): gensim is expected to keep
    # index_to_key sorted by descending frequency by default, which would make
    # this the most frequent words. Confirm for models built with other settings.
    words = list(model.wv.index_to_key)
    if max_words is not None:
        words = words[:max_words]

    # Indexing with a list of keys returns one stacked 2-D array.
    word_vectors = model.wv[words]

    # Dimensionality reduction using UMAP
    reducer = umap.UMAP()
    embedding = reducer.fit_transform(word_vectors)

    # Scatter plot with one text label per projected word
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(embedding[:, 0], embedding[:, 1], marker=".", s=5, alpha=0.5)
    for word, (x_coord, y_coord) in zip(words, embedding):
        ax.annotate(word, xy=(x_coord, y_coord), fontsize=8)
    ax.set_title("UMAP Visualization of Word Embeddings")
    plt.show()


In [9]:

def _benchmark_saved_model(model_path):
    """Run the word-similarity and word-analogy evaluations on a saved model."""
    evaluate_word_similarity(model_path, "wordsim353.tsv")
    evaluate_word_analogy(model_path, "questions-words.txt")


# --- Text8: fetch the corpus, train on it, then benchmark the saved model ---
text8_corpus = download_text8()
text8_model_path = "word2vec_model_text8.model"
train_word2vec_model(text8_corpus, text8_model_path)
_benchmark_saved_model(text8_model_path)

# --- Wikipedia: same pipeline on the Wikipedia corpus ---
wikipedia_corpus = download_wikipedia_dump()
wikipedia_model_path = "word2vec_model_wikipedia.model"
train_word2vec_model(wikipedia_corpus, wikipedia_model_path)
_benchmark_saved_model(wikipedia_model_path)

# Plot the embeddings learned from the Wikipedia corpus
visualize_word_embeddings(wikipedia_model_path)


: 