In [1]:
import gensim
from gensim.models import Word2Vec
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import numpy as np

# Step 1: Load and preprocess your input text file
def load_and_preprocess(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = [sentence.split() for sentence in text.split('.')]
    return sentences

# Step 2: Train Word2Vec model
def train_word2vec(sentences):
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    return model

# Step 3: Fine-tune GPT-2 model
def fine_tune_gpt2(sentences, word2vec_model):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Prepare input data
    input_ids = []
    for sentence in sentences:
        tokens = tokenizer.encode(' '.join(sentence), add_special_tokens=True)
        input_ids.append(torch.tensor(tokens))

    # Fine-tune the model
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    for epoch in range(3):  # Adjust number of epochs as needed
        for batch in input_ids:
            outputs = model(batch, labels=batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    return model, tokenizer

# Step 4: Perform question-answering
def question_answering(question, context, model, tokenizer, word2vec_model):
    # Combine question and context
    input_text = f"Question: {question}\nContext: {context}\nAnswer:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate answer
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)

    return answer

# Main execution
if __name__ == "__main__":
    file_path = "context.txt"
    sentences = load_and_preprocess(file_path)
    
    word2vec_model = train_word2vec(sentences)
    gpt2_model, tokenizer = fine_tune_gpt2(sentences, word2vec_model)

    # Example usage
    context = "A hospital receptionist"
    question = "I need to cancel my appointment for Wednesday."
    
    answer = question_answering(question, context, gpt2_model, tokenizer, word2vec_model)
    print(f"Question: {question}")
    print(f"Answer: {answer}")



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: I need to cancel my appointment for Wednesday.
Answer: Question: I need to cancel my appointment for Wednesday.
Context: A hospital receptionist
Answer: Monday works Wednesday Friday: Bye Monday?: That's fine Monday, Wednesday works fine Wednesday Wednesday: Okay Monday is fine?: Wednesday is Monday: Wednesday's work?: Okay, Monday's Monday will be fine: Friday works Monday Monday Wednesday Monday Friday Monday Tuesday Wednesday Thursday Friday Friday Saturday: works Friday and Wednesday will work Friday?: You're welcome?: Thanks for waiting for me!: Okay thanks for letting me


In [4]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np

# Download necessary NLTK data
nltk.download('punkt')

def preprocess_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    # Tokenize each sentence into words
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
    return tokenized_sentences

def train_word2vec(sentences, vector_size=100, window=5, min_count=1, workers=4):
    model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

def explore_word2vec(model, word):
    try:
        # Get the vector for a word
        vector = model.wv[word]
        print(f"Vector for '{word}': {vector[:5]}... (showing first 5 dimensions)")
        
        # Find similar words
        similar_words = model.wv.most_similar(word, topn=5)
        print(f"\nTop 5 similar words to '{word}':")
        for similar_word, score in similar_words:
            print(f"{similar_word}: {score:.4f}")
        print("\n" + "-"*50 + "\n")
    
    except KeyError:
        print(f"'{word}' not in vocabulary")

# Main execution
if __name__ == "__main__":
    file_path = "context.txt" 
    # Preprocess the text
    sentences = preprocess_text(file_path)
    
    # Train the Word2Vec model
    model = train_word2vec(sentences)
    
    # Save the model
    model.save("word2vec_model.bin")
    print("Word2Vec model trained and saved.")
    
    # Explore the model for all words in the vocabulary
    vocabulary = list(model.wv.key_to_index.keys())
    print(f"\nExploring all {len(vocabulary)} words in the vocabulary:\n")
    
    for word in vocabulary:
        explore_word2vec(model, word)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Word2Vec model trained and saved.

Exploring all 234 words in the vocabulary:

Vector for '.': [-0.00067933  0.00078535  0.00518064  0.00898539 -0.00913443]... (showing first 5 dimensions)

Top 5 similar words to '.':
good: 0.3175
three: 0.2879
such: 0.2826
communication: 0.2538
edgar: 0.2383

--------------------------------------------------

Vector for 'you': [-0.00865384  0.00415911  0.00535747  0.00574585  0.00770615]... (showing first 5 dimensions)

Top 5 similar words to 'you':
asking: 0.2274
pm: 0.2169
phone: 0.1995
such: 0.1979
communication: 0.1968

--------------------------------------------------

Vector for '?': [ 4.8910446e-05  3.2708663e-03 -6.8035577e-03 -1.4380853e-03
  7.7291098e-03]... (showing first 5 dimensions)

Top 5 similar words to '?':
my: 0.3872
&: 0.2187
name: 0.2187
dr.: 0.2093
desk: 0.2061

--------------------------------------------------

Vector for 'a': [-8.3736507e-03  9.6544959e-03 -9.2086309e-05 -2.0035380e-03
  4.6665696e-03]... (showing first 5 d