
# 1. Word Embeddings with Gensim (Word2Vec)


In [6]:
# pip install gensim spacy


# Step 2: Load and Preprocess Text Data


In [10]:
import gensim
from gensim.models import Word2Vec
import spacy

# Load a pre-trained SpaCy tokenizer to process text
nlp = spacy.load('en_core_web_sm')

# Sample text corpus (you can replace this with your own text)
corpus = [
    "Data science is an inter-disciplinary field that uses scientific methods",
    "Machine learning is a subset of artificial intelligence",
    "Natural language processing involves computational methods for text analysis"
]

# Tokenizing the corpus using SpaCy
tokenized_corpus = [[token.text.lower() for token in nlp(text)] for text in corpus]



# Step 3: Train Word2Vec Model



In [11]:
# Training the Word2Vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Save the model
model.save("word2vec.model")


# Step 4: Using the Trained Embedding Model


In [12]:
# Load the trained model
model = Word2Vec.load("word2vec.model")

# Get the vector for a specific word
word_vector = model.wv['data']
print("Word Vector for 'data':", word_vector)

# Find most similar words to a given word
similar_words = model.wv.most_similar('data', topn=5)
print("Words similar to 'data':", similar_words)


Word Vector for 'data': [-6.9636069e-03 -2.4585116e-03 -8.0229370e-03  7.5005279e-03
  6.1274157e-03  5.2584694e-03  8.3778575e-03 -6.9653272e-04
 -9.3127284e-03  9.1156662e-03 -4.9285362e-03  7.8479899e-03
  5.5338596e-03 -1.0790766e-03 -7.6642158e-03 -1.4598024e-03
  6.2535368e-03 -6.9660828e-03  1.4420962e-03 -7.9518585e-03
  8.7213479e-03 -2.8557885e-03  9.4373021e-03 -5.7080747e-03
 -9.7177243e-03 -8.6279036e-03 -4.0748348e-03  4.7095944e-03
 -2.4193883e-04  9.2235124e-03  3.1092144e-03  3.7477673e-03
  2.9963492e-03  8.1486488e-03 -2.3967146e-03  7.4073388e-03
 -9.5367134e-03  2.9210865e-03 -6.8166968e-04  4.5225740e-04
  6.8430100e-03 -2.8419732e-03 -2.3567795e-03 -1.0047674e-04
 -4.9769162e-04 -3.5749613e-03  6.2444829e-03 -6.5586674e-03
  7.8919996e-03 -9.3460083e-05  2.6088404e-03  3.2231498e-03
 -2.8165340e-04  1.7063022e-03 -3.1406546e-03  4.7564553e-03
  2.4301052e-04 -3.2805956e-03 -8.7145744e-03 -9.9980794e-03
  3.1277776e-04 -5.7468102e-03 -1.1096597e-03 -4.2060935e-03


# 2. Generative Models for Text Synthesis with Hugging Face Transformers (GPT-2 or GPT-3)

In [None]:
# pip install transformers torch


In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained model and tokenizer
model_name = 'gpt2'  # You can switch to 'gpt2-medium' or 'gpt2-large' for more power
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Encode input text
input_text = "Artificial intelligence is revolutionizing the world by"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate text
output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text: Artificial intelligence is revolutionizing the world by creating new ways to interact with the human brain.

The new technology is called artificial intelligence, or AI. It is a new way of thinking about the brain, and it is being used to create new kinds of information. The new information is not just about what you are doing, but also about how you feel. This new knowledge is used by the AI to help us understand our emotions, our thoughts, how we feel, what we think,


In [15]:
from transformers import Trainer, TrainingArguments

# Example fine-tuning dataset (replace with your own data)
train_texts = ["Artificial intelligence improves healthcare.", "AI is transforming industries."]

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

# Fine-tune the model using the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

# 3. Text Summarization Using BART (Another Generative Model)


In [16]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load pre-trained BART model and tokenizer
model_name = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Input text for summarization
article = """
Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.
"""

# Encode input text
inputs = tokenizer(article, return_tensors="pt", max_length=1024, truncation=True)

# Generate summary
summary_ids = model.generate(inputs["input_ids"], max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Summary: Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents"


In [17]:

print("Summary:", summary)


Summary: Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents"
