In [9]:
pip install transformers datasets torch rouge-score



In [10]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("pubmed_qa", "pqa_labeled")
train_dataset = dataset['train'].select(range(300))  # Use first 300 samples for training
eval_dataset = dataset['train'].select(range(300, 330))  # Reserve 30 samples for evaluation

In [11]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a padding token, so we use EOS token

# Function to tokenize input data
def preprocess_function(examples):
    # Use list comprehension to concatenate question and context for each example in the batch
    inputs = [str(q) + " " + str(c) for q, c in zip(examples["question"], examples["context"])] # Changed to convert q and c to strings explicitly
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Use the `long_answer` as the label for fine-tuning
    # Use the same max_length as the input to ensure consistent shapes.
    labels = tokenizer(examples["long_answer"], max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing to datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [12]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load GPT-2 pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # You can increase this if you have more GPU memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
trainer.train()



`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



Epoch,Training Loss,Validation Loss
1,No log,0.841785
2,No log,0.834062
3,No log,0.835757


TrainOutput(global_step=225, training_loss=0.9186393229166666, metrics={'train_runtime': 159.7019, 'train_samples_per_second': 5.635, 'train_steps_per_second': 1.409, 'total_flos': 235162828800000.0, 'train_loss': 0.9186393229166666, 'epoch': 3.0})

In [13]:
import math

# Get evaluation loss
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]

# Calculate perplexity
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity}")


Perplexity: 2.3065588051900607


In [14]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Generate predictions for the evaluation dataset
predictions = trainer.predict(eval_dataset).predictions.argmax(-1) # Get predicted token ids by taking argmax of logits
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

# Compare predictions with the actual labels
for i in range(30):  # Loop through evaluation samples
    reference = eval_dataset[i]['labels']
    decoded_reference = tokenizer.decode(reference, skip_special_tokens=True)

    score = scorer.score(decoded_reference, decoded_preds[i])
    print(f"ROUGE Scores for sample {i}: {score}")

ROUGE Scores for sample 0: {'rouge1': Score(precision=0.29411764705882354, recall=0.11363636363636363, fmeasure=0.16393442622950818), 'rougeL': Score(precision=0.29411764705882354, recall=0.11363636363636363, fmeasure=0.16393442622950818)}
ROUGE Scores for sample 1: {'rouge1': Score(precision=0.42857142857142855, recall=0.20689655172413793, fmeasure=0.2790697674418604), 'rougeL': Score(precision=0.25, recall=0.1206896551724138, fmeasure=0.16279069767441862)}
ROUGE Scores for sample 2: {'rouge1': Score(precision=0.21052631578947367, recall=0.0975609756097561, fmeasure=0.13333333333333333), 'rougeL': Score(precision=0.15789473684210525, recall=0.07317073170731707, fmeasure=0.09999999999999999)}
ROUGE Scores for sample 3: {'rouge1': Score(precision=0.4444444444444444, recall=0.2, fmeasure=0.2758620689655173), 'rougeL': Score(precision=0.3888888888888889, recall=0.175, fmeasure=0.24137931034482757)}
ROUGE Scores for sample 4: {'rouge1': Score(precision=0.2413793103448276, recall=0.16279069

In [15]:
!pip install nltk spacy sentence-transformers scikit-learn plotly
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy

# Download stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Initialize stemming and lemmatization tools
stemmer = PorterStemmer()
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    # Tokenize and remove stop words
    tokens = [word for word in text.split() if word.lower() not in stop_words]

    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # Apply lemmatization
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(stemmed_tokens))]

    return " ".join(lemmatized_tokens)

# Load or define your dataset here
# For example, you might load it from a file or create a sample dataset
dataset = {"train": [{"context": "This is an example sentence."}, {"context": "Another example for preprocessing."}]}

# Apply preprocessing on the dataset
processed_texts = [preprocess_text(sample["context"]) for sample in dataset["train"]]
print (processed_texts)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

[W111] Jupyter notebook detected: if using `prefer_gpu()` or `require_gpu()`, include it in the same cell right before `spacy.load()` to ensure that the model is loaded on the correct device. More information: http://spacy.io/usage/v3#jupyter-notebook-gpu



['exampl sentence .', 'anoth exampl preprocesse .']


In [17]:
from sentence_transformers import SentenceTransformer

# Load pre-trained sentence transformer model
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert preprocessed texts into embeddings
embeddings = embedder.encode(processed_texts)



`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [18]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between all pairs of embeddings
similarities = cosine_similarity(embeddings)

# Get top 100 word pairs based on similarity scores
top_100_pairs = np.unravel_index(np.argsort(similarities, axis=None)[-100:], similarities.shape)

# Extract the corresponding word pairs and their similarity scores
top_pairs_with_scores = [(processed_texts[i], processed_texts[j], similarities[i][j])
                         for i, j in zip(top_100_pairs[0], top_100_pairs[1])]


In [20]:
import plotly.graph_objs as go
import networkx as nx

# Create a network graph
G = nx.Graph()

# Add nodes (words) to the graph
for i in range(len(processed_texts)):
    G.add_node(i, label=processed_texts[i])

# Add edges for the top 100 word pairs
for i, j, score in top_pairs_with_scores:
    G.add_edge(i, j, weight=score)

# Get node positions using a spring layout for better visualization
pos = nx.spring_layout(G)

# Plotly traces for edges and nodes
edge_trace = go.Scatter(
    x=[],
    y=[],
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines')

# Add edges to the trace
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_trace['x'] = edge_trace['x'] + (x0, x1, None) # Change from += to = and convert list to tuple
    edge_trace['y'] = edge_trace['y'] + (y0, y1, None) # Change from += to = and convert list to tuple

node_trace = go.Scatter(
    x=[],
    y=[],
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        size=10,
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left',
            titleside='right')))

# Add nodes to the trace
for node in G.nodes():
    x, y = pos[node]
    node_trace['x'] = node_trace['x'] + (x,)
    node_trace['y'] = node_trace['y'] + (y,)
    if 'label' in G.nodes[node]:
        node_trace['text'] = node_trace['text'] + (G.nodes[node]['label'],)

# Create the layout for the Plotly visualization
layout = go.Layout(
    title='Interactive Visualization of Top 100 Word Pairs',
    titlefont=dict(size=16),
    showlegend=False,
    hovermode='closest',
    margin=dict(b=0, l=0, r=0, t=0),
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False))

# Combine traces into a figure and display
fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
fig.show()
