In [1]:
# Install necessary libraries (if not installed)
!pip install transformers torch sentencepiece rouge-score nltk scikit-learn tqdm

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_

In [2]:
# Install necessary libraries (if not installed)
# !pip install transformers torch sentencepiece rouge-score nltk scikit-learn tqdm

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from tqdm import tqdm
from google.colab import files

# Upload CSV File
print("Upload your dataset (CSV file)...")
uploaded = files.upload()

# Get the uploaded file name
file_name = list(uploaded.keys())[0]
print(f"Dataset uploaded: {file_name}")

# Load dataset
df = pd.read_csv(file_name)

# Ensure the dataset contains the expected column
if 'sentence' not in df.columns:
    raise ValueError("Dataset must contain a column named 'sentence'")

# Check if CUDA is available (for GPU acceleration)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load DeepSeek-R1-Distill-Qwen-7B model
model_name = "deepseek-ai/deepseek-r1-distill-qwen-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

def paraphrase(text):
    """Paraphrase input text using DeepSeek-R1-Distill-Qwen-7B."""
    prompt = f"Paraphrase this sentence: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,  # Limit new tokens for paraphrasing
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            repetition_penalty=2.0
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Apply paraphrasing to the dataset
tqdm.pandas()
df["paraphrased_sentence"] = df["sentence"].progress_apply(paraphrase)

# Compute similarity using TF-IDF + Cosine Similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["sentence"].tolist() + df["paraphrased_sentence"].tolist())

# Extracting the similarity scores
original_vectors = tfidf_matrix[:len(df)]
paraphrased_vectors = tfidf_matrix[len(df):]

similarities = [cosine_similarity(original_vectors[i], paraphrased_vectors[i])[0][0] for i in range(len(df))]

# Add similarity scores to DataFrame
df["similarity_score"] = similarities

# Calculate mean and standard deviation of similarity scores
mean_similarity = np.mean(similarities)
std_dev_similarity = np.std(similarities)

# Initialize ROUGE and BLEU score computation
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
bleu_scores = []
rouge_scores = {"rouge-1": [], "rouge-2": [], "rouge-l": []}

smoothing = SmoothingFunction().method1

for i in range(len(df)):
    ref = df.loc[i, "sentence"]
    pred = df.loc[i, "paraphrased_sentence"]

    # Compute BLEU score
    bleu = sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
    bleu_scores.append(bleu)

    # Compute ROUGE scores
    rouge_score = scorer.score(ref, pred)
    rouge_scores["rouge-1"].append(rouge_score["rouge1"].fmeasure)
    rouge_scores["rouge-2"].append(rouge_score["rouge2"].fmeasure)
    rouge_scores["rouge-l"].append(rouge_score["rougeL"].fmeasure)

# Add BLEU and ROUGE scores to DataFrame
df["bleu_score"] = bleu_scores
df["rouge-1"] = rouge_scores["rouge-1"]
df["rouge-2"] = rouge_scores["rouge-2"]
df["rouge-l"] = rouge_scores["rouge-l"]

# Calculate mean and standard deviation for BLEU and ROUGE scores
mean_bleu = np.mean(bleu_scores)
std_bleu = np.std(bleu_scores)

mean_rouge_1 = np.mean(rouge_scores["rouge-1"])
std_rouge_1 = np.std(rouge_scores["rouge-1"])

mean_rouge_2 = np.mean(rouge_scores["rouge-2"])
std_rouge_2 = np.std(rouge_scores["rouge-2"])

mean_rouge_l = np.mean(rouge_scores["rouge-l"])
std_rouge_l = np.std(rouge_scores["rouge-l"])

# Save results
output_file = "PP-002.csv"
df.to_csv(output_file, index=False)

# Provide download link for output file
files.download(output_file)

# Print evaluation results
print(f"Results saved to {output_file}")
print(f"Mean Similarity Score: {mean_similarity:.4f}")
print(f"Standard Deviation of Similarity Scores: {std_dev_similarity:.4f}")
print(f"Mean BLEU Score: {mean_bleu:.4f}, Standard Deviation: {std_bleu:.4f}")
print(f"Mean ROUGE-1 Score: {mean_rouge_1:.4f}, Standard Deviation: {std_rouge_1:.4f}")
print(f"Mean ROUGE-2 Score: {mean_rouge_2:.4f}, Standard Deviation: {std_rouge_2:.4f}")
print(f"Mean ROUGE-L Score: {mean_rouge_l:.4f}, Standard Deviation: {std_rouge_l:.4f}")


Upload your dataset (CSV file)...


Saving paraphtrase_dataset.csv to paraphtrase_dataset.csv
Dataset uploaded: paraphtrase_dataset.csv
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

  0%|          | 0/480 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 2/480 [00:03<14:02,  1.76s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  1%|          | 3/480 [00:05<14:25,  1.81s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  1%|          | 4/480 [00:07<14:34,  1.84s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  1%|          | 5/480 [00:09<14:48,  1.87s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  1%|▏         | 6/480 [00:11<14:53,  1.89s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  1%|▏         | 7/480 [00:13<14:52,  1.89s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  2%|▏         | 8/480 [00:14<14:52,  1.89s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  2%|▏         | 9/480 [00:16<14:50,  1.89s/it]S

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Results saved to PP-002.csv
Mean Similarity Score: 0.4843
Standard Deviation of Similarity Scores: 0.1369
Mean BLEU Score: 0.1450, Standard Deviation: 0.0938
Mean ROUGE-1 Score: 0.3002, Standard Deviation: 0.0976
Mean ROUGE-2 Score: 0.2758, Standard Deviation: 0.0993
Mean ROUGE-L Score: 0.3002, Standard Deviation: 0.0976
