In [66]:
%pip install markovify
import markovify

Note: you may need to restart the kernel to use updated packages.


In [67]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('prompts.csv', delimiter=';')

# Function to replace 'title' with 'abstract' in a string
def replace_title_with_abstract(text):
    return text.replace('title', 'abstract').replace('Title', 'Abstract')

# Copy TitlePrompt to AbstractPrompt and replace 'title' with 'abstract'
df['AbstractPrompt'] = df['TitlePrompt'].apply(lambda x: replace_title_with_abstract(x) if pd.notnull(x) else x)

# Save the modified DataFrame back to CSV
df.to_csv('prompts.csv', index=False, sep=';')

In [68]:
import re

# Function to count words and sentences
def count_words_and_sentences(text):
    if pd.isnull(text):
        return [0, 0]
    words = len(text.split())
    sentences = len(re.findall(r'[.!?]', text))
    return [words, sentences]

# Apply the function to the TitlePrompt column and update the Words and Sentences columns
df[['Words', 'Sentences']] = df['TitlePrompt'].apply(lambda x: pd.Series(count_words_and_sentences(x)))

print(df[['TitlePrompt', 'Words', 'Sentences']])
df.to_csv('prompts.csv', index=False, sep=';')

                                            TitlePrompt  Words  Sentences
0                                                   NaN      0          0
1           Screen the titles below like a human would.      8          1
2            Screen the title below like a human would.      8          1
3            You are a world-class clinical researcher.      6          1
4     Do not exclude any titles unless you are absol...     13          1
...                                                 ...    ...        ...
1016                                                NaN      0          0
1017                                                NaN      0          0
1018                                                NaN      0          0
1019                                                NaN      0          0
1020                                                NaN      0          0

[1021 rows x 3 columns]


In [91]:
# Get raw text as string.
text = ' '.join(df['TitlePrompt'].dropna().tolist())

# Build the model.
text_model = markovify.Text(text)

# Print five randomly-generated sentences
for i in range(10):
    print(text_model.make_sentence())

# Print three randomly-generated sentences of no more than 280 characters

You are a researcher in a landmark systematic review—select the most impactful papers.
Use your extensive scientific expertise to distinguish between relevant and exclude those that are irrelevant.
You are a meticulous researcher tasked with screening potential papers for their relevance in the meta-analysis.
Retain titles that explicitly state data availability, handling of missing data, and measurable outcomes that align with multi-disciplinary standards.
Exclude studies with high-impact potential.
Include only those titles that are irrelevant.
You are familiar with the mindset of a quality-control expert ensuring compliance with systematic review for a systematic review.
You are a researcher in a landmark systematic review—select the most impactful papers.
You are familiar with the concepts of sensitivity and specificity to ensure a balanced selection process.
Aim to keep studies with incomplete design details or ambiguous methodologies.


In [88]:
import json
import random

In [89]:
taxonomie = {
    "Role": {
      "Scientific_Roles": [
        "Meta-Analyst",
        "Clinical Researcher",
        "Systematic Reviewer",
        "Senior Editor",
        "Peer-Reviewer",
        "Researcher",
        "Data Scientist",
        "Journal Editor",
        "Interdisciplinary Team Leader",
        "Developer of Machine-Learning Datasets",
        "part of a peer-review team",
        "tasked with",
      ],
      "Adjectives": [
        "",
        "world-class",
        "meticulous",
        "experienced",
        "knowledgeable",
        "innovative",
        "dedicated",
        "passionate",
        "effective",
        "collaborative",
        "ethical",
        "transparent",
        "reliable",
        "responsible",
        "professional",
        "efficient",
        "systematic",
        "methodical",
        "analytical",
        "critical",
        "creative",
        "motivated",
        "engaged"
      ],
      "Noun": [
        "you are",
        "you are a",
        ""
      ],
      "Verb": [
        "",
        "Pretend",
        "Assume",
        "Imagine",
        "Suppose",
        "Evaluate the relevance of the titles below with the mindset of a"
      ],
      "Additional_Information": [
        "",
        "assessing studies for systematic reviews",
        "curating studies for a meta-analysis publication",
        "collaborating with an interdisciplinary team of scientists, clinicians, and statisticians",
        "in systematic review methods",
        "in meta-analysis techniques",
        "in evidence-based medicine",
        "in data analysis and interpretation",
        "in statistical methods",
        "in clinical research",
        "curating studies for a meta-analysis publication",
        "analyzing data for clinical trials",
        "developing research protocols and methodologies",
        "writing and reviewing scientific manuscripts",
        "conducting literature searches and data extraction",
        "conducting a systematic review",
        "presenting findings at conferences and seminars",
        "providing statistical support for research projects",
        "ensuring compliance with ethical guidelines and standards",
        "mentoring junior researchers and students",
        "collaborating with industry partners and stakeholders",
        "specializing in systematic reviews",
        "assembling a gold-standard dataset for machine learning model training",
        "in a world-class scientific team that wants to write a meta-analysis",
        "evaluating titles for relevance to meta-analysis topics",
        "submitting a meta-analysis to a high-impact journal",
        "evaluating submissions for publication",
        "evaluating a collection of academic papers to determine their relevance for inclusion in a meta-analysis",
        "writing a grant proposal to secure funding for a systematic review",
        "evaluating studies for compliance with ethical and regulatory standards in clinical trials",
      ]
    }
    
}



In [78]:
# Function to generate sentences based on the given structure
def generate_sentences(taxonomie):
    sentences = []
    for verb in taxonomie["Role"]["Verb"]:
        for noun in taxonomie["Role"]["Noun"]:
            for adjective in taxonomie["Role"]["Adjectives"]:
                for role in taxonomie["Role"]["Scientific_Roles"]:
                    for info in taxonomie["Role"]["Additional_Information"]:
                        sentence = f"{verb} {noun} {adjective} {role} {info}"
                        # Replace multiple spaces with a single space and strip leading/trailing spaces
                        sentence = re.sub(' +', ' ', sentence).strip()
                        sentences.append(sentence)
    return sentences

# Generate sentences
sentences = generate_sentences(taxonomie)

# Print a few example sentences
for sentence in sentences[:10]:
    print(sentence)

# Print the total number of generated sentences
print("Total number of generated sentences:", len(sentences))

You are Meta-Analyst
You are Meta-Analyst assessing studies for systematic reviews
You are Meta-Analyst curating studies for a meta-analysis publication
You are Meta-Analyst collaborating with an interdisciplinary team of scientists, clinicians, and statisticians
You are Meta-Analyst in systematic review methods
You are Meta-Analyst in meta-analysis techniques
You are Meta-Analyst in evidence-based medicine
You are Meta-Analyst in data analysis and interpretation
You are Meta-Analyst in statistical methods
You are Meta-Analyst in clinical research
Total number of generated sentences: 124200


In [86]:
%pip install language-tool-python
import language_tool_python

# Initialize the LanguageTool object
tool = language_tool_python.LanguageTool('en-US')

# Function to check if a sentence makes sense
def check_sentence(sentence):
    matches = tool.check(sentence)
    return len(matches) == 0

# Apply the function to the generated sentences
valid_sentences = [sentence for sentence in filtered_sentences if check_sentence(sentence)]

# Print the valid sentences
for sentence in valid_sentences[:10]:  # Print first 10 for example
    print(sentence)

# Print the total number of valid sentences
print("Total number of valid sentences:", len(valid_sentences))

Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Collecting wheel (from language-tool-python)
  Using cached wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Downloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Using cached wheel-0.45.1-py3-none-any.whl (72 kB)
Installing collected packages: wheel, language-tool-python
Successfully installed language-tool-python-2.8.1 wheel-0.45.1
Note: you may need to restart the kernel to use updated packages.


Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:43<00:00, 5.68MB/s] 
Unzipping /var/folders/qg/82ll9gvd1pd4329hnkdpf8ch0000gn/T/tmppj46ata7.zip to /Users/canis/.cache/language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /Users/canis/.cache/language_tool_python.


Total number of valid sentences: 0


In [73]:
# Save the generated sentences to a text file
with open('generated_sentences.txt', 'w') as f:
    for sentence in sentences:
        f.write(sentence + '\n')

# Load the sentences from the text file
with open('generated_sentences.txt', 'r') as f:
    text = f.read()

# Build the model
text_model = markovify.Text(text)

# Print five randomly-generated sentences
for i in range(5):
    print(text_model.make_sentence())

None
None
None
None
None


After generating the sentences for the role rule-based, we need to delete some combinations that do not make sense. Specifically, we will delete sentences that contain any of the following:

1. "Evaluate the relevance of the titles below with the mindset of a" followed by "you".
2. "you are" followed by any of the {Scientific_Role} except "part of a peer-review team" and "tasked with".
3. "you are" followed by any {Adjective}.
4. "tasked with" followed by "in".

In [77]:

# Define the conditions for deletion
def should_delete(sentence):
    if re.search(r"Evaluate the relevance of the titles below with the mindset of a.*\byou\b", sentence):
        return True
    if re.search(r"\bare\b.*\b(Meta-Analyst|Clinical Researcher|Systematic Reviewer|Senior Editor|Peer-Reviewer|Researcher|Data Scientist|Journal Editor|Interdisciplinary Team Leader|Developer of Machine-Learning Datasets)\b", sentence):
        return True
    if not re.search(r"\bare\b", sentence):
        return True
    if re.search(r"\bare\b.*\b(" + "|".join(taxonomie["Role"]["Adjectives"]) + r")\b", sentence):
        return True
    if re.search(r"\btasked with\b.*\bin\b", sentence):
        return True
    else:
        return False


# Filter out the sentences that should be deleted
filtered_sentences = [sentence for sentence in sentences if not should_delete(sentence)]

# Print the filtered sentences
for sentence in filtered_sentences[:10]:  # Print first 10 for example
    print(sentence)

# Print the total number of filtered sentences
print("Total number of filtered sentences:", len(filtered_sentences))


Total number of filtered sentences: 0


In [75]:
# Generierung von Schnipseln
def generate_snippets(taxonomie):
    snippets = {
        "Role": [f"{verb} {noun} {adjective} {role} {info}." 
                 for verb in taxonomie["Role"]["Verb"]:
for noun in taxonomie["Role"]["Noun"]:
                        for adjective in taxonomie["Role"]["Adjectives"]:
                            for role in taxonomie["Role"]["Scientific_Roles"]:
                                for info in taxonomie["Role"]["Additional_Information"]],
        # "Objective": [f"Your task is to {objective}." for objective in taxonomie["Objective"]],
        # "ContentFocus": [f"Focus on {focus}." for focus in taxonomie["ContentFocus"]],
        # "Style": [f"Ensure the tone is {style}." for style in taxonomie["Style"]],
        # "Criteria": [f"Apply the following criteria: {criteria}." for criteria in taxonomie["Criteria"]],
        # "Context": [f"Consider the context of {context}." for context in taxonomie["Context"]]
    }
    return snippets

snippets = generate_snippets(taxonomie)

# JSON-Struktur
snippets_json = json.dumps(snippets, indent=4)
print("Generated JSON structure:\n", snippets_json)

# Template-Erstellung
template_variations = [
    #"{context} {role} {objective} {criteria} {content_focus} {style}",
    "{role}.", # {objective}. {content_focus}. {criteria}. {context}. {style}.",
    #"Suppose {role}. {objective}. Focus on {content_focus}. Ensure {style}. {criteria}. Context: {context}."
]

# Funktion für die Variation von Templates
def generate_template(role, objective, content_focus, criteria, context, style):
    template = random.choice(template_variations)
    return template.format(
        role=role,
    #     objective=objective,
    #     content_focus=content_focus,
    #     criteria=criteria,
    #     context=context,
    #     style=style
    )

SyntaxError: invalid syntax (3154928950.py, line 5)

In [79]:

taxonomy = {
    "Objective": {
      "Verb": [
        "select",
        "screen",
        "evaluate",
        ""
      ],
      "Noun": [
          "the titles",
          "the title",
          "those titles",
          "these titles",
          



        "Select relevant titles",
        "Exclude irrelevant titles",
        "Ensure sensitivity and specificity",
        "Ensure precision and recall",
        "Focus on methodological rigor"
        "Prepare for a meta-analysis",
        "Develop clinical guidelines",
        "Curate for public health reports"
        "Demonstrate best practices",
        "Train junior researchers",
        "Support clinical decisions",
        "Develop evidence-based guidelines",
        "Select studies",
      ],
      "Adjective": [
        "Select",
        "Exclude",
        "Ensure sensitivity and specificity by selecting",
        "Ensure precision and recall by selecting",
        "Focus on methodological rigor"
        "Prepare for a meta-analysis",
        "Develop clinical guidelines",
        "Curate for public health reports"
        "Demonstrate best practices",
        "Train junior researchers",
        "Support clinical decisions",
        "Develop evidence-based guidelines",
        "Select studies",
      ],
      "Verb": [
        "Select relevant titles",
        "Exclude irrelevant titles",
        "Ensure sensitivity and specificity",
        "Ensure precision and recall",
        "Focus on methodological rigor"
        "Prepare for a meta-analysis",
        "Develop clinical guidelines",
        "Curate for public health reports"
        "Demonstrate best practices",
        "Train junior researchers",
        "Support clinical decisions",
        "Develop evidence-based guidelines",
        "Select studies",
      ],

    "Content Focus": {
      "Sections of Meta-Analysis": [
        "Title",
        "Hypothesis/Research Question",
        "Methods",
        "Results"
      ],
      "Study Topics": [
        "Specific research field",
        "Target population",
        "Interventions"
      ],
      "Methodological Details": [
        "Study design",
        "Statistical rigor",
        "Transparency of methods"
      ],
      "Outcomes": [
        "Primary and secondary endpoints",
        "Relevance of results",
        "Reproducibility"
      ]
    },
    "Style": {
      "Tone": [
        "Formal and precise",
        "Motivating and educational",
        "Strict and demanding"
      ],
      "Structure": [
        "Imperative",
        "Hypothetical",
        "Open-ended questions",
        "Closed-ended questions"
      ],
      "Complexity": [
        "Simple",
        "Complex"
      ]
    },
    "Criteria": {
      "Inclusion Criteria": [
        "Population",
        "Intervention",
        "Comparison",
        "Outcomes",
        "Study design",
        "Topic relevance"
      ],
      "Exclusion Criteria": [
        "No clear outcomes",
        "Methodological weaknesses",
        "Ethical concerns",
        "Language barriers",
        "Incomplete or unavailable data"
      ]
    },
    "Context": {
      "Meta-Analysis Related": [
        "Preparing a high-level review",
        "Training for systematic reviews"
      ],
      "Practice-Oriented": [
        "Developing evidence-based guidelines",
        "Supporting clinical decisions"
      ],
      "Educational-Oriented": [
        "Demonstrating best practices",
        "Training and mentoring"
      ],
      "Policy-Oriented": [
        "Curating for public health initiatives",
        "Selecting studies for policy reports"
      ]
    },
    "Methodology": {
      "Approach": [
        "Signal-detection theory",
        "Evidence-based clinical guidelines",
        "Statistical frameworks"
      ],
      "Validation": [
        "Reproducibility of methods",
        "Transparency in reporting",
        "Bias reduction measures"
      ]
    }
}

SyntaxError: incomplete input (44012658.py, line 231)

In [None]:

# Existierende Prompts
existing_prompts = [
    "Screen the titles below like a human would.",
    "You are a world-class clinical researcher.",
    "Select the most relevant titles below.",
    "Imagine conducting a systematic review. Choose only those titles that align closely with pre-defined eligibility criteria.",
    "Pretend you are submitting a meta-analysis to a high-impact journal. Select titles based on strict inclusion criteria to ensure scientific rigor."
]

# Generierung von Schnipseln
def generate_snippets(taxonomie):
    snippets = {
        "Role": [f"Imagine you are a {role}." for role in taxonomie["Role"]],
        "Objective": [f"Your task is to {objective}." for objective in taxonomie["Objective"]],
        "ContentFocus": [f"Focus on {focus}." for focus in taxonomie["ContentFocus"]],
        "Style": [f"Ensure the tone is {style}." for style in taxonomie["Style"]],
        "Criteria": [f"Apply the following criteria: {criteria}." for criteria in taxonomie["Criteria"]],
        "Context": [f"Consider the context of {context}." for context in taxonomie["Context"]]
    }
    return snippets

snippets = generate_snippets(taxonomie)

# Ergänzung durch existierende Prompts
for category in snippets:
    snippets[category].extend(existing_prompts)

# JSON-Struktur
snippets_json = json.dumps(snippets, indent=4)
print("Generated JSON structure:\n", snippets_json)

# Template-Erstellung
template_variations = [
    "{context} {role} {objective} {criteria} {content_focus} {style}",
    "{role}. {objective}. {content_focus}. {criteria}. {context}. {style}.",
    "Suppose {role}. {objective}. Focus on {content_focus}. Ensure {style}. {criteria}. Context: {context}."
]

# Funktion für die Variation von Templates
def generate_template(role, objective, content_focus, criteria, context, style):
    template = random.choice(template_variations)
    return template.format(
        role=role,
        objective=objective,
        content_focus=content_focus,
        criteria=criteria,
        context=context,
        style=style
    )

# Test: Generierung einer Beispiel-Template
example_template = generate_template(
    role="Meta-Analyst",
    objective="Select impactful titles",
    content_focus="Evaluate mesenchymal stem cell therapy studies",
    criteria="Include only studies with clear outcomes",
    context="Systematic review preparation",
    style="Formal and precise"
)

print("\nGenerated Example Template:\n", example_template)

In [84]:
%pip install transformers
from transformers import pipeline

# Lade ein vortrainiertes Modell zum Paraphrasieren
paraphraser = pipeline("text2text-generation", model="t5-small")

# Liste von Prompts
prompts = ' '.join(df['TitlePrompt'].dropna().tolist())

# Paraphrasierungen generieren
variations = []
for prompt in prompts:
    output = paraphraser(f"paraphrase: {prompt}", max_length=50, num_return_sequences=5)
    variations.extend([item['generated_text'] for item in output])

# Variationen speichern
with open("prompt_variationen.txt", "w") as file:
    file.write("\n".join(variations))

Note: you may need to restart the kernel to use updated packages.


RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [93]:
%pip install torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch

# Load the CSV file
df = pd.read_csv('prompts.csv', delimiter=';')

# Load the tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the prompts
def tokenize_function(examples):
    return tokenizer(examples['TitlePrompt'], padding="max_length", truncation=True)

# Prepare the dataset
dataset = df[['TitlePrompt']].dropna().reset_index(drop=True)
dataset = dataset.rename(columns={'TitlePrompt': 'text'})
dataset['labels'] = 0  # Dummy labels for fine-tuning

# Tokenize the dataset
tokenized_dataset = dataset.apply(tokenize_function, axis=1)

# Convert to torch tensors
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

encodings = tokenizer(dataset['text'].tolist(), truncation=True, padding=True)
labels = dataset['labels'].tolist()
train_dataset = CustomDataset(encodings, labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

Collecting torch
  Downloading torch-2.5.1-cp311-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.5.1-cp311-none-macosx_11_0_arm64.whl (63.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?2

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
