In [3]:
import requests
import json
from tqdm import tqdm
import time

In [4]:
def generate_text_ollama(prompt, model="llama3.1:latest", api_url="http://10.167.31.201:11434/api/generate",
                         temperature=1.1, top_p=0.9, top_k=100, timeout=300):
    """
    Queries the Ollama API to generate text.

    Sends a single non-streaming generation request and returns the text found
    under the "response" key of the JSON reply.

    Args:
        prompt (str): The prompt to send to the model.
        model (str): The name of the model to use.
        api_url (str): The endpoint for the Ollama API.
        temperature (float): Sampling temperature forwarded in the request "options".
        top_p (float): top_p value forwarded in the request "options".
        top_k (int): top_k value forwarded in the request "options".
        timeout (float | None): Seconds to wait for the server before giving up.
            Pass None to wait indefinitely.

    Returns:
        str: The generated text. An empty string is returned when the request
        fails, the reply is not valid JSON, or the reply has no "response" key.
    """
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
        },
    }

    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.post(api_url, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions where the
        # decoding error is not a RequestException subclass.
        print(f"Error querying Ollama API: {e}")
        # Always return a str so callers can hand the result to json.loads and
        # get a JSONDecodeError instead of a TypeError.
        return ""

    return result.get("response", "")

### JSON templates

In [52]:
# prompt = """Generate a sentence based on the given taxonomy of temporal expression. The sentence should be categorized according to the following criteria:  

# ### **1. Type (Core TimeX2 Taxonomy)**  
# This defines the kind of temporal expression used in the sentence. It falls into one of the following categories:  

# #### **1.1 Precise Time Expressions**  
# These refer to specific points in time with absolute specificity. They include:  
# - **Date:** A calendar date (e.g., *"She was born on October 12, 1998."*).  
# - **Time of day:** A specific time in a day (e.g., *"The meeting starts at 9 AM."*).  
# - **Duration:** A span of time indicating how long something lasts (e.g., *"The war lasted from 1939 to 1945."*, *"The meeting is from 9 AM to 5 PM."*).  

# #### **1.2 Fuzzy Time Expressions**  
# These refer to a generic or imprecise point in time. The exact value is subjective, relative, or undefined. They include:  
# - **Indeterminate date:** An event happening within an interval (e.g. week, weekend, quarter, season), for which it isn't possible to determine a precise date (e.g., *"I'll visit next Wednesday."*, *"Last weekend, I practiced piano."*).  
# - **Indeterminate duration:** A duration without a clear endpoint, categorized as past, present, or future (e.g., *"He has been traveling lately."*, *"In the future, I'd like to learn how to paint."*).  
# - **Indeterminate time of day:** Reference to a time of day to which it's impossible to assign a precise value (e.g., *"I'll see you tonight."*, *"We went shopping this morning."*).  
# - **Unspecified duration:** A duration is given, but its length is ambiguous (e.g., *"The party went on for hours."*).  

# #### **1.3 Set of Times**  
# A reference to multiple disjoint time points, which may or may not be periodic:  
# - **Periodic:** An event recurring with a well-defined frequency (e.g., *"He goes jogging every Friday."*).  
# - **Non-periodic:** A disjoint set of references which are non-repeating (e.g., *"I've visited Rome a few times."*).  

# #### **1.4 Confounder**  
# A misleading use of numbers or dates that might be mistaken for a time expression (e.g., *"The name of the pub is 'October 1969'."*, *"There have been 1988 attempts to colonize Mars."*).  

# #### **1.5 Time-Independent**  
# A sentence without any temporal expression. For example, plain statements or scientific facts (e.g., *"Water boils at 100°C."*, *"the sum of the angles of a triangle is 180 degrees"*, *"The Italian flag is composed of three colors."*).  

# ### **2. Anchoring (Also TimeX2)**  
# Defines whether a time expression is explicitly linked to another time-related reference in the same sentence:  
# - **Date:** A temporal reference tied to another time expression (e.g., *"The course lasts three months. After that, you'll be fluent."*).  
# - **Event:** A temporal reference tied to an event (e.g., *"Three years after the war ended, Italy was still recovering."*).  
# - **None:** No explicit connection to another temporal reference.  

# ### **3. Notability**  
# Indicates whether the event referenced is notable enough to infer a timeframe:  
# - **Yes:** A significant historical or well-known event (e.g., *"My grandfather fought in World War II."*).  
# - **No:** A non-notable event with no global or historical importance (e.g., *"I went to the park last summer."*).  

# ### **4. Tense**  
# Determines the nature of the event:
# - **Past:** The event happened in the past and is concluded (e.g., *"She graduated in 2010."*).  
# - **Present:** The event is happening now (e.g., *"He is currently writing a book."*).  
# - **Future:** The event will happen at some point later (e.g., *"They will meet next Monday."*).  

# ---

# **Output Format:**  
# The response should be a JSON object structured as follows:  
# ```json
# {
#   "sentence": "<generated sentence>",
#   "expression": "<temporal expression contained within the sentence>",
#   "type": "<precise | fuzzy | set | confounder | time-independent>",
#   "subtype": "<date | time of day | duration | indeterminate date | indeterminate duration | indeterminate time of day | unspecified duration | periodic | non-periodic | confounder | time-independent>",
#   "anchoring": "<date | event | none>",
#   "notability": "<yes | no>",
#   "tense": "<past | present | future>"
# }
# ```

# ---

# **Example Outputs:**  

# **Example 1:**
# ```json
# {
#   "sentence": "The bus leaves at 13:17. We have to hurry!",
#   "expression": "13:17",
#   "type": "precise",
#   "subtype": "time of day",
#   "anchoring": "none",
#   "notability": "no",
#   "tense": "future"
# }
# ```

# **Example 2:** 
# ```json
# {
#   "sentence": "The climbers reached the summit at 6:45 AM.",
#   "expression": "6:45 AM",
#   "type": "precise",
#   "subtype": "time of day",
#   "anchoring": "none",
#   "notability": "no",
#   "tense": "past"
# }
# ```

# **Example 3:** 
# ```json
# {
#   "sentence": "At 23:32, the first data started coming in.",
#   "expression": "23:32",
#   "type": "precise",
#   "subtype": "time of day",
#   "anchoring": "none",
#   "notability": "no",
#   "tense": "past"
# }
# ```

# **Example 4:** 
# ```json
# {
#   "sentence": "The couple had to wait until 3:00 PM for the store to open.",
#   "expression": "3:00 PM",
#   "type": "precise",
#   "subtype": "time of day",
#   "anchoring": "none",
#   "notability": "no",
#   "tense": "past"
# }
# ```

# **Example 5:**
# ```json
# {
#   "sentence": "Exactly 2000 students attended the rally.",
#   "expression": "2000",
#   "type": "confounder",
#   "subtype": "confounder",
#   "anchoring": "none",
#   "notability": "no",
#   "tense": "past"
# }
# ```

# **Instructions:**
# - Take note of the examples provided and build entirely new sentences.
# {extra_instruction1}
# {extra_instruction2}
# - Generate a sentence that fits within this taxonomy and return it in the JSON format specified above. 
# - Do not give any context or introduction to your response, simply answer with just the JSON-formatted output. 
# - Ensure that the chosen type, anchoring, notability, and tense are all consistent within the response. 
# - Avoid generating sentences with multiple combined time expressions (e.g. absolutely do not generate something like "Every Thursday, he attends a meeting from 2 PM to 4 PM.").

# **Output:**
# """

# prompt = """Generate a sentence based on the following instructions. The sentence should follow the format "{name} {action} {date}". Actions should be commonplace, but rare enough to not happen every day. Actions should be phrased in a genderless way without mentioning "him" or "her". Dates must be expressed as "the {day} of {month}".

# **Output Format:**  
# The response should be a JSON object structured as follows:  
# ```json
# {
#   "sentence": "<generated sentence>",
#   "date": "<date in which the action occurs, formatted as 'the {day} of {month}'>",
#   "name": "<name of the person performing the action>",
#   "action": "<action performed by the person, in the past tense>"
# }
# ```

# ---

# **Example Outputs:**  

# **Example 1:**
# ```json
# {
#   "sentence": "Mary visited the museum on the 15th of March.",
#   "date": "the 15th of March",
#   "name": "Mary",
#   "action": "visited the museum"
# }
# ```

# **Example 2:**
# ```json
# {
#   "sentence": "John attended the concert on the 22nd of July.",
#   "date": "the 22nd of July",
#   "name": "John",
#   "action": "attended the concert"
# }
# ```

# **Example 3:**
# ```json
# {
#   "sentence": "Alice won a contest on the 5th of November.",
#   "date": "the 5th of November",
#   "name": "Alice",
#   "action": "won a contest"
# }
# ```

# **Example 4:**
# ```json
# {
#   "sentence": "Bob sharpened the pencils on the 30th of January.",
#   "date": "the 30th of January",
#   "name": "Bob",
#   "action": "sharpened the pencils"
# }
# ```

# **Example 5:**
# ```json
# {
#   "sentence": "Charlie solved the puzzle on the 18th of September.",
#   "date": "the 18th of September",
#   "name": "Charlie",
#   "action": "solved the puzzle"
# }
# ```

# **Example 6:**
# ```json
# {
#   "sentence": "Diana assembled the library on the 12th of April.",
#   "date": "the 12th of April",
#   "name": "Diana",
#   "action": "assembled the library"
# }
# ```

# **Instructions:**
# - Do not give any context or introduction to your response, simply answer with just the JSON-formatted output.
# - Be creative, generate unique and somewhat rare sentences that fit the criteria.

# **Output:**
# """

# Few-shot template for "{name} {action} {time}" sentences about daily routines.
# In every example the "time", "name" and "action" fields are literal substrings of
# "sentence", and actions are in the present tense, so the examples agree with the
# output format the model is asked to follow.
prompt = """Generate a sentence based on the following instructions. The sentence should follow the format "{name} {action} {time}". Actions should be commonplace enough to happen every day, but generic enough to happen at every hour of the day. For example, do not mention breakfast, lunch, or dinner as they usually happen in specific parts of the day. Actions should be parts of a routine that happens every day. Actions should be phrased in a genderless way without mentioning "him" or "her". Times must be formatted as HH:MM.

**Output Format:**  
The response should be a JSON object structured as follows:  
```json
{
  "sentence": "<generated sentence>",
  "time": "<time in which the action occurs, formatted as 'HH:MM'>",
  "name": "<name of the person performing the action>",
  "action": "<action performed by the person, in the present tense>"
}
```

---

**Example Outputs:**  

**Example 1:**
```json
{
  "sentence": "Mary washes the dishes at 12:30.",
  "time": "12:30",
  "name": "Mary",
  "action": "washes the dishes"
}
```

**Example 2:**
```json
{
  "sentence": "John reads the newspaper at 17:45.",
  "time": "17:45",
  "name": "John",
  "action": "reads the newspaper"
}
```

**Example 3:**
```json
{
  "sentence": "Alice goes to bed at 16:00.",
  "time": "16:00",
  "name": "Alice",
  "action": "goes to bed"
}
```

**Example 4:**
```json
{
  "sentence": "Bob starts the dishwasher at 21:00.",
  "time": "21:00",
  "name": "Bob",
  "action": "starts the dishwasher"
}
```

**Example 5:**
```json
{
  "sentence": "Charlie waters the plants at 10:30.",
  "time": "10:30",
  "name": "Charlie",
  "action": "waters the plants"
}
```

**Instructions:**
- Do not give any context or introduction to your response, simply answer with just the JSON-formatted output. 
- Be creative, generate unique and somewhat rare sentences that fit the criteria.

**Output:**
"""

In [None]:
from datetime import datetime
import itertools
import os

# Number of independent generations requested from the model in this run.
N_GENERATIONS = 20

# Optional instruction lines for the taxonomy prompt; they are only consumed by
# the commented-out itertools.product loop below.
extra_instructions1 = [
    # "- Generate a set periodic expression.",
    # "- Generate set non-periodic expression.",
    "- Generate exclusively precise time of day expression.",
    # "- Generate exclusively precise duration expression.",
    # "- Generate fuzzy unspecified duration expression.",
    # "- Generate fuzzy indeterminate duration expression.",
    # "- Generate fuzzy indeterminate time of day expression.",
    # "- Generate confounder expression.",
    # "- Generate time-independent expression.",
    # "- Generate precise date expression.",
    # "- Generate fuzzy indeterminate date expression.",
]
extra_instructions2 = [
    "- Only generate sentences with a time expression at the beginning. (e.g. \"Every Sunday, she visits her grandparents.\", \"In 1999, he won the championship.\", \"Last summer, they went on a road trip.\")",
    "- Only generate sentences with a time expression at the end. (e.g. \"She visits her grandparents every Sunday.\", \"He won the championship in 1999.\", \"They went on a road trip last summer.\")",
    "- Only generate sentences with a time expression in the middle. (e.g. \"The party was held on Saturday night, and a lot of people attended.\", \"He won the championship in 1999, and it was a great moment.\", \"They went on a road trip last summer, and it was a memorable experience.\")",
]

# Timestamp shared by the JSON file written here and the CSV written in the next cell.
unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
outputs = []
# for eis in tqdm(list(itertools.product(extra_instructions1, extra_instructions2))):
#     ei1, ei2 = eis
#     prompt_ei = prompt.replace("{extra_instruction1}", ei1).replace("{extra_instruction2}", ei2)
for _ in tqdm(range(N_GENERATIONS)):
    output = generate_text_ollama(prompt, temperature=1.)
    try:
        outputs.append(json.loads(output))
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the reply is not valid JSON.
        # TypeError: the reply is not a string at all (e.g. an error sentinel).
        # Anything else (including KeyboardInterrupt) is allowed to propagate.
        print(output)
        continue
    time.sleep(1)

# Save the outputs to a file
output_path = f"datasets/generated/ollama_outputs_{unique_id}.json"
# Create the target directory on a fresh checkout instead of failing after generation.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    json.dump(outputs, f, indent=2)

In [52]:
# Turn the parsed JSON records into a table and export it as CSV,
# using the same timestamp as the JSON dump.
import pandas as pd

df = pd.DataFrame(outputs)
df.to_csv(
    f"datasets/generated/ollama_outputs_{unique_id}.csv",
    index=False,
)
print("Done!")

Done!


### Text cleaning

In [5]:
# Few-shot template used to rewrite a "date,text" timeline row as
# "{name} was born on the ..." and return it as a JSON object with "event" and
# "date" keys. "{input}" is the only placeholder substituted by this notebook
# (via str.replace); "{name}" and "{paraphrased_event}" are sent to the model verbatim.
# NOTE: this rebinds the `prompt` name used by the generation cells above.
prompt = """You are an expert in history. You have to rephrase the following sentence to make it match a given pattern. The sentence should follow the format "{name} {action}". Format your output as a JSON object. Do not return anything else. Keep the date consistent with the one mentioned in the input. Do not mix up the context with the prompt. The prompt is always in the format "{name} was born on the {paraphrased_event}", but "{name}" should not be replaced with anything. Whenever possible, summarize the event to make it less wordy.

**Output Format:**  
The response should be a JSON object structured as follows:  
{
  "event": "{name} was born on the {paraphrased_event}",
  "date": "<the date as presented in the input>"
}

---

**Examples:**  

**Example 1:**

**Input:**
1918-12-01,"The Kingdom of Iceland, a personal union with Denmark, is formed."

**Output:**
{
  "event": "{name} was born on the day the Kingdom of Iceland was formed.",
  "date": "1918-12-01"
}

**Example 2:**

**Input:**
1926-06-19,National Broadcasting Company (NBC) founded in New York City.

**Output:**
{
  "event": "{name} was born on the day the National Broadcasting Company was founded.",
  "date": "1926-06-19"
}

**Example 3:**

**Input:**
1972-11-29,"The arcade game Pong, the first commercially successful video game, is released."

**Output:**
{
  "event": "{name} was born on the day Pong was released.",
  "date": "1972-11-29"
}

**Task:**

**Input:**
{input}

**Output:**
"""


In [6]:
import pandas as pd

# Maximum number of requests issued for one row before the row is skipped.
MAX_ATTEMPTS = 10

df = pd.read_csv("datasets/scraped/timeline_raw.csv")

# df = df.head(10)

outputs = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    input_sentence = f"{row['date']},{row['text']}"
    prompt_input = prompt.replace("{input}", input_sentence)
    # If it's wrongly formatted repeat the request
    for attempt in range(MAX_ATTEMPTS):
        output = generate_text_ollama(prompt_input, temperature=1.0)
        try:
            output = json.loads(output)
        except (json.JSONDecodeError, TypeError):
            # TypeError: the reply was not a string (e.g. an error sentinel from
            # the request helper); treat it like any other unusable reply.
            print(f"Error decoding JSON for input: {input_sentence}")
            continue
        if not isinstance(output, dict):
            # Valid JSON but not an object, so there is no "date" key to overwrite.
            print(f"Error decoding JSON for input: {input_sentence}")
            continue
        output["date"] = row["date"] # Overwrite for safety
        outputs.append(output)
        break
    else:
        # Every attempt failed: report the dropped row instead of skipping it silently.
        print(f"Giving up after {MAX_ATTEMPTS} attempts for input: {input_sentence}")
    time.sleep(0.3)

  0%|          | 0/804 [00:00<?, ?it/s]

100%|██████████| 804/804 [08:40<00:00,  1.54it/s]


In [8]:
# Persist the paraphrased timeline records as CSV, without the index column.
pd.DataFrame(outputs).to_csv(
    "datasets/generated/timeline_paraphrased.csv",
    index=False,
)