In [None]:
##imports
import pandas as pd
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
## Load the dataset
DATASET_PATH = 'dataset/'

# The source CSV sits directly inside DATASET_PATH.
df = pd.read_csv(os.path.join(DATASET_PATH, "emnlp3000.csv"))

# Print the column summary, then show the first rows as the cell's output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3011 entries, 0 to 3010
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  3011 non-null   int64 
 1   Unnamed: 0    3011 non-null   int64 
 2   Title         3011 non-null   object
 3   Poet          3011 non-null   object
 4   text          3011 non-null   object
 5   ctext         3011 non-null   object
 6   Poem Link     3011 non-null   object
 7   our_summary   3011 non-null   object
dtypes: int64(2), object(6)
memory usage: 188.3+ KB


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Title,Poet,text,ctext,Poem Link,our_summary
0,0,0,"Dear John, Dear Coltrane by Michael S. Harper",Michael S. Harper,"'Dear John, Dear Coltrane' by Michael S. Harpe...","a love supreme, a love supreme\na love supreme...",https://www.poetryfoundation.org/poems/42827/d...,"The poem explores themes of love, loss, pain, ..."
1,1,1,Parrot by Stevie Smith,Stevie Smith,‘Parrot‘ depicts the declining health of a won...,The old sick green parrot\nHigh in a dingy cag...,https://revise.wales/pastPapers/A-level/Englis...,"This old parrot, sick and full of rage, longs ..."
2,2,2,Dust of Snow by Robert Frost,Robert Frost,"The simplicity, in the end, is the key element...",The way a crow\nShook down on me\nThe dust of ...,https://www.poetryfoundation.org/poems/44262/d...,The sight of a crow shaking snow from a tree t...
3,3,3,Suburban Sonnet by Gwen Harwood,Gwen Harwood,'Suburban Sonnet' by Gwen Harwood is a poem ab...,"She practises a fugue, though it can matter\nt...",https://genius.com/Gwen-harwood-suburban-sonne...,"A mother practices music, but her children int..."
4,4,4,Unending Love by Rabindranath Tagore,Rabindranath Tagore,'Unending Love' by Rabindranath Tagore is a he...,"I seem to have loved you in numberless forms, ...",https://allpoetry.com/Unending-Love,The speaker expresses their eternal love for s...


In [None]:
model_path = 'your_model_path'  # replace with your model path
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Cast to fp16 only when running on a GPU. On the CPU fallback the model stays
# in the checkpoint's default precision, because half-precision CPU kernels
# are incomplete/slow and can make generation fail.
model = AutoModelForCausalLM.from_pretrained(model_path)
if device.startswith('cuda'):
    model = model.half()
model = model.eval().to(device)

tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Instruction prompt sent to the model. The single `{}` placeholder is filled
# with the input text via `str.format` in `generate_video_prompt`.
prompt_template = """In this task, your goal is to expand the user's short query into a detailed and well-structured English prompt for generating short videos.

Please ensure that the generated video prompt adheres to the following principles:

1. **Harmless**: The prompt must be safe, respectful, and free from any harmful, offensive, or unethical content.  
2. **Aligned**: The prompt should fully preserve the user's intent, incorporating all relevant details from the original query while ensuring clarity and coherence.  
3. **Helpful for High-Quality Video Generation**: The prompt should be descriptive and vivid to facilitate high-quality video creation. Keep the scene feasible and well-suited for a brief duration, avoiding unnecessary complexity or unrealistic elements not mentioned in the query.

User Query:{}

Video Prompt:"""

# ------------------------------
# 4. Function to generate video prompt
# ------------------------------
def generate_video_prompt(text):
    """Expand a short text into a detailed video-generation prompt.

    The text is inserted into ``prompt_template``, wrapped as a single user
    chat message, and sent to the module-level ``model`` for sampled
    generation (so repeated calls may return different text).

    Args:
        text: The short query to expand.

    Returns:
        str: Only the newly generated reply, with special tokens removed and
        surrounding whitespace stripped.
    """
    message = [{'role': 'user', 'content': prompt_template.format(text)}]

    # return_dict=True makes the tokenizer return a mapping of model inputs
    # that can be unpacked into generate(); without it a bare tensor may be
    # returned, and `**model_inputs` would then fail.
    model_inputs = tokenizer.apply_chat_template(
        message,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    prompt_length = model_inputs['input_ids'].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=True,
            top_p=1.0,
            temperature=0.7,
            num_beams=1
        )

    # The generated sequence starts with the prompt tokens. Slicing them off
    # isolates the reply without depending on any model-specific header or
    # end-of-turn marker strings.
    reply_tokens = output[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

def process_batch(batch, text_column='our_summary'):
    """Generate a video prompt for every row of ``batch``.

    Args:
        batch: DataFrame to process. It is modified in place: a
            ``'prompt_summery'`` column is added (or overwritten).
        text_column: Name of the column holding the input text. Defaults to
            ``'our_summary'``, the column present in the loaded dataset.

    Returns:
        The same ``batch`` object, now carrying the generated-prompt column.

    Raises:
        KeyError: If ``text_column`` is not a column of ``batch``.
    """
    if text_column not in batch.columns:
        raise KeyError(
            f"Column {text_column!r} not found; available columns: {list(batch.columns)}"
        )
    # The output column name keeps its original spelling so the CSV schema
    # written by this notebook does not change.
    batch['prompt_summery'] = batch[text_column].apply(generate_video_prompt)
    return batch

def save_batch(batch, output_file, mode='a', header=False):
    """Write ``batch`` to ``output_file`` as CSV, leaving out the index.

    Args:
        batch: DataFrame to write.
        output_file: Destination CSV path.
        mode: File mode handed to ``DataFrame.to_csv`` (append by default).
        header: Whether the column header row is written.
    """
    csv_options = {'mode': mode, 'header': header, 'index': False}
    batch.to_csv(output_file, **csv_options)
    

In [None]:
# Smoke-test the generator on the first row's summary before the full run.
# `df[0]` would look up a column named 0, and 'prompt_summery' is the output
# column, so the input is read from `our_summary` by position instead.
generate_video_prompt(df['our_summary'].iloc[0])

In [None]:
# Rows handled per batch; adjust to the available GPU memory.
batch_size = 20
# Destination CSV, built by prefixing the file name with DATASET_PATH.
output_file = f"{DATASET_PATH}dataset_with_prompt.csv"

def process_dataset_in_batches(df, output_file, batch_size=20, text_column='our_summary'):
    """Run prompt generation over ``df`` in batches, saving each batch to CSV.

    Every batch is written as soon as it has been processed, so finished
    batches are already on disk if a later batch fails.

    Note: when ``output_file`` already exists, rows are appended without a
    header and processing still starts at row 0, so re-running against the
    same file duplicates rows.

    Args:
        df: DataFrame holding the input text.
        output_file: Path of the CSV to create or append to.
        batch_size: Number of rows per batch; must be positive.
        text_column: Column of ``df`` holding the input text. Defaults to
            ``'our_summary'``, the column present in the loaded dataset.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    # A negative step would make range() empty and silently process nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Fail before any model work starts rather than inside the first batch.
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found; available columns: {list(df.columns)}"
        )

    # The header is written only when this call creates the file.
    write_header = not os.path.exists(output_file)
    mode = 'w' if write_header else 'a'

    total_rows = len(df)
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        # Copy so the added column never touches the caller's DataFrame.
        batch = df.iloc[start_idx:end_idx].copy()

        batch = process_batch(batch, text_column=text_column)
        save_batch(batch, output_file, mode=mode, header=write_header)

        # Every batch after the first is appended without a header.
        mode = 'a'
        write_header = False

        print(f"Processed rows {start_idx} to {end_idx-1}")

    print("All batches processed and saved!")

In [None]:
# Write to the `output_file` path configured earlier (not a bare file name in
# the working directory) and read the input from the dataset's `our_summary`
# column.
process_dataset_in_batches(df, output_file, batch_size=batch_size, text_column='our_summary')