In [1]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
import torch
import unicodedata
import re
from typing import Dict, List, Tuple
import time
from langchain.schema import HumanMessage
import backoff
from openai import OpenAIError

class GreekPoetryRAG:
    def __init__(self):
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n\n", "\n\n", "\n"],
            keep_separator=True
        )
        
        self.llm = ChatOpenAI(
            model_name="gpt-4-turbo-preview",
            temperature=0.7,
            max_tokens=1000,
            request_timeout=120
        )
        
        self.vectorstore = None
        self.poet_mapping = {
            "Λαπαθιώτης": "NapoleonLapathiotis",
            "Ναπολέων Λαπαθιώτης": "NapoleonLapathiotis",
            "Πολυδούρη": "MariaPolidouri",
            "Μαρία Πολυδούρη": "MariaPolidouri",
            "Ουράνης": "KostasOuranis",
            "Κώστας Ουράνης": "KostasOuranis",
            "Γιοφύλλης": "FotosGiofyllis",
            "Φώτος Γιοφύλλης": "FotosGiofyllis",
            "Παπανικολάου": "MitsosPapanikolaou",
            "Μήτσος Παπανικολάου": "MitsosPapanikolaou",
            "Φιλύρας": "RomosFiliras",
            "Ρώμος Φιλύρας": "RomosFiliras",
            "Άγρας": "TellosAgras",
            "Τέλλος Άγρας": "TellosAgras"
        }

    def clean_text(self, text: str) -> str:
        # Remove title numbering and single characters
        text = re.sub(r'^\s*\d+\.\s*', '', text, flags=re.MULTILINE)
        text = re.sub(r'^\s*[.!;,\s]+\s*$', '', text, flags=re.MULTILINE)
        text = re.sub(r'^\s*.\s*$', '', text, flags=re.MULTILINE)
        
        # Handle HTML and formatting
        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
        text = re.sub(r'<[^>]+>', ' ', text)
        
        # Normalize Unicode and clean spaces
        text = unicodedata.normalize('NFKC', text)
        text = re.sub(r'\s*\n\s*', '\n', text)
        text = re.sub(r'\s+', ' ', text)
        
        # Clean empty lines and minimal content
        lines = [line.strip() for line in text.split('\n')]
        lines = [line for line in lines if len(line) > 3]  # Remove very short lines
        text = '\n'.join(lines)
        
        # Final cleanup of multiple newlines
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

    def load_and_process(self, poetry_files: List[str]) -> FAISS:
        poems = []
        poets = {}
        
        for file_path in poetry_files:
            try:
                poet_name = os.path.splitext(os.path.basename(file_path))[0]
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    # Split into poems using double newlines
                    raw_poems = content.split('\n\n')
                    processed_poems = []
                    
                    for poem in raw_poems:
                        cleaned = self.clean_text(poem)
                        if len(cleaned.split()) > 10:  # Only keep substantial poems
                            processed_poems.append(cleaned)
                    
                    start_idx = len(poems)
                    poems.extend(processed_poems)
                    poets[poet_name] = list(range(start_idx, len(poems)))
                print(f"Processed {poet_name}: {len(processed_poems)} poems")
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue

        metadatas = []
        for i, poem in enumerate(poems):
            poet_name = None
            for poet, indices in poets.items():
                if i in indices:
                    poet_name = poet
                    break
            metadatas.append({"poet": poet_name})

        print(f"Creating vectorstore with {len(poems)} total poems...")
        self.vectorstore = FAISS.from_texts(poems, self.embeddings, metadatas=metadatas)
        return self.vectorstore

    def get_similar_poems(self, query: str, vectorstore: FAISS, k: int = 3, poet_style: str = None) -> List[str]:
        if poet_style:
            poet_file = self.poet_mapping.get(poet_style)
            if poet_file:
                results = vectorstore.similarity_search(
                    query,
                    k=k*2,
                    filter={"poet": poet_file}
                )
                poems = [self.clean_text(doc.page_content) for doc in results[:k]]
                return [p for p in poems if len(p.split()) > 10]  # Only return substantial poems
        
        results = vectorstore.similarity_search(query, k=k*2)
        poems = [self.clean_text(doc.page_content) for doc in results[:k]]
        return [p for p in poems if len(p.split()) > 10]

    def generate_prompt(self, theme: str, similar_poems: List[str], poet_style: str = None) -> str:
        style_instruction = f"στο ύφος του {poet_style}" if poet_style else "στο ύφος της Νέας Αθηναϊκής Σχολής"
        
        formatted_poems = []
        for i, poem in enumerate(similar_poems, 1):
            formatted_poems.append(f"Ποίημα {i}:\n{poem}\n")
        
        prompt = f"""Δημιούργησε ένα νέο ελληνικό ποίημα {style_instruction}.
Θέμα: {theme}

Παραδείγματα παρόμοιων ποιημάτων για έμπνευση:

{'-'*40}
{chr(10).join(formatted_poems)}
{'-'*40}

Δημιούργησε ένα νέο, πρωτότυπο ποίημα που να:
1. Διατηρεί το ύφος και την τεχνοτροπία των παραδειγμάτων
2. Είναι μοναδικό στη σύλληψη και έκφραση
3. Χρησιμοποιεί πλούσιες ποιητικές εικόνες
4. Έχει συνοχή στη μορφή και το περιεχόμενο

Το ποίημα:"""
        
        return prompt

    @backoff.on_exception(
        backoff.expo,
        OpenAIError,
        max_tries=3,
        max_time=30
    )
    def generate_poem(self, prompt: str) -> str:
        try:
            messages = [HumanMessage(content=prompt)]
            response = self.llm.invoke(messages)
            return self.clean_text(response.content)
        except Exception as e:
            print(f"Error generating poem: {str(e)}")
            time.sleep(5)
            return None

def run_poetry_generation():
    # Set your OpenAI API key
    os.environ["OPENAI_API_KEY"] = "your api key"
    # Poetry files
    poetry_files = [
        "FotosGiofyllis.txt",
        "KostasOuranis.txt",
        "MariaPolidouri.txt",
        "MitsosPapanikolaou.txt",
        "NapoleonLapathiotis.txt",
        "RomosFiliras.txt",
        "TellosAgras.txt"
    ]

    # Initialize system
    print("Initializing RAG system...")
    rag = GreekPoetryRAG()
    vectorstore = rag.load_and_process(poetry_files)

    # Test cases
    test_cases = [
        ("θάλασσα", "Καρυωτάκης")
    ]

    # Generate poems
    for theme, poet in test_cases:
        print(f"\n{'='*50}")
        print(f"Generating poem about '{theme}' in the style of {poet}")
        print('='*50)
        
        try:
            # Get similar poems
            print("\nFinding similar poems...")
            similar_poems = rag.get_similar_poems(theme, vectorstore, k=3, poet_style=poet)
            print("\nFound similar poems:")
            for i, poem in enumerate(similar_poems, 1):
                print(f"\nReference {i}:")
                print(poem)
            
            # Generate prompt
            print("\nGenerating prompt...")
            prompt = rag.generate_prompt(theme, similar_poems, poet)
            print(prompt)
            
            # Generate new poem
            print("\nGenerating new poem...")
            for attempt in range(3):
                try:
                    poem = rag.generate_poem(prompt)
                    if poem:
                        print("\nGenerated Poem:")
                        print("="*30)
                        print(poem)
                        print("="*30)
                        break
                    else:
                        print(f"Attempt {attempt + 1} failed, retrying...")
                        time.sleep(5)
                except Exception as e:
                    print(f"Error in attempt {attempt + 1}: {str(e)}")
                    if attempt < 2:
                        time.sleep(5)
                    else:
                        print("All attempts failed")
            
            # Add delay between generations
            time.sleep(3)
            
        except Exception as e:
            print(f"Error in generation cycle: {str(e)}")
            continue

if __name__ == "__main__":
    run_poetry_generation()

Initializing RAG system...


  self.llm = ChatOpenAI(


Processed FotosGiofyllis: 1 poems
Processed KostasOuranis: 1 poems
Processed MariaPolidouri: 1 poems
Processed MitsosPapanikolaou: 1 poems
Processed NapoleonLapathiotis: 1 poems
Processed RomosFiliras: 1 poems
Processed TellosAgras: 1 poems
Creating vectorstore with 7 total poems...

Generating poem about 'θάλασσα' in the style of Καρυωτάκης

Finding similar poems...

Found similar poems:

Reference 1:
Ποιητής Είχα πέσει σε βύθος, είχα, πάντα, τη μαύρη κι ολαπέλπιδη νύστα του βραχνά καταλύτη’ μες στο κάμα του θέρους, τη θλιμμένη και λαύρη ποθοθάνατη ‘νείρια του οράματος νήτη. Έχω λήθαργου μοίρα κι είχα παραμιλήσει χρόνια’ κι όμως ο Στίχος, ο Ρυθμός δεν ελείπαν. Είχα ανέβει εκεί πού’ ναι μονάχα η Βρύση κι η Επιστήμη, αν δεν είχα, δε θ’ ανέβαινα, είπαν. Επειδή και είχα χάσει το ρέγουλο, είμαι ο εμπνευσμένος ονείρων και κόσμων προφήτης, ο πηγαίος ποιητής που στο σύννεφο κείμαι ο μεγάλος, ο θείος των ρυθμών υποφήτης. Μοίρα άγει Ά, στο λαό πώς μ’ έριξεν η μοίρα, πώς μ’έκρουσε στην θείαν ανα

Error generating poem: This model's maximum context length is 128000 tokens. However, your messages resulted in 152255 tokens. Please reduce the length of the messages.
Attempt 1 failed, retrying...
Error generating poem: This model's maximum context length is 128000 tokens. However, your messages resulted in 152255 tokens. Please reduce the length of the messages.
Attempt 2 failed, retrying...
Error generating poem: This model's maximum context length is 128000 tokens. However, your messages resulted in 152255 tokens. Please reduce the length of the messages.
Attempt 3 failed, retrying...


In [9]:
from typing import Dict, List, Tuple
import time
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

class PoetryExperiment:
    def __init__(self, rag_system=None, vectorstore=None):
        self.rag_system = rag_system
        self.vectorstore = vectorstore  # Store vectorstore separately
        self.llm = ChatOpenAI(
            model_name="gpt-4-turbo-preview",
            temperature=0.7,
            max_tokens=1000,
            request_timeout=120
        )
        
    def generate_base_prompt(self, theme: str, poet_style: str) -> str:
        """Generate prompt without RAG"""
        return f"""Δημιούργησε ένα νέο ελληνικό ποίημα στο ύφος του {poet_style}.
Θέμα: {theme}

Δημιούργησε ένα πρωτότυπο ποίημα που να αντικατοπτρίζει το μοναδικό ύφος και την τεχνοτροπία του ποιητή:"""

    def generate_poems(
        self, 
        theme: str, 
        poet_style: str,
        num_samples: int = 3,
        use_rag: bool = True
    ) -> Dict[str, List[Tuple[str, float]]]:
        """Generate poems with and without RAG"""
        results = {
            "rag": [],
            "base": []
        }
        
        for _ in range(num_samples):
            # Generate with RAG
            if use_rag and self.rag_system:
                start_time = time.time()
                similar_poems = self.rag_system.get_similar_poems(
                    theme, 
                    self.vectorstore,  # Use stored vectorstore
                    k=3, 
                    poet_style=poet_style
                )
                prompt = self.rag_system.generate_prompt(theme, similar_poems, poet_style)
                poem = self.rag_system.generate_poem(prompt)
                end_time = time.time()
                if poem:
                    results["rag"].append((poem, end_time - start_time))

            # Generate without RAG
            start_time = time.time()
            prompt = self.generate_base_prompt(theme, poet_style)
            messages = [HumanMessage(content=prompt)]
            response = self.llm.invoke(messages)
            end_time = time.time()
            results["base"].append((response.content, end_time - start_time))
            
            # Add delay to avoid rate limits
            time.sleep(1)
            
        return results

def run_experiment(
    rag_system,
    vectorstore,  # Add vectorstore parameter
    theme: str,
    poet_style: str,
    num_samples: int = 3
) -> Dict:
    """Run complete experiment and format results"""
    experiment = PoetryExperiment(rag_system, vectorstore)
    results = experiment.generate_poems(theme, poet_style, num_samples)
    
    # Format results for display
    formatted_results = {
        "theme": theme,
        "poet": poet_style,
        "samples": num_samples,
        "rag_poems": [],
        "base_poems": [],
        "metrics": {
            "rag_avg_time": 0,
            "base_avg_time": 0
        }
    }
    
    if results["rag"]:
        formatted_results["rag_poems"] = [poem for poem, _ in results["rag"]]
        formatted_results["metrics"]["rag_avg_time"] = sum(time for _, time in results["rag"]) / len(results["rag"])
    
    formatted_results["base_poems"] = [poem for poem, _ in results["base"]]
    formatted_results["metrics"]["base_avg_time"] = sum(time for _, time in results["base"]) / len(results["base"])
    
    return formatted_results

# Example usage:
test_cases = [
    {
        "theme": "δείλι",
        "poet": "Καρυωτάκης",
        "samples": 2
    },
    {
        "theme": "έρωτας",
        "poet": "Μαρία Πολυδούρη",
        "samples": 2
    }
]

# Run experiments
for case in test_cases:
    print(f"\n=== Testing {case['poet']} on theme: {case['theme']} ===\n")
    # Pass both rag_system and vectorstore
    results = run_experiment(rag, vectorstore, case['theme'], case['poet'], case['samples'])
    
    print("\nRAG-assisted poems:")
    for i, poem in enumerate(results['rag_poems'], 1):
        print(f"\n--- RAG Poem {i} ---")
        print(poem)
    
    print("\nBase model poems:")
    for i, poem in enumerate(results['base_poems'], 1):
        print(f"\n--- Base Poem {i} ---")
        print(poem)
    
    print("\nMetrics:")
    print(f"RAG average generation time: {results['metrics']['rag_avg_time']:.2f}s")
    print(f"Base average generation time: {results['metrics']['base_avg_time']:.2f}s")


=== Testing Καρυωτάκης on theme: δείλι ===



NameError: name 'rag' is not defined

In [7]:
from typing import Dict, List, Tuple
import time
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

class PoetryExperiment:
    def __init__(self, rag_system=None, vectorstore=None):
        self.rag_system = rag_system
        self.vectorstore = vectorstore
        self.llm = ChatOpenAI(
            model_name="gpt-4-turbo-preview",
            temperature=0.7,
            max_tokens=1000,
            request_timeout=120
        )
        
    def generate_base_prompt(self, theme: str, poet_style: str) -> str:
        """Generate prompt without RAG"""
        return f"""Δημιούργησε ένα νέο ελληνικό ποίημα στο ύφος του {poet_style}.
Θέμα: {theme}

Δημιούργησε ένα πρωτότυπο ποίημα που να αντικατοπτρίζει το μοναδικό ύφος και την τεχνοτροπία του ποιητή:"""

    def get_contrastive_poems(self, theme: str, poet_style: str, k: int = 3):
        """Get both style examples and contrasting examples"""
        # Get poems by target poet (positive examples)
        style_poems = self.rag_system.get_similar_poems(
            theme,
            self.vectorstore,
            k=k,
            poet_style=poet_style
        )
        
        # Retrieve candidate documents without using an unsupported $ne filter.
        target_poet = self.rag_system.poet_mapping.get(poet_style)
        candidate_docs = self.vectorstore.similarity_search(theme, k=k * 5)
        
        # Manually filter out documents by the target poet.
        contrast_docs = []
        for doc in candidate_docs:
            if doc.metadata.get("poet") != target_poet:
                contrast_docs.append(doc)
                if len(contrast_docs) >= k:
                    break
        contrast_poems = [doc.page_content for doc in contrast_docs]
        
        return {
            'style_examples': style_poems,
            'contrasts': contrast_poems
        }

    def generate_contrastive_prompt(self, theme: str, poems: Dict, poet_style: str) -> str:
        """Generate prompt with both style examples and contrasts"""
        return f"""Δημιούργησε ένα νέο ελληνικό ποίημα στο ύφος του {poet_style}.
Θέμα: {theme}

Παραδείγματα του ύφους του ποιητή:
{'-'*40}
{chr(10).join(poems['style_examples'])}

Παραδείγματα διαφορετικών υφών (για αντίθεση):
{'-'*40}
{chr(10).join(poems['contrasts'])}

Δημιούργησε ένα νέο ποίημα που να:
1. Ακολουθεί το χαρακτηριστικό ύφος του {poet_style} όπως φαίνεται στα πρώτα παραδείγματα
2. Διαφέρει στυλιστικά από τα παραδείγματα αντίθεσης
3. Διατηρεί την ποιητική φωνή του {poet_style}
4. Προσεγγίζει το θέμα με μοναδικό τρόπο

Το ποίημα:"""

    def generate_poems(
        self, 
        theme: str, 
        poet_style: str,
        num_samples: int = 3,
        use_rag: bool = True
    ) -> Dict[str, List[Dict]]:
        results = {
            "rag": [],
            "base": []
        }
        
        for _ in range(num_samples):
            # Generate with RAG and contrastive examples
            if use_rag and self.rag_system:
                start_time = time.time()
                poems = self.get_contrastive_poems(theme, poet_style, k=3)
                prompt = self.generate_contrastive_prompt(theme, poems, poet_style)
                poem = self.rag_system.generate_poem(prompt)
                end_time = time.time()
                if poem:
                    results["rag"].append({
                        "poem": poem,
                        "prompt": prompt,
                        "similar_poems": poems['style_examples'],
                        "contrast_poems": poems['contrasts'],
                        "time": end_time - start_time
                    })

            # Generate without RAG
            start_time = time.time()
            prompt = self.generate_base_prompt(theme, poet_style)
            messages = [HumanMessage(content=prompt)]
            response = self.llm.invoke(messages)
            end_time = time.time()
            results["base"].append({
                "poem": response.content,
                "prompt": prompt,
                "time": end_time - start_time
            })
            
            time.sleep(1)
            
        return results

def run_experiment(
    rag_system,
    vectorstore,
    theme: str,
    poet_style: str,
    num_samples: int = 3
) -> Dict:
    experiment = PoetryExperiment(rag_system, vectorstore)
    results = experiment.generate_poems(theme, poet_style, num_samples)
    
    formatted_results = {
        "theme": theme,
        "poet": poet_style,
        "samples": num_samples,
        "rag_results": [],
        "base_results": [],
        "metrics": {
            "rag_avg_time": 0,
            "base_avg_time": 0
        }
    }
    
    print(f"\n=== Testing {poet_style} on theme: {theme} ===\n")
    
    # Show RAG results with contrastive examples
    print("\nRAG-assisted generation:")
    if results["rag"]:
        for i, result in enumerate(results["rag"], 1):
            print(f"\n--- RAG Generation {i} ---")
            
            print("\nStyle examples used:")
            for j, poem in enumerate(result["similar_poems"], 1):
                print(f"\nStyle Example {j}:")
                print("-"*40)
                print(poem)
                print("-"*40)
            
            print("\nContrasting examples used:")
            for j, poem in enumerate(result["contrast_poems"], 1):
                print(f"\nContrast Example {j}:")
                print("-"*40)
                print(poem)
                print("-"*40)
            
            print("\nPrompt used:")
            print("-"*40)
            print(result["prompt"])
            print("-"*40)
            
            print("\nGenerated poem:")
            print(result["poem"])
            print(f"\nGeneration time: {result['time']:.2f}s")
            
        formatted_results["rag_results"] = results["rag"]
        formatted_results["metrics"]["rag_avg_time"] = sum(r["time"] for r in results["rag"]) / len(results["rag"])
    
    # Show base results
    print("\nBase generation:")
    for i, result in enumerate(results["base"], 1):
        print(f"\n--- Base Generation {i} ---")
        print("\nPrompt used:")
        print("-"*40)
        print(result["prompt"])
        print("-"*40)
        print("\nGenerated poem:")
        print(result["poem"])
        print(f"\nGeneration time: {result['time']:.2f}s")
    
    formatted_results["base_results"] = results["base"]
    formatted_results["metrics"]["base_avg_time"] = sum(r["time"] for r in results["base"]) / len(results["base"])
    
    print("\nMetrics:")
    print(f"RAG average generation time: {formatted_results['metrics']['rag_avg_time']:.2f}s")
    print(f"Base average generation time: {formatted_results['metrics']['base_avg_time']:.2f}s")
    
    return formatted_results

# Example usage
test_cases = [
    {
        "theme": "δείλι",
        "poet": "Τέλλος Άγρας",
        "samples": 2
    },
    {
        "theme": "έρωτας",
        "poet": "Μαρία Πολυδούρη",
        "samples": 2
    }
]

for case in test_cases:
    results = run_experiment(rag, vectorstore, case['theme'], case['poet'], case['samples'])


NameError: name 'rag' is not defined