In [None]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import ollama
import time

# Set random seeds for reproducibility.
# SEED seeds Python's `random` module and NumPy's global RNG here; it is also
# passed explicitly as `random_state` to DataFrame.sample and KFold below.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

# Unified data loading with label padding
def _read_csv_with_fallback(csv_path: str, encodings: list[str]) -> pd.DataFrame:
    """Read a ';'-delimited CSV, trying each encoding in turn until one decodes.

    Raises:
        UnicodeDecodeError: if none of ``encodings`` can decode the file.
    """
    last_error = None
    for encoding in encodings:
        try:
            # Emptiness does not depend on the encoding, so the first encoding
            # that decodes is accepted even if the frame has no rows.
            return pd.read_csv(
                csv_path,
                encoding=encoding,
                delimiter=';',
                quotechar='"',
                on_bad_lines='warn',
            )
        except UnicodeDecodeError as err:
            last_error = err
    # UnicodeDecodeError needs (encoding, object, start, end, reason); building it
    # from a single message string would itself fail with TypeError.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Failed to decode {csv_path} with tried encodings: {encodings}",
    ) from last_error


def load_data(folder_path: str, expected_labels: int = 11) -> tuple[pd.DataFrame, int]:
    """Load every ``.csv`` file in ``folder_path`` into one combined DataFrame.

    Each file is normalised to the columns ``ID``, ``Text`` and exactly
    ``expected_labels`` numeric label columns:

    * ``SANTA_ID`` is renamed to ``ID``; if neither ``SANTA_ID`` nor ``ID``
      exists, IDs of the form ``ID_<row>`` are generated.
    * Every column other than ``ID``, ``Text`` and ``none`` is treated as a
      label. Only the first ``expected_labels`` of them are kept; non-numeric
      values become 0.
    * Files with fewer labels are padded with all-zero ``Label_<i>`` columns.

    Args:
        folder_path: Directory that contains the CSV files.
        expected_labels: Number of label columns every file is normalised to.

    Returns:
        ``(combined_df, expected_labels)``.

    Raises:
        ValueError: if a file has no ``Text`` column, or the folder contains
            no CSV files.
        UnicodeDecodeError: if a file cannot be decoded with any tried encoding.
    """
    encodings = ['utf-8', 'latin-1', 'cp1252']
    text_column = 'Text'
    non_label_columns = {'ID', text_column, 'none'}
    all_dfs = []

    # os.listdir order is arbitrary; sort so the combined row order is
    # reproducible from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        csv_path = os.path.join(folder_path, filename)
        df = _read_csv_with_fallback(csv_path, encodings)

        if text_column not in df.columns:
            raise ValueError(f"Text column not found in {csv_path}")

        # Ensure a consistent ID column without clobbering one that already exists.
        if 'SANTA_ID' in df.columns:
            df = df.rename(columns={'SANTA_ID': 'ID'})
        elif 'ID' not in df.columns:
            df['ID'] = [f"ID_{i}" for i in range(len(df))]

        # Labels are taken from the columns *after* the rename, so the old
        # 'SANTA_ID' name can never be mistaken for a label; extras are dropped.
        label_columns = [col for col in df.columns if col not in non_label_columns]
        label_columns = label_columns[:expected_labels]

        # Ensure numeric labels and pad if necessary.
        if label_columns:
            df[label_columns] = df[label_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
        pad_columns = [f'Label_{i}' for i in range(len(label_columns), expected_labels)]
        for col in pad_columns:
            df[col] = 0.0

        # Select only the normalised columns; the truncation above stays in effect.
        all_dfs.append(df[['ID', text_column] + label_columns + pad_columns])

    if not all_dfs:
        raise ValueError(f"No CSV files found in {folder_path}")

    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df, expected_labels

# Few-shot examples: annotated ';'-separated rows (ID;"Text";label values) that are inserted verbatim into the prompt
examples = [
    """ACI08;"Mr. Hosmer Angel came to the house again and proposed that we 
should marry before father came back. He was in dreadful earnest 
and made me swear, with my hands on the Testament, that whatever 
happened I would always be true to him. Mother said he was quite 
right to make me swear, and that it was a sign of his passion. 
Mother was all in his favour from the first and was even fonder 
of him than I was. Then, when they talked of marrying within the 
week, I began to ask about father; but they both said never to 
mind about father, but just to tell him afterwards, and mother 
said she would make it all right with him.";0;0;0;1;0;1;1;0;0;0;none;0;0""",
    """ACI05;"I had had so many reasons to believe in my friend's subtle powers 
of reasoning and extraordinary energy in action that I felt that 
he must have some solid grounds for the assured and easy 
demeanour with which he treated the singular mystery which he had 
been called upon to fathom. Once only had I known him to fail, in 
the case of the King of Bohemia and of the Irene Adler 
photograph; but when I looked back to the weird business of the 
Sign of Four, and the extraordinary circumstances connected with 
the Study in Scarlet, I felt that it would be a strange tangle 
indeed which he could not unravel. 
I left him then, still puffing at his black clay pipe, with the 
conviction that when I came again on the next evening I would 
find that he held in his hands all the clues which would lead up 
to the identity of the disappearing bridegroom of Miss Mary 
Sutherland. 
A professional case of great gravity was engaging my own 
attention at the time, and the whole of next day I was busy at 
the bedside of the sufferer. It was not until close upon six 
o'clock that I found myself free and was able to spring into a 
hansom and drive to Baker Street, half afraid that I might be too 
late to assist at the dénouement of the little mystery. I found 
Sherlock Holmes alone, however, half asleep, with his long, thin 
form curled up in the recesses of his armchair. A formidable 
array of bottles and test-tubes, with the pungent cleanly smell 
of hydrochloric acid, told me that he had spent his day in the 
chemical work which was so dear to him.";0;1;0;0;0;1;1;1;0;0;none;0;0""",
    """TAIN27;"On the previous morning, two gentlemen had called to see his 
master. They were Italians, and the elder of the two, a man of about 
forty, gave his name as Signor Ascanio. 
The younger was a well-dressed lad of about twenty-four. 
Count Foscatini was evidently prepared for their visit and 
immediately sent Graves out upon some trivial errand. 
Here the man paused and hesitated in his story. 
In the end, however, he admitted that, curious as to the purport 
of the interview, he had not obeyed immediately, but had lingered 
about endeavouring to hear something of what was going on. 
The conversation was carried on in so low a tone that he was not 
as successful as he had hoped; but he gathered enough to make it 
clear that some kind of monetary proposition was being discussed, 
and that the basis of it was a threat. 
The discussion was anything but amicable. 
In the end, Count Foscatini raised his voice slightly, and the 
listener heard these words clearly:  'I have no time to argue 
further now, gentlemen. 
If you will dine with me to-morrow night at eight o’clock, we will
 resume the discussion.'  Afraid of being discovered listening, 
Graves had then hurried out to do his master’s errand. 
This evening the two men had arrived punctually at eight. 
During dinner they had talked of indifferent matters—politics, the
 weather, and the theatrical world. 
When Graves had placed the port upon the table and brought in the 
coffee his master told him that he might have the evening off. 
'Was that a usual proceeding of his when he had guests?' asked the
 inspector. 
'No, sir; it wasn’t. 
That’s what made me think it must be some business of a very 
unusual kind that he was going to discuss with these gentlemen.'  
That finished Graves’s story. 
He had gone out about 8.30, and, meeting a friend, had accompanied
 him to the Metropolitan Music Hall in Edgware Road. 
Nobody had seen the two men leave, but the time of the murder was 
fixed clearly enough at 8.47. 
A small clock on the writing-table had been swept off by 
Foscatini’s arm, and had stopped at that hour, which agreed with 
Miss Rider’s telephone summons. 
The police surgeon had made his examination of the body, and it 
was now lying on the couch. 
I saw the face for the first time—the olive complexion, the long 
nose, the luxuriant black moustache, and the full red lips drawn 
back from the dazzlingly white teeth. 
Not altogether a pleasant face. 
'Well,' said the inspector, refastening his notebook. 
'The case seems clear enough. 
The only difficulty will be to lay our hands on this Signor 
Ascanio. 
I suppose his address is not in the dead man’s pocket-book by any 
chance?'  As Poirot had said, the late Foscatini was an orderly 
man. 
Neatly written in small, precise handwriting was the inscription, 
'Signor Paolo Ascanio, Grosvenor Hotel.'  The inspector busied 
himself with the telephone, then turned to us with a grin. 
'Just in time. 
Our fine gentleman was off to catch the boat train to the 
Continong. 
Well, gentlemen, that’s about all we can do here. 
It’s a bad business, but straightforward enough. 
One of these Italian vendetta things, as likely as not.'  Thus 
airily dismissed, we found our way downstairs. 
Dr. Hawker was full of excitement. 
'Like the beginning of a novel, eh? 
Real exciting stuff. 
Wouldn’t believe it if you read about it.'  Poirot did not speak. 
He was very thoughtful. 
All the evening he had hardly opened his lips. 
'What says the master detective, eh?' asked Hawker, clapping him 
on the back. 
'Nothing to work your grey cells over this time.'  'You think 
not?'  'What could there be?'  'Well, for example, there is the 
window.'  'The window? 
But it was fastened. 
Nobody could have got out or in that way. 
I noticed it specially.'  'And why were you able to notice it?'  
The doctor looked puzzled. 
Poirot hastened to explain. 
'It is to the curtains I refer. 
They were not drawn. 
A little odd, that. 
And then there was the coffee. 
It was very black coffee.'  'Well, what of it?'  'Very black,' 
repeated Poirot. 
'In conjunction with that let us remember that very little of the 
rice soufflé was eaten, and we get—what?'  'Moonshine,' laughed 
the doctor. 
'You’re pulling my leg.'  'Never do I pull the leg. 
Hastings here knows that I am perfectly serious.'  'I don’t know 
what you are getting at, all the same,' I confessed. 
'You don’t suspect the manservant, do you? 
He might have been in with the gang, and put some dope in the 
coffee. 
I suppose they’ll test his alibi?'  'Without doubt, my friend; but
 it is the alibi of Signor Ascanio that interests me.'  'You think
 he has an alibi?'  'That is just what worries me. 
I have no doubt that we will soon be enlightened on that point.'
 The _Daily Newsmonger_ enabled us to become conversant with 
succeeding events. 
Signor Ascanio was arrested and charged with the murder of Count 
Foscatini. 
When arrested, he denied knowing the Count, and declared he had 
never been near Regent’s Court either on the evening of the crime 
or on the previous morning. 
The younger man had disappeared entirely. 
Signor Ascanio had arrived alone at the Grosvenor Hotel from the 
Continent two days before the murder. 
All efforts to trace the second man failed. 
Ascanio, however, was not sent for trial. 
No less a personage than the Italian Ambassador himself came 
forward and testified at the police-court proceedings that Ascanio
 had been with him at the Embassy from eight till nine that 
evening. 
The prisoner was discharged. 
Naturally, a lot of people thought that the crime was a political 
one, and was being deliberately hushed up. 
Poirot had taken a keen interest in all these points. 
Nevertheless, I was somewhat surprised when he suddenly informed 
me one morning that he was expecting a visitor at eleven o’clock, 
and that that visitor was none other than Ascanio himself. 
'He wishes to consult you?'  '_Du tout_, Hastings. 
I wish to consult him.'  'What about?'  'The Regent’s Court 
murder.'  'You are going to prove that he did it?'  'A man cannot 
be tried twice for murder, Hastings. 
Endeavour to have the common sense. 
Ah, that is our friend’s ring. A few minutes later Signor 
Ascanio was ushered in—a small, thin man with a secretive and 
furtive glance in his eyes. 
He remained standing, darting suspicious glances from one to the 
other of us. 
'Monsieur Poirot?'  My little friend tapped himself gently on the 
chest. 
'Be seated, signor. 
You received my note. 
I am determined to get to the bottom of this mystery. 
In some small measure you can aid me. 
Let us commence. 
You—in company with a friend—visited the late Count Foscatini on 
the morning of Tuesday the 9th——'  The Italian made an angry 
gesture. 
'I did nothing of the sort. 
I have sworn in court——'  '_Précisément_—and I have a little idea 
that you have sworn falsely.'  'You threaten me? 
Bah! 
I have nothing to fear from you. 
I have been acquitted.'  'Exactly; and as I am not an imbecile, it
 is not with the gallows I threaten you—but with publicity. 
Publicity! 
I see that you do not like the word. 
I had an idea that you would not. 
My little ideas, you know, they are very valuable to me. 
Come, signor, your only chance is to be frank with me. 
I do not ask to know whose indiscretions brought you to England. 
I know this much, you came for the especial purpose of seeing 
Count Foscatini.'  'He was not a count,' growled the Italian. 
'I have already noted the fact that his name does not appear in 
the _Almanach de Gotha_. 
Never mind, the title of count is often useful in the profession 
of blackmailing.'  'I suppose I might as well be frank. 
You seem to know a good deal.'  'I have employed my grey cells to 
some advantage. 
Come, Signor Ascanio, you visited the dead man on the Tuesday 
morning—that is so, is it not?'  'Yes; but I never went there on 
the following evening. 
There was no need. 
I will tell you all.';0;0;0;0;0;0;1;0;0;0;none;0;0"""
]
examples_str = "\n".join(examples)

# Modified annotation function to use training fold as context
def _parse_label_fields(raw: str, num_labels: int) -> list[float]:
    """Extract exactly ``num_labels`` binary label values from one raw model reply.

    Fields equal to '1' (ignoring surrounding whitespace) become 1.0; anything
    else becomes 0.0. The result is truncated or zero-padded to ``num_labels``
    so every reply yields a row of the same width.
    """
    # Text segments routinely contain ';', so a plain split would shift the
    # labels. When the Text field is quoted (as in the few-shot examples),
    # everything after the last closing '";' on that line is the label section.
    _, closing_quote, tail = raw.rpartition('";')
    if closing_quote:
        fields = tail.split('\n', 1)[0].split(';')
    else:
        fields = raw.split(';')[2:]  # Unquoted reply: skip ID and Text

    values = [1.0 if field.strip() == '1' else 0.0 for field in fields[:num_labels]]
    values.extend([0.0] * (num_labels - len(values)))
    return values


def get_ollama_predictions(test_df: pd.DataFrame, train_df: pd.DataFrame, num_labels: int) -> np.ndarray:
    """Annotate every row of ``test_df`` with the ``llama3:8b`` model via ollama.

    The prompt consists of the tag definitions, the module-level few-shot
    examples (``examples_str``), up to three rows sampled from ``train_df``
    (seeded with ``SEED``) and the segment to annotate. One chat request is
    sent per test row.

    Args:
        test_df: Rows to annotate; must provide ``ID`` and ``Text`` columns.
        train_df: Training fold from which extra prompt examples are sampled.
        num_labels: Number of label values to return per row.

    Returns:
        Float array of shape ``(len(test_df), num_labels)`` holding 0.0/1.0.
    """
    # Nothing to annotate: return a correctly shaped empty result without
    # building a prompt or contacting the model.
    if len(test_df) == 0:
        return np.empty((0, num_labels), dtype=float)

    # Use a few examples from the training fold as additional context
    example_columns = ['ID', 'Text'] + [col for col in train_df.columns if col not in ['ID', 'Text']]
    train_examples_str = (
        train_df.sample(n=min(3, len(train_df)), random_state=SEED)[example_columns]
        .to_csv(index=False, sep=';')
    )

    # The segment is appended by concatenation rather than str.format():
    # the interpolated example text may contain '{' or '}', which format()
    # would try to interpret as replacement fields.
    prompt_prefix = f"""
You are an expert in narratology, annotating literary texts based on the modular guidelines from Heyns and Van Zaanen (2024) for mystery novels (whodunits). For each text segment, output a CSV row with these columns: ID;Text;Scene;Summary;Descriptive_passage;Analepsis;Prolepsis;Extradiegetic;Intradiegetic;Metadiegetic;Focalization;Voice_homodiegetic;Voice_heterodiegetic.

Here are the definitions for each tag to guide your annotation:
Scene: A segment of narrative discourse that presents the histoire (story), typically involving a coherent sequence of events with specific characters, time, and place, annotated using the SCENE tag.
Summary: A non-scene where events are condensed or narrated briefly, often as a sub-scene within a broader scene, assigned as a property of NON-SCENE.
Descriptive_passage: A non-scene focused on description rather than events, providing detailed pauses in the narrative for setting or character details, assigned as a property of NON-SCENE.
Analepsis: A flashback that shifts the narrative time from the present to the past, tagged as ANALEPSIS, which can be embedded or interruptive.
Prolepsis: A flash-forward occurring in forms like visions, prophecies, or foreshadowing, shifting narrative time to the future, tagged as PROLEPSIS, which can be embedded or interruptive.
Extradiegetic: The level of the narrator or implied author outside the story world, annotated with NARRATOR and value 0, potentially including metatextuality (meta).
Intradiegetic: The diegetic level of characters and events within the story, annotated with value 1 (and letters like 1a, 1b for sequential arrangement).
Metadiegetic: A secondary narrative embedded within the primary diegetic level, such as stories told by characters, annotated with value 2 (and letters for arrangement).
Focalization: The perspective from which the narrative is seen, indicating the narrator's access to information; can be zero/unrestricted (omniscient, knows more than characters), internal (limited to a character's knowledge), or external (observes without internal access); tag with FOCALIZATION and properties like EMBEDDED or INTERRUPTIVE.
Voice_homodiegetic: When the narrator appears in the story as a character, usually referring to themselves in the first person.
Voice_heterodiegetic: When the narrator does not appear in the story, with narration mostly in the third person.

Use 1 for 'yes' and 0 for 'no' in binary columns (e.g., Scene). 
Examples:
{examples_str}
Additional training examples from the current fold:
{train_examples_str}

Now annotate the following segments. Output one CSV row per segment, separated by newlines:
"""

    annotations = []
    total_rows = len(test_df)
    start_time = time.time()
    # Count positionally so the progress message is right for any index.
    for position, (segment_id, segment_text) in enumerate(zip(test_df['ID'], test_df['Text']), start=1):
        print(f"Processing row {position} of {total_rows} in test fold...")
        test_data = f"ID: {segment_id}\nText: {segment_text}"
        prompt = f"{prompt_prefix}{test_data}\n"
        response = ollama.chat(
            model='llama3:8b',
            messages=[{'role': 'user', 'content': prompt}],
            options={'temperature': 0},
        )
        annotations.append(response['message']['content'].strip())

    end_time = time.time()
    print(f"Total processing time for fold: {end_time - start_time:.2f} seconds")

    # Parse annotations into a rectangular (rows x num_labels) array
    pred_labels = [_parse_label_fields(raw, num_labels) for raw in annotations]
    return np.array(pred_labels, dtype=float)

# Main function to process folds and evaluate
def evaluate_folds(folder_path: str = 'Finaal_data/MD', n_splits: int = 5) -> None:
    """Run k-fold evaluation of the LLM annotator and write the results to disk.

    For each fold the test rows are annotated with ``get_ollama_predictions``
    and compared against the true labels using micro-averaged F1, micro-averaged
    ROC AUC and (subset) accuracy. Per-fold annotations are written to
    ``llm_annotations_fold_<k>.csv`` and all metrics to ``fold_metrics.xlsx``
    (one sheet per evaluated fold plus a ``Summary`` sheet).

    Args:
        folder_path: Directory with the input CSV files.
        n_splits: Number of cross-validation folds.
    """
    combined_df, num_labels = load_data(folder_path)

    # Define label columns
    label_columns = [col for col in combined_df.columns if col not in ['ID', 'Text']]
    metric_columns = ['Fold', 'F1', 'Roc_Auc', 'Accuracy']

    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    fold_results = []

    # Process each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(combined_df), start=1):
        print(f"\nProcessing Fold {fold}/{n_splits}...")
        train_df = combined_df.iloc[train_idx].reset_index(drop=True)
        test_df = combined_df.iloc[test_idx].reset_index(drop=True)

        # Get predictions for test fold
        pred_labels = np.asarray(get_ollama_predictions(test_df, train_df, num_labels))

        # Get true labels
        true_labels = test_df[label_columns].values

        # Compare full shapes, not just row counts: a different number of label
        # columns would otherwise crash inside the metric functions.
        if pred_labels.shape != true_labels.shape:
            print(f"Warning: Mismatch in shape of predictions {pred_labels.shape} and true labels {true_labels.shape} in fold {fold}")
            continue

        # Compute metrics
        fold_results.append({
            'Fold': fold,
            'F1': f1_score(true_labels, pred_labels, average='micro'),
            'Roc_Auc': roc_auc_score(true_labels, pred_labels, average='micro'),
            'Accuracy': accuracy_score(true_labels, pred_labels),
        })

        # Save annotations for this fold
        output_df = test_df.copy()
        # str() on Text as well: a missing/non-string value would break join().
        output_df['Annotated_Row'] = [
            ';'.join([str(row['ID']), str(row['Text'])] + [str(x) for x in pred])
            for row, pred in zip(test_df.to_dict('records'), pred_labels)
        ]
        output_path = f'llm_annotations_fold_{fold}.csv'
        output_df.to_csv(output_path, index=False, sep=';')
        print(f"Annotations for fold {fold} saved to {output_path}")

    # Create summary DataFrame (explicit columns keep the header even if empty)
    summary_df = pd.DataFrame(fold_results, columns=metric_columns)

    # Save to Excel. Only folds that were actually evaluated get a sheet;
    # indexing an empty per-fold frame by column would raise KeyError.
    with pd.ExcelWriter('fold_metrics.xlsx') as writer:
        for result in fold_results:
            fold_df = pd.DataFrame([result], columns=metric_columns)
            fold_df.to_excel(writer, sheet_name=f"Fold {result['Fold']}", index=False)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)

    print("\nFold Metrics Summary:")
    print(summary_df)
    print("Results saved to fold_metrics.xlsx")

# Run the full cross-validation evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    evaluate_folds()


Processing Fold 1/5...
Processing row 1 of 58 in test fold...
