In [41]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files='data/cleaned_data.csv')

# split into train, validation and test
train_test_dataset = dataset['train'].train_test_split(test_size=0.4)
val_test_dataset = train_test_dataset['test'].train_test_split(test_size=0.5)

dataset['train'] = train_test_dataset['train']
dataset['validation'] = val_test_dataset['train']
dataset['test'] = val_test_dataset['test']

In [42]:
dataset

DatasetDict({
    train: Dataset({
        features: ['course', 'book_id', 'topic', 'bookclub', 'chat_crew', 'pseudonym', 'message', 'time', 'is_answer', 'page', 'response_number', 'discussion_type', 'dialogic_spell', 'uptake', 'question', 'pivot', 'chat', 'chat_history'],
        num_rows: 543
    })
    validation: Dataset({
        features: ['course', 'book_id', 'topic', 'bookclub', 'chat_crew', 'pseudonym', 'message', 'time', 'is_answer', 'page', 'response_number', 'discussion_type', 'dialogic_spell', 'uptake', 'question', 'pivot', 'chat', 'chat_history'],
        num_rows: 181
    })
    test: Dataset({
        features: ['course', 'book_id', 'topic', 'bookclub', 'chat_crew', 'pseudonym', 'message', 'time', 'is_answer', 'page', 'response_number', 'discussion_type', 'dialogic_spell', 'uptake', 'question', 'pivot', 'chat', 'chat_history'],
        num_rows: 181
    })
})

In [96]:
messages = []
messages.append(("system", "Here is the chat history of the children discussion:"))
messages.append(("placeholder", "{history}"))
messages.append(("system", "Classify this sentence into one class of the codebook."))
messages.append(("human", "{input}"))

In [97]:
from langchain_core.prompts import ChatPromptTemplate
prompt_template = ChatPromptTemplate.from_messages(messages)
prompt_template.format(input = {'input': 'ciao', 'chat_history': 'ciao'})

"System: Here is the chat history of the children discussion:\nSystem: Classify the sentence into one class of the codebook.\nHuman: {'input': 'ciao', 'chat_history': 'ciao'}"

In [140]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

dataset_csv_file = 'data/cleaned_data.csv'
text_field = 'message'
CLASS = 'discussion_type'
window_size = 5
combine_fields = ['pseudonym', 'message']
separator = ': '

data = pd.read_csv(dataset_csv_file)

data[text_field] = data[combine_fields].apply(lambda x: separator.join(x.dropna().astype(str)), axis=1)

history = []
for i in range(len(data)):
    if i >= 1 and not data.iloc[i][['book_id', 'bookclub', 'course']].equals(data.iloc[i-1][['book_id', 'bookclub', 'course']]):
        history = []

    data.at[i, 'history'] = '\n'.join(history) if history else pd.NA

    history.append(data.iloc[i][text_field])
    if len(history) > window_size:
        history.pop(0)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

requests = train_data.apply(lambda x: {'input': x[text_field], 'history': [("human", chat) for chat in x['history'].split('\n')] if not pd.isna(x['history']) else []}, axis = 1)
prompt_template = ChatPromptTemplate.from_messages(messages)
prompts = prompt_template.batch(list(requests))
prompts = list(map(lambda x: x.to_string(), prompts))

dataset_dict = DatasetDict({
    'train': Dataset.from_dict({'text': prompts, 'label': train_data[CLASS]})
})

# train, validation and test split
train_test_dataset = dataset_dict['train'].train_test_split(test_size=0.2, seed=42)

dataset_dict['train'] = train_test_dataset['train']
dataset_dict['validation'] = train_test_dataset['test']

In [141]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 579
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 145
    })
})

In [None]:
sentence = """

**Social**

"""

items = ['Deliberation', 'Social']

# find the item that appears first inside sentence 
def find_first(sentence, items):
    first = len(sentence)
    item = None
    for i in items:
        index = sentence.find(i)
        if index != -1 and index < first:
            first = index
            item = i
    return item

find_first(sentence, items)

In [2]:
dataset = load_dataset("imdb", split="train")

In [3]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [3]:
from datasets import load_dataset, load_from_disk

dataset = load_from_disk("./preprocessed/dataset_Discussion_with_history")

In [6]:
dataset['train']['prompt']

["You are an AI expert in categorizing sentences. You need to categorize the new sentence into one of the following classes: [UX, Social, Procedure, Deliberation, Seminar, Imaginative Entry, Disciplinary, Other]. If you fail to categorize the sentence, return 'None' instead of coming up with a wrong class. ### NEW SENTENCE: then click on the checkmark button ### Remember to answer with only the name of the class and nothing else. You can use the following codebook (with classes, definitions and examples) to help you classify the sentence: ### IMPORTANT CODEBOOK: ### Class: UX ### ### Definition: User‚Äôs opinion about the IMapBook interface, or media they wish we would include, user experience, media, relationship with the media. ### ### Example: ‚ÄúI'm finding this program a bit slow and difficult to work in.‚Äù ‚ÄúI am not a fan of the sound effect, but would be a fan of some pictures.‚Äù ###  ### Class: Social ### ### Definition: Discussion that establishes or maintains a relationsh

In [10]:
from utils import get_data_for_train_test, save_data, load_data

model_types = ['llama', 'mistral']
classes = ['Discussion', 'Uptake']

use_history = True
use_past_labels = True
num_docs = 2

for model_type in model_types:
    for class_ in classes:
        dataset = get_data_for_train_test(class_, use_history, use_past_labels, num_docs, model_name=model_type)
        print(dataset)
        save_data(dataset, model_type, class_, use_history, use_past_labels, num_docs > 0)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 543
    })
    validation: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
    test: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
})


Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 543/543 [00:00<00:00, 88046.82 examples/s] 
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 30126.95 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 23764.13 examples/s]


DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 543
    })
    validation: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
    test: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
})


Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 543/543 [00:00<00:00, 97196.44 examples/s] 
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 45846.31 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 27729.16 examples/s]


DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 543
    })
    validation: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
    test: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
})


Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 543/543 [00:00<00:00, 68068.59 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 25663.21 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 25242.53 examples/s]


DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 543
    })
    validation: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
    test: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 181
    })
})


Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 543/543 [00:00<00:00, 101475.10 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 34983.14 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 181/181 [00:00<00:00, 29886.19 examples/s]


In [50]:
import pickle

dataset = load_data('llama', 'Discussion', True, True, True)

indexes = {
    'train': dataset['train']['index'],
    'validation': dataset['validation']['index'],
    'test': dataset['test']['index']
}

# save to pickle file
with open('./preprocessed/indexes.pkl', 'wb') as f:
    pickle.dump(indexes, f)


In [22]:
from utils import load_data_predicts, preprocess_data

data = load_data_predicts('./pred_results/', 'Discussion', 'mistral')
prep_data = preprocess_data(dataset_file='./data/cleaned_data.csv',
                                    class_field='Discussion',
)
                                    

In [43]:
#prep_data.iloc[data['index']]
import pandas as pd

left_data = data.drop(columns=['text'])
results = pd.merge(left_data, prep_data[['message']], left_on='index', right_index=True)

In [44]:
results

Unnamed: 0,index,labels,message
0,502,Seminar,"Hi Teresa Stewart, I felt that if he truly kne..."
1,547,UX,"Each time you submit, the lady talks back to y..."
2,386,Seminar,Probably yes
3,568,Seminar,If he knew his daughter truly loved that man b...
4,787,Deliberation,I believe it is for local food.
...,...,...,...
176,794,Deliberation,what should we write
177,765,Deliberation,Article one is gonna be a big help with agains...
178,853,Seminar,in the article it says 'agriculture comprises ...
179,18,Seminar,he would have to send his daughter to the aren...


In [42]:
initial_prompt = "You are an ensemble model. You have to predict the class of the following text, based on the results of other models."

llm_model = 'llama-3-8'

model_types = results.columns
model_types.drop(columns=['index', 'message', 'labels'])

for index, row in results.iterrows():
    if llm_model.startswith('llama'):
        messages = [{
            "role": "system",
            "content": initial_prompt
        }]
    elif llm_model.startswith('mistral'):
        messages = [{"role": "user", "content": initial_prompt},
                    {"role": "assistant", "content": "Ok, send me the text to classify and the predictions of the models."}
        ]
        
    model_message = f"The text to classify is: {row['message']}\n"
    
    for model in model_types:
        model_message += f"Model {model} predicts {row[model]}.\n"

    model_message += f"The text to classify is: {row['message']}"

    messages.append({
        "role": "user",
        "content": model_message
    })

Unnamed: 0,index,labels,message
0,502,Seminar,"Hi Teresa Stewart, I felt that if he truly kne..."
1,547,UX,"Each time you submit, the lady talks back to y..."
2,386,Seminar,Probably yes
3,568,Seminar,If he knew his daughter truly loved that man b...
4,787,Deliberation,I believe it is for local food.
...,...,...,...
176,794,Deliberation,what should we write
177,765,Deliberation,Article one is gonna be a big help with agains...
178,853,Seminar,in the article it says 'agriculture comprises ...
179,18,Seminar,he would have to send his daughter to the aren...
