<a href="https://colab.research.google.com/github/Slebbon/TextGeneration_Projet_PSL_EnC/blob/main/Flan_Eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install necessary packages
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q -U bitsandbytes

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import json


In [11]:
#Load the pre-trained text-to-text model (Flan-T5)
flan_model_path = '/content/drive/MyDrive/Flan_10Percent'  # Path to the pre-trained Flan-T5 model
flan_tokenizer = AutoTokenizer.from_pretrained(flan_model_path)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(flan_model_path)


In [12]:
# Check if GPU is available and use it if possible
device = 0 if torch.cuda.is_available() else -1

In [13]:
# Define the pipeline for the Flan-T5 model
flan_text_generator = pipeline("text2text-generation", model=flan_model, tokenizer=flan_tokenizer, device=0)

In [14]:
# Function to generate responses with the Flan-T5 model
def get_answer_flan(prompt, print_answer=False, save=True, max_new_tokens=128):
    if isinstance(prompt, str):
        prompt = [prompt]
    answers = flan_text_generator(prompt, max_new_tokens=max_new_tokens, truncation=True)
    answers = list(map(lambda x: x['generated_text'], answers))
    if print_answer:
        for p, a in zip(prompt, answers):
            print(f'Prompt: {p}')
            print(f'Answer: {a}\n')
    if save:
        return answers
    else:
        return None

In [15]:
# Classifier
from transformers import AutoModelForSequenceClassification, pipeline
from peft import PeftModel

id2label = {0:'Shakespeare', 1: 'Trump', 2:'Other'}
label2id = {value:key for key,value in id2label.items()}

class_model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", device_map=0, id2label=id2label, label2id=label2id)
class_model = PeftModel.from_pretrained(class_model, '/content/drive/MyDrive/classifier_shrump')
class_tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/classifier_shrump', fast=True)
classifier = pipeline(model=class_model, tokenizer=class_tokenizer, task='text-classification', device_map=0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'Ele

In [16]:
# Load the dataset
splits = {'test': 'all/validation-00000-of-00001.parquet', 'validation': 'abstract_algebra/validation-00000-of-00001.parquet', 'dev': 'abstract_algebra/dev-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/cais/mmlu/" + splits["test"])

In [17]:
# Get 10% of the stratified dataset
main_90, test_10 = train_test_split(df['question'].tolist(), stratify=df['subject'].tolist(), test_size=0.1)


In [18]:
# Generate responses with the Flan-T5 model
answers_flan = get_answer_flan(test_10, print_answer=False, save=True)

In [19]:
# Save the responses to a JSON file
with open('answers_flan.json', 'w') as f:
    json.dump(answers_flan, f)


In [20]:
# Classify the responses generated by the Flan-T5 model
classification_flan = classifier(answers_flan, truncation=True)

# Calculate the label distribution for the generated responses
values_flan = pd.Series(list(map(lambda x: x['label'], classification_flan))).value_counts(normalize=True)

# Display the label distribution
print(values_flan)

# Example of generated responses
print(f'Example of response generated by the Flan-T5 model: {answers_flan[0]}')


Other          0.766234
Shakespeare    0.207792
Trump          0.025974
Name: proportion, dtype: float64
Example of response generated by the Flan-T5 model: Scenario 2
