In [None]:
!pip install transformers[torch] scikit-learn datasets openai==1.57.0

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting openai==1.57.0
  Downloading openai-1.57.0-py3-none-any.whl.metadata (24 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading openai-1.57.0-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.9/389.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m 

In [None]:
from datasets import load_dataset

# Load IMDB, then replace the train and test splits with small samples
# drawn after a fixed-seed shuffle.
dataset = load_dataset("imdb")
for split_name, sample_size in (("train", 1000), ("test", 100)):
    shuffled_split = dataset[split_name].shuffle(seed=42)
    dataset[split_name] = shuffled_split.select(range(sample_size))

# Pandas views of the two sampled splits.
train_data = dataset["train"].to_pandas()
test_data = dataset["test"].to_pandas()

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# params for experimentation
ngram_range = (1, 2)
stop_words = "english"
min_df = 1
max_features = 5000
random_state = 42  # fixes SGD's data shuffling so repeated runs give the same scores
# -------------------------

# Baseline: TF-IDF features fed to a linear classifier trained with SGD.
tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words=stop_words, min_df=min_df, max_features=max_features)

# The vocabulary is learned from the training texts only; the test texts are
# transformed with that fitted vocabulary further down.
X_train = tfidf.fit_transform(train_data["text"])
y_train = train_data["label"]

sgd = SGDClassifier(random_state=random_state)
sgd.fit(X_train, y_train)

X_test = tfidf.transform(test_data["text"])
y_test = test_data["label"]

y_pred = sgd.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text summary of precision / recall / F1 for `y_pred` against `y_test`.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.77      0.80        53
           1       0.76      0.83      0.80        47

    accuracy                           0.80       100
   macro avg       0.80      0.80      0.80       100
weighted avg       0.80      0.80      0.80       100



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import os  # no longer used in this cell; import kept in case other cells rely on it

# params for experimentation
model_name = "distilbert-base-uncased"
batch_size = 16
learning_rate = 2e-5
num_epochs = 3
weight_decay = 0.01
max_length = 512
# -------------------------

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
  """Tokenise a batch of reviews, truncating and padding each text to `max_length` tokens."""
  return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=max_length)


# Only the train and test splits are used by the Trainer, so tokenise just those
# instead of mapping over every split in the DatasetDict.
for split in ("train", "test"):
  dataset[split] = dataset[split].map(tokenize, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,
    # Disables logging integrations (e.g. Weights & Biases); replaces the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# `PredictionOutput.label_ids` holds the ground-truth labels, so scoring it against
# `y_test` always yields a perfect report. The model's outputs are the logits in
# `.predictions`; the predicted class is the argmax over the class axis.
test_logits = trainer.predict(dataset["test"]).predictions
y_pred = test_logits.argmax(axis=-1)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       1.00      1.00      1.00        47

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [None]:
from openai import OpenAI
from google.colab import userdata


# Read the API key from Colab's user data rather than hard-coding it in the notebook.
OPENAI_KEY = userdata.get("OPENAI_KEY")

class LLM:
  """Minimal callable wrapper around the OpenAI chat-completions API.

  Calling an instance with a prompt string sends it as a single user message
  and returns the content of the first choice in the response.
  """

  def __init__(self, api_key=OPENAI_KEY, model="gpt-4o-mini"):
    """Create the API client.

    Args:
      api_key: OpenAI API key; defaults to the key loaded above.
      model: name of the chat model to query; defaults to "gpt-4o-mini".
    """
    self.client = OpenAI(api_key=api_key)
    self.model = model

  def __call__(self, prompt):
    """Send `prompt` as one user message and return the reply's content."""
    response = self.client.chat.completions.create(
      model=self.model,
      messages=[
        {"role": "user", "content": prompt}
      ]
    )
    message = response.choices[0].message.content
    return message

llm = LLM()

In [None]:
import re

from tqdm import tqdm

# experiment with few shot, chain of thought etc
prompt = """
  You are a sentiment classifier for movie reviews

  If a review is negative output 0, if it is positive output 1

  Only output 0 or 1, nothing else

  {review}
"""


def parse_label(response):
  """Extract the 0/1 sentiment label from an LLM reply.

  A bare `int(response)` fails as soon as the model adds anything around the
  digit (e.g. "1." or "Label: 1"), so the first standalone 0 or 1 is used.

  Raises:
    ValueError: if the reply contains no standalone 0 or 1; the message
      includes the offending reply to make prompt debugging easier.
  """
  match = re.search(r"\b[01]\b", response or "")
  if match is None:
    raise ValueError(f"Could not parse a 0/1 label from LLM response: {response!r}")
  return int(match.group())


y_pred = [parse_label(llm(prompt.format(review=text))) for text in tqdm(test_data["text"])]

print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:58<00:00,  1.71it/s]

              precision    recall  f1-score   support

           0       0.96      0.98      0.97        53
           1       0.98      0.96      0.97        47

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100






In [None]:
import re

# This is the prompt you need to optimise
# NOTE: the prompt must contain the {review} placeholder; without it
# `.format(review=...)` silently drops the review and the model answers blind.
generation_prompt = """
  You are a movie representative responding to movie reviews

  Try to provide a thoughtful answer to the person that wrote a review

  Review:
  {review}
"""

answers = [llm(generation_prompt.format(review=text)) for text in tqdm(test_data["text"])]

evaluation_prompt = """
  You are an evaluator for responses given to movie reviews

  I will give you the review and the answer given from a repesentative of the movie

  I want you to rate from 0 to 5 how helpful the response has been.

  Only output an integer between 0 and 5.

  Review:
  {review}

  Answer:
  {answer}
"""


def parse_score(response):
  """Extract the 0-5 helpfulness score from an evaluator reply.

  Uses the first standalone digit between 0 and 5, so replies such as "4."
  or "Score: 4" are accepted.

  Raises:
    ValueError: if the reply contains no standalone digit in 0-5; the
      message includes the offending reply.
  """
  match = re.search(r"\b[0-5]\b", response or "")
  if match is None:
    raise ValueError(f"Could not parse a 0-5 score from LLM response: {response!r}")
  return int(match.group())


scores = [
  parse_score(llm(evaluation_prompt.format(review=review, answer=answer)))
  for review, answer in tqdm(list(zip(test_data["text"], answers)))
]

# Mean helpfulness score over all evaluated answers.
print(sum(scores)/len(scores))

100%|██████████| 100/100 [01:29<00:00,  1.12it/s]
100%|██████████| 100/100 [00:56<00:00,  1.76it/s]

0.0



