In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [17]:
import pandas as pd
df = pd.read_csv("/content/movie_genre_API.csv")
df.head(10)

Unnamed: 0,prompt,response
0,"In a dystopian future, a skilled warrior named...",action
1,The movie tells the story of a young girl name...,family
2,The film is set in the 19th century and revolv...,romance
3,The story revolves around a group of thieves l...,crime
4,The movie follows the journey of a young boy n...,fantasy
5,"In this film, a group of interstellar outlaws,...",action
6,The movie follows the story of a young lion pr...,family
7,The film is centered around a young woman name...,romance
8,The film follows the life of a notorious mobst...,crime
9,"In this movie, a young hobbit named Frodo is c...",fantasy


In [18]:
# romance,family,action,crime,fantasy
genres_to_drop = ["horror", "scifi", "adventure", "mystery", "thriller"]
df = df[~df['genre'].isin(genres_to_drop)]
df = df.drop(['id', 'movie_name'], axis=1)

In [19]:
df = df.rename(columns={'synopsis': 'prompt', 'genre': 'response'})
df.head(10)

In [20]:
df_train_kaggle = df.sample(frac=0.8, random_state=42)
df_test_kaggle = df.drop(df_train_kaggle.index)

In [4]:
df_train_kaggle.to_json('train.jsonl', orient='records', lines=True)
df_test_kaggle.to_json('test.jsonl', orient='records', lines=True)

In [5]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [6]:
model_name = "NousResearch/llama-2-7b-chat-hf" # use this if you have access to the official LLaMA 2 model "meta-llama/Llama-2-7b-chat-hf", though keep in mind you'll need to pass a Hugging Face key argument
dataset_name = "/content/train.jsonl"
new_model = "llama-2-7b-custom"
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5
max_seq_length = None
packing = False
device_map = {"": 0}

In [7]:
system_message = 'Given a summary of a movie, determine its genre as either "action," "family," "romance," "crime," or "fantasy."'

In [8]:
# Load datasets
train_dataset = load_dataset('json', data_files='/content/train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='/content/test.jsonl', split="train")

# Preprocess datasets
train_dataset_mapped = train_dataset.map(lambda examples: {'text': [f'[INST] <>\n{system_message.strip()}\n<>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)
valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [f'[INST] <>\n{system_message.strip()}\n<>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [10]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to= [],
    evaluation_strategy="steps",
    eval_steps=4  # Evaluate every 20 steps
)

In [11]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,  # Pass validation dataset here
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()
trainer.model.save_pretrained(new_model)



Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
4,No log,2.160488
8,2.342400,1.758675
12,1.870400,1.481931


In [12]:
prompt1 = "In a dystopian future, a skilled warrior named Max is forced to help a group of women escape from a tyrant named Immortan Joe. Max and the women must traverse a dangerous wasteland while being pursued by Joe's army. Along the way, they encounter various obstacles and enemies, but also form unexpected alliances. The film is filled with high-speed chases, explosive battles, and intense hand-to-hand combat."
prompt = f'[INST] <>\n{system_message}\n<>\n' + prompt1 +' [/INST]'


In [14]:
logging.set_verbosity(logging.CRITICAL)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print((result[0]['generated_text'].split(' [/INST]')[-1]).split("\n")[0])

 action


In [15]:
prompt2 = "The film tells the story of a young woman named Rose who falls in love with a poor artist named Jack aboard the ill-fated R.M.S. Titanic. Despite their different social classes, they form a deep connection. The movie is filled with romantic moments, heartbreaking decisions, and a tragic ending that underscores the power of love and sacrifice."
prompt3 = f'[INST] <>\n{system_message}\n<>\n' + prompt2 +' [/INST]'

logging.set_verbosity(logging.CRITICAL)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt3)
print((result[0]['generated_text'].split(' [/INST]')[-1]).split("\n")[0])

 fantasy


In [16]:
prompt4 = "The film follows the life of a notorious mobster who rises to power in the Italian mafia. As he navigates the dangerous underworld of organized crime, he must deal with rival gangs, law enforcement, and his own personal demons. The movie is filled with violent confrontations, intense dialogue, and a deep exploration of the criminal mind."
prompt5 = f'[INST] <>\n{system_message}\n<>\n' + prompt2 +' [/INST]'

logging.set_verbosity(logging.CRITICAL)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt5)
print((result[0]['generated_text'].split(' [/INST]')[-1]).split("\n")[0])

  return fn(*args, **kwargs)


 fantasy


In [21]:
df_test = pd.read_json('/content/test.jsonl', lines=True)
df_test.head()

Unnamed: 0,prompt,response
0,The movie follows the story of a former Green ...,action
1,"In this film, a young wizard named Harry Potte...",fantasy
2,The movie is about a group of astronauts who e...,action
3,The film is set in a world where magic is real...,fantasy
4,The film tells the story of a young woman name...,romance


In [22]:
df_test.shape

(14, 2)

In [26]:
test_prompt_list = df_test["prompt"].tolist()
test_response_list = df_test["response"].tolist()

In [28]:
test_predicted_list = []
for i in range(14):
  print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
  print(str(i) + "th test case =", test_prompt_list[i])
  print("----------------------------------------------------------------------")
  print("ground_truth = ",test_response_list[i])
  print("----------------------------------------------------------------------")
  prompt = f'[INST] <>\n{system_message}\n<>\n' + test_prompt_list[i] +' [/INST]'

  logging.set_verbosity(logging.CRITICAL)
  pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
  result = pipe(prompt)
  print("predicted_value =",(result[0]['generated_text'].split(' [/INST]')[-1]).split("\n")[0])
  test_predicted_list.append((result[0]['generated_text'].split(' [/INST]')[-1]).split("\n")[0])

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
0th test case = The movie follows the story of a former Green Beret, John Rambo, who is haunted by memories of Vietnam. When his old commanding officer informs him that his unit has been killed by a deadly disease, Rambo embarks on a solo rescue mission. The film is filled with intense combat scenes, daring escapes, and a deep exploration of the effects of war on a soldier's psyche.
----------------------------------------------------------------------
ground_truth =  action
----------------------------------------------------------------------


  return fn(*args, **kwargs)


predicted_value =  action
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1th test case = In this film, a young wizard named Harry Potter discovers his magical heritage and attends Hogwarts School of Witchcraft and Wizardry. There, he makes friends, learns about magic, and uncovers the dark secrets of his past. The movie is filled with magical creatures, spellbinding duels, and quests in a world where magic is real.
----------------------------------------------------------------------
ground_truth =  fantasy
----------------------------------------------------------------------
predicted_value =  fantasy
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2th test case = The movie is about a group of astronauts who embark on a dangerous mission to save mankind. After Earth becomes uninhabitable, they must travel through a wormhole in search of a new home for humanity. The film is filled with thrilling space travel, perilous adventures on alien

In [29]:
test_predicted_list

[' action',
 ' fantasy',
 ' Fantasy',
 ' fantasy',
 ' fantasy',
 ' fantasy',
 ' crime',
 ' romance',
 ' family',
 ' fantasy',
 ' fantasy',
 ' fantasy',
 ' fantasy',
 ' crime']

In [30]:
result_df = pd.DataFrame({"content":test_prompt_list, "ground truth":test_response_list, "predicted":test_predicted_list})
result_df.head(14)

Unnamed: 0,content,ground truth,predicted
0,The movie follows the story of a former Green ...,action,action
1,"In this film, a young wizard named Harry Potte...",fantasy,fantasy
2,The movie is about a group of astronauts who e...,action,Fantasy
3,The film is set in a world where magic is real...,fantasy,fantasy
4,The film tells the story of a young woman name...,romance,fantasy
5,The film follows the journey of a hobbit named...,fantasy,fantasy
6,The movie follows the life of a notorious mobs...,crime,crime
7,The film revolves around a young woman named A...,romance,romance
8,The film follows the life of a young boy named...,family,family
9,The film revolves around a young woman named B...,romance,fantasy


In [31]:
result_df.to_csv("llama2_fintune_api_data.csv")