In [None]:
%%capture
# Environment setup cell: `%%capture` silences everything this cell prints
# (pip logs included), so installation failures will not be visible here.
import torch
# Unpack the pair returned by torch.cuda.get_device_capability(); only the
# major number is used below to choose which package set to install.
# NOTE(review): this presumably needs a CUDA runtime to be attached — confirm
# the behavior on a CPU-only session.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
# NOTE(review): unsloth is installed from the repository head and none of the
# packages below are version-pinned, so re-runs may pick up different versions.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    # This branch additionally installs packaging, ninja, einops and flash-attn.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# Trailing no-op statement; it has no effect on the cell.
pass

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from unsloth import FastLanguageModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TextStreamer
from trl import SFTTrainer
from transformers import TrainingArguments

In [None]:
import json
import pandas as pd
import numpy as np
from datasets import Dataset

In [None]:
from google.colab import files

In [None]:
# device = 'cuda'

In [None]:
# Load the tokenizer stored under the same identifier that the CNN model cell
# uses, so prompts are encoded consistently with that model.
CNN_tokenizer = AutoTokenizer.from_pretrained("Sardean-UChicago/CNN_Mistral-7B-Headline-QLoRA")

In [None]:
# Load the CNN headline model. No dtype, device or quantization arguments are
# passed, so the library defaults apply.
# NOTE(review): the identifier suggests a 7B QLoRA fine-tune — confirm that a
# default load fits in the available memory and lands on the intended device.
CNN_model = AutoModelForCausalLM.from_pretrained("Sardean-UChicago/CNN_Mistral-7B-Headline-QLoRA")

In [None]:
# Read the three CNN test-set JSON Lines files (lines=True: one record per
# line) into their own DataFrames, in the order instructions / bodies / titles.
(test_CNN_instructions_df,
 test_CNN_bodies_df,
 test_CNN_titles_df) = [
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'test_CNN_instructions.jsonl',
        'test_CNN_bodies.jsonl',
        'test_CNN_titles.jsonl',
    )
]

# Preview the first rows of the article bodies.
test_CNN_bodies_df.head()

In [None]:
# Reference headlines: the 'text' column of the CNN titles table.
true_CNN_titles = test_CNN_titles_df['text']
# Keep the first 100 entries (positional slice) and display them.
true_CNN_titles_100 = true_CNN_titles.iloc[:100]
true_CNN_titles_100

In [None]:
# true_CNN_titles_10samples = true_CNN_titles[:10]
# #true_titles_500 = true_titles[:500]
# true_CNN_titles_10samples

In [None]:
def LLM_generate_test(model,tokenizer,instructions_df,bodies_df,titles_df,
                      max_prompt_tokens=163804,max_total_tokens=30000):
  """Generate one decoded completion per article body.

  For every row position ``i`` a fixed "Give a title ..." prompt is built
  around ``bodies_df['text'].iloc[i]``, tokenized, passed to
  ``model.generate`` and decoded with ``tokenizer.batch_decode``.

  Args:
    model: object exposing ``generate(**inputs, ...)`` and a ``device``
      attribute (e.g. a transformers causal LM).
    tokenizer: callable that encodes the prompt into the keyword arguments
      for ``model.generate`` (the result must support ``.to(device)``) and
      that provides ``batch_decode``.
    instructions_df: only its length is used; it sets the number of prompts.
    bodies_df: table whose 'text' column holds the article bodies, read by
      position.
    titles_df: not used here; kept so existing callers keep working.
    max_prompt_tokens: truncation limit forwarded to the tokenizer as
      ``max_length``. Defaults to the value that was previously hard-coded.
    max_total_tokens: limit forwarded to ``model.generate`` as ``max_length``.
      Defaults to the value that was previously hard-coded.

  Returns:
    A list with one element per prompt; each element is the list of strings
    returned by ``tokenizer.batch_decode`` for that prompt.
  """
  generated_texts = []
  # NOTE: the loop count comes from instructions_df while the text comes from
  # bodies_df, so both tables are expected to have the same number of rows.
  for i in range(len(instructions_df)):
    prompt = f"""Give a title to this writing in my style

### Text
{bodies_df['text'].iloc[i]}

### Title

"""

    inputs = tokenizer(prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens
    )
    # Keep the encoded prompt on the same device as the model; generation
    # fails with a device mismatch when the model lives on a GPU otherwise.
    inputs = inputs.to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=max_total_tokens,
        num_return_sequences=1,
        use_cache=True
    )
    # The keyword was previously misspelled ("skin_special_tokens"), so the
    # request to strip special tokens never took effect.
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_texts.append(generated_text)
    # Progress report every 10 prompts (the counter is a prompt index).
    if i%10 ==0:
      print(f"Epoch {i}")
  return generated_texts

In [None]:
# sample_CNN_instructions_df = test_CNN_instructions_df[:10]
# sample_CNN_bodies_df = test_CNN_bodies_df[:10]
# sample_CNN_titles_df = test_CNN_titles_df[:10]

In [None]:
# sample_generate = LLM_generate_test(CNN_model,CNN_tokenizer,sample_CNN_instructions_df,sample_CNN_bodies_df,sample_CNN_titles_df)

In [None]:
def title_parse(generated_text):
  """Extract the generated title from each decoded model output.

  Args:
    generated_text: iterable of lists of decoded strings, as returned by
      ``LLM_generate_test`` (one inner list per prompt).

  Returns:
    A flat list with exactly one string per decoded text. For a text that
    contains the '### Title' marker this is what follows the first marker,
    cut at the first '</s>' and stripped. For a text without the marker it is
    an empty string. Such texts used to be dropped, which shortened the list
    and broke the one-to-one pairing with the true titles.
  """
  titles = []
  for item in generated_text:
    for text in item:
      if '### Title' in text:
        # Take the segment after the first marker, then drop the end-of-sequence
        # token (if present) and surrounding whitespace.
        title = text.split('### Title')[1].strip().split('</s>')[0].strip()
      else:
        # Placeholder keeps predictions aligned with their inputs.
        title = ''
      titles.append(title)
  return titles

In [None]:
# pred_titles = title_parse(sample_generate)

In [None]:
def create_df(true_titles,pred_titles):
  """Pair reference and predicted titles in a two-column DataFrame.

  Args:
    true_titles: reference titles; becomes the 'true_titles' column.
    pred_titles: predicted titles; becomes the 'pred_titles' column.

  Returns:
    A pandas DataFrame with the columns 'true_titles' and 'pred_titles'.
  """
  columns = dict(true_titles=true_titles, pred_titles=pred_titles)
  return pd.DataFrame(columns)

In [None]:
# pred_titles

In [None]:
# create_df(true_CNN_titles_10samples,pred_titles)

In [None]:
def test_pipeline(model,tokenizer,instructions_df,bodies_df,titles_df):
  """Generate titles for a test set and pair them with the reference titles.

  Reads the 'text' column of ``titles_df`` as the references, runs
  ``LLM_generate_test`` with the given arguments, parses its output with
  ``title_parse`` and returns the table built by ``create_df``.
  """
  # Look up the reference column first so a missing column fails before the
  # generation step starts.
  reference_titles = titles_df['text']
  raw_outputs = LLM_generate_test(model,tokenizer,instructions_df,bodies_df,titles_df)
  return create_df(reference_titles, title_parse(raw_outputs))

In [None]:
# Run the CNN model on the first 100 rows of the CNN test set and collect the
# true and predicted titles side by side.
CNN_model_on_CNN_test_100 = test_pipeline(
    model=CNN_model,
    tokenizer=CNN_tokenizer,
    instructions_df=test_CNN_instructions_df.iloc[:100],
    bodies_df=test_CNN_bodies_df.iloc[:100],
    titles_df=test_CNN_titles_df.iloc[:100],
)

In [None]:
# Save the true/predicted title table as CSV; index=False leaves the row index
# out of the file.
CNN_model_on_CNN_test_100.to_csv('CNN_model_on_CNN_test_100.csv',index=False)

In [None]:
# Hand the saved CSV to google.colab's files.download (presumably triggers a
# browser download; only available inside Colab).
files.download('CNN_model_on_CNN_test_100.csv')

In [None]:
# prompt = """Give a title to this writing in my style

# ### Text

# Malaysia and Indonesia have issued separate statements calling for restraint after Iran's retaliatory strikes on Israel.

# Both Southeast Asian countries, where Islam is the predominant religion, reiterated support for Palestinian rights and causes. Malaysia and Indonesia do not recognize Israel.

# "Malaysia strongly urges all parties in the Middle East region to refrain and exercise great caution and not to escalate the already tense situation," its Ministry of Foreign Affairs said Sunday.

# ### Title

# """

In [None]:
# test_prompt1 = f"""Give a title to this writing in my style

# ### Text
# {CNN_prompt1[:1000]}

# ### Title

# """
# test_prompt1

In [None]:
# inputs = CNN_tokenizer(test_prompt1,
#     return_tensors="pt",
#     truncation=True,
#     max_length=163804
# )


# outputs = CNN_model.generate(
#     **inputs,
#     max_length=30000,
#     num_return_sequences=1,
#     use_cache=True
# )

# #generated_text = CNN_tokenizer.batch_decode(outputs, skip_special_tokens=True)
# CNN_generated_text1 = CNN_tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# len(prompt), len(CNN_prompt1)

In [None]:
# Load the tokenizer stored under the same identifier that the FOX model cell
# uses, so prompts are encoded consistently with that model.
FOX_tokenizer = AutoTokenizer.from_pretrained("Sardean-UChicago/FOX_Mistral-7B-Headline-QLoRA")

In [None]:
# Load the FOX headline model. No dtype, device or quantization arguments are
# passed, so the library defaults apply.
# NOTE(review): CNN_model is still referenced by later cells, so both models
# are held at once — confirm that the session has enough memory for both.
FOX_model = AutoModelForCausalLM.from_pretrained("Sardean-UChicago/FOX_Mistral-7B-Headline-QLoRA")

In [None]:
# sample_generate_FOX = LLM_generate_test(FOX_model,FOX_tokenizer,sample_CNN_instructions_df,sample_CNN_bodies_df,sample_CNN_titles_df)

In [None]:
# pred_titles_sample_FOX = title_parse(sample_generate_FOX)

In [None]:
# create_df(true_CNN_titles_10samples,pred_titles_sample_FOX)

In [None]:
# Read the three FOX test-set JSON Lines files (lines=True: one record per
# line) into their own DataFrames, in the order instructions / bodies / titles.
(test_FOX_instructions_df,
 test_FOX_bodies_df,
 test_FOX_titles_df) = [
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'test_FOX_instructions.jsonl',
        'test_FOX_bodies.jsonl',
        'test_FOX_titles.jsonl',
    )
]

# Preview the first rows of the article bodies.
test_FOX_bodies_df.head()

In [None]:
# Reference headlines for the FOX test set, taken from the 'text' column.
# Not referenced again in the cells shown here; test_pipeline reads the column
# from the titles table itself.
FOX_true_titles = test_FOX_titles_df['text'] # true titles accompanying text

In [None]:
# Run the FOX model on the first 100 rows of the FOX test set and collect the
# true and predicted titles side by side.
FOX_model_on_FOX_test_100 = test_pipeline(
    model=FOX_model,
    tokenizer=FOX_tokenizer,
    instructions_df=test_FOX_instructions_df.iloc[:100],
    bodies_df=test_FOX_bodies_df.iloc[:100],
    titles_df=test_FOX_titles_df.iloc[:100],
)

In [None]:
# Save the true/predicted title table as CSV; index=False leaves the row index
# out of the file.
FOX_model_on_FOX_test_100.to_csv('FOX_model_on_FOX_test_100.csv',index=False)

In [None]:
# Hand the saved CSV to google.colab's files.download (presumably triggers a
# browser download; only available inside Colab).
files.download('FOX_model_on_FOX_test_100.csv')

In [None]:
# Cross-domain run: the CNN model on the first 100 rows of the FOX test set,
# collecting the true and predicted titles side by side.
CNN_model_on_FOX_test_100 = test_pipeline(
    model=CNN_model,
    tokenizer=CNN_tokenizer,
    instructions_df=test_FOX_instructions_df.iloc[:100],
    bodies_df=test_FOX_bodies_df.iloc[:100],
    titles_df=test_FOX_titles_df.iloc[:100],
)

In [None]:
# Save the true/predicted title table as CSV; index=False leaves the row index
# out of the file.
CNN_model_on_FOX_test_100.to_csv('CNN_model_on_FOX_test_100.csv',index=False)

In [None]:
# Hand the saved CSV to google.colab's files.download (presumably triggers a
# browser download; only available inside Colab).
files.download('CNN_model_on_FOX_test_100.csv')

In [None]:
# Cross-domain run: the FOX model on the first 100 rows of the CNN test set,
# collecting the true and predicted titles side by side.
FOX_model_on_CNN_test_100 = test_pipeline(
    model=FOX_model,
    tokenizer=FOX_tokenizer,
    instructions_df=test_CNN_instructions_df.iloc[:100],
    bodies_df=test_CNN_bodies_df.iloc[:100],
    titles_df=test_CNN_titles_df.iloc[:100],
)

In [None]:
# Save the true/predicted title table as CSV; index=False leaves the row index
# out of the file.
FOX_model_on_CNN_test_100.to_csv('FOX_model_on_CNN_test_100.csv',index=False)

In [None]:
# Hand the saved CSV to google.colab's files.download (presumably triggers a
# browser download; only available inside Colab).
files.download('FOX_model_on_CNN_test_100.csv')