In [None]:
!huggingface-cli login --token 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

In [None]:
!pip uninstall bitsandbytes -y
!pip uninstall accelerate -y
!pip install -U bitsandbytes
!pip install -U accelerate

In [None]:
# Import the two packages and report their versions so the log shows which builds are in use.
import bitsandbytes as bnb
import accelerate

print(f"bitsandbytes version: {bnb.__version__}")
print(f"accelerate version: {accelerate.__version__}")

In [None]:
import os
import sys
import torch
import warnings
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Turn off the memory-efficient and flash scaled-dot-product-attention CUDA backends.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
def get_k_most_similar_texts_by_tfidf(df, target_text, texts=None, k=5):
    """Return the ``k`` candidate texts most similar to ``target_text`` under TF-IDF.

    A TF-IDF model over word 1- to 3-grams is fitted on the candidates plus the
    target, and candidates are ranked by cosine similarity to the target.

    Args:
        df: DataFrame supplying the candidates when ``texts`` is not given. Columns
            are read by position: 1 is the text to compare, 4 is the label handed
            to ``get_label`` (position 2 is carried along but not used here).
        target_text: The query text.
        texts: Optional pre-built candidate list of tuples whose first element is
            the text and whose last element is the label. When provided, ``df`` is
            not read, so callers can build the list once and reuse it.
        k: Maximum number of results.

    Returns:
        A list of ``(text, label_symbol, cosine_similarity)`` tuples, most similar
        first. Empty when there are no candidates.
    """
    if texts is None:
        # Explicit positional selection: integer keys on a label-indexed row
        # (``row[1]``) are deprecated positional access in pandas. itertuples
        # also avoids building one Series per row as iterrows does.
        texts = list(df.iloc[:, [1, 2, 4]].itertuples(index=False, name=None))

    if len(texts) == 0:
        # Nothing to compare against; cosine_similarity cannot take an empty matrix.
        return []

    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    # The target is appended last so it shares the vocabulary with the candidates.
    text_vectors = vectorizer.fit_transform([text[0] for text in texts] + [target_text])

    # cosine_similarity returns a (1, n_candidates) array; keep its single row.
    cosine_similarities = cosine_similarity(text_vectors[-1], text_vectors[:-1])[0]

    # Indices of the k highest scores, best first.
    top_indices = cosine_similarities.argsort()[::-1][:k]

    return [(texts[i][0], get_label(texts[i][-1]), cosine_similarities[i]) for i in top_indices]

In [None]:
def get_label(text):
    """Translate a numeric class value into its prompt symbol.

    A value equal to 1 maps to "58" and a value equal to 0 maps to "47".
    Any other value yields ``None``.
    """
    # Checked in the same order as the mapping is listed: 1 first, then 0.
    for class_value, symbol in ((1, "58"), (0, "47")):
        if text == class_value:
            return symbol
    return None

In [None]:
class AYA23Generator:
    """Chat-style text generator built on a Hugging Face causal LM and its tokenizer.

    The model and tokenizer are loaded eagerly when the object is constructed.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the model
            and the tokenizer.
        quantize_4bit: When True, load the weights with a 4-bit NF4
            ``BitsAndBytesConfig`` (double quantization, bfloat16 compute).
        use_flash_attention: When True, request
            ``attn_implementation="flash_attention_2"`` from the model loader.
    """

    def __init__(self, model_name, quantize_4bit=True, use_flash_attention=False):
        self.model_name = model_name
        self.quantize_4bit = quantize_4bit
        self.use_flash_attention = use_flash_attention
        # Device the tokenized prompts are moved to before generation.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.tokenizer = None
        self._load_model()

    def _load_model(self):
        """Load the model (optionally 4-bit quantized) and tokenizer into ``self``."""
        quantization_config = None
        if self.quantize_4bit:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )

        # None leaves the choice of attention implementation to the library.
        attn_implementation = None
        if self.use_flash_attention:
            attn_implementation = "flash_attention_2"

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            attn_implementation=attn_implementation,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Model and tokenizer loaded successfully.")

    def get_message_format(self, prompts):
        """Wrap each prompt in a single-turn chat: ``[{"role": "user", "content": p}]``."""
        return [[{"role": "user", "content": p}] for p in prompts]

    def generate_responses(self, prompts, temperature=0.3, top_p=0.75, top_k=0, max_new_tokens=1024):
        """Sample one completion per prompt and return the decoded texts.

        Args:
            prompts: An iterable of prompt strings. A single string is treated as
                one prompt rather than being iterated character by character.
            temperature, top_p, top_k, max_new_tokens: Forwarded to
                ``model.generate``; sampling is always enabled.

        Returns:
            A list with one decoded completion per prompt, in input order, with
            the prompt tokens and special tokens removed. Empty input returns ``[]``.
        """
        if isinstance(prompts, str):
            prompts = [prompts]
        prompts = list(prompts)
        if not prompts:
            # Avoid calling the tokenizer/model with an empty batch.
            return []

        messages = self.get_message_format(prompts)
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
        # All rows share this padded length, so one offset strips every prompt.
        prompt_padded_len = len(input_ids[0])
        gen_tokens = self.model.generate(
            input_ids,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            do_sample=True,
        )
        # Keep only the newly generated tokens of each sequence.
        gen_tokens = [gt[prompt_padded_len:] for gt in gen_tokens]
        return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

In [None]:
# Load the test and train splits and the results table that predictions are written into.
test_df = pd.read_csv('/kaggle/input/news-dataset/test.csv')
train_df = pd.read_csv('/kaggle/input/news-dataset/train.csv')
aya_df = pd.read_csv('/kaggle/input/news-dataset/Symbol_Tuning_Results_All.csv')
# Column of aya_df to fill; the number after the second underscore is parsed later
# as the number of few-shot examples (k).
column_name_to_write = 'predicted_k_0'

In [None]:
# nan_indices = aya_df.index[pd.isna(aya_df[column_name_to_write])].tolist()
# if nan_indices:
#     start_row = nan_indices[0]
#     print(f"The first NaN value is at index: {start_row}")
# else:
#     print(f"No NaN values found in column {column_name_to_write}")

In [None]:
# columns_to_update = ['predicted_k_0', 'predicted_k_1', 'predicted_k_2', 'predicted_k_3',
#                      'predicted_k_4', 'predicted_k_5', 'predicted_k_10', 'predicted_k_20',
#                      'predicted_k_50']
# aya_df[columns_to_update] = aya_df[columns_to_update].applymap(lambda x: np.nan if x in [0, 1] else x)

In [None]:
# Preview the first three training rows.
train_df.head(3)

In [None]:
# Preview the first ten rows of the results table.
aya_df.head(10)

In [None]:
# Resume point: index label of the first row whose prediction is still missing.
missing_rows = aya_df.index[pd.isna(aya_df[column_name_to_write])].tolist()
# When every row is already filled, start past the end so the prediction loop is a
# no-op instead of this cell raising IndexError.
start_row = missing_rows[0] if missing_rows else len(aya_df)
print(start_row)

In [None]:
# NOTE(review): '/content' looks like a Colab path while the data is read from
# /kaggle/input; confirm this import-path entry is still needed.
sys.path.append('/content')
# Model identifier handed to AYA23Generator, which loads the model and tokenizer on construction.
MODEL_NAME = "CohereForAI/aya-23-8B"
generator = AYA23Generator(MODEL_NAME)

In [None]:
# Template used when no in-context examples are requested (k = 0).
with open('/kaggle/input/news-prompt/base_prompt.txt', 'r', encoding='utf-8') as prompt_file:
    base_prompt = prompt_file.read()

# Template used when in-context examples are inserted (k > 0).
with open('/kaggle/input/news-prompt/kshot_prompt.txt', 'r', encoding='utf-8') as prompt_file:
    kshot_prompt = prompt_file.read()

In [None]:
# Directory that receives one rendered prompt file per processed row.
os.makedirs('Prompt', exist_ok=True)

In [None]:
# Number of in-context examples, parsed once from the target column name
# (e.g. 'predicted_k_5' -> 5). Zero selects the base prompt, otherwise the k-shot one.
k_shot = int(column_name_to_write.split('_')[2])
prompt_fa_kshot = base_prompt if k_shot == 0 else kshot_prompt

try:
    for i in range(start_row, len(aya_df)):
        # Row of test_df paired with results row i.
        # NOTE(review): presumably aya_df stacks several copies of test_df; confirm.
        test_df_counter = i % len(test_df)
        print(f"test_df_counter is {test_df_counter}")

        target_text = aya_df['text'][i]
        # NOTE(review): texts longer than 10000 characters are cut to 8000, while texts
        # between 8000 and 10000 are kept whole; confirm the two thresholds are intended.
        if len(target_text) > 10000:
            target_text = target_text[:8000]

        new_prompt = prompt_fa_kshot.replace("^^body^^", target_text)

        if k_shot != 0:
            similar_texts = get_k_most_similar_texts_by_tfidf(
                train_df,
                test_df['title'][test_df_counter] + '\n' + test_df['text'][test_df_counter],
                k=k_shot,
            )
            # Build the examples by plain concatenation. Running str.format over the
            # whole prompt would choke on any '{' or '}' inside the news text.
            # NOTE(review): this assumes the template has no '{{'/'}}' escapes that
            # relied on str.format; confirm against kshot_prompt.txt.
            sample_str = ''.join(
                'متن نمونه: ' + text[0] + ' خروجی: ' + text[1] + '\n' + '\n'
                for text in similar_texts
            )
            new_prompt = new_prompt.replace('SAMPLES_HERE', sample_str)

        # Keep the exact prompt sent to the model for later inspection.
        with open(f'Prompt/prompt{i}.txt', 'w', encoding='utf-8') as f:
            f.write(new_prompt)

        raw_answer = generator.generate_responses([new_prompt])[0]
        torch.cuda.empty_cache()

        try:
            result = int(raw_answer)
        except ValueError:
            # Leave the cell empty rather than aborting the whole run on one bad answer.
            print(f"row {i}: could not parse model answer {raw_answer!r} as an integer; leaving it empty")
            continue

        aya_df.at[i, column_name_to_write] = result

        print(f"answer of row {i} is {result} and k is {k_shot}.     Text type: {aya_df['text_type'][i]}  Real tag: {aya_df['real_tag'][i]}")

        # Periodic checkpoint.
        if i % 20 == 0:
            aya_df.to_csv('Symbol_Tuning_Results_All.csv', index=False)
            print(f"dataframe saved to csv file at iteration {i}")
finally:
    # Always persist on exit (normal end, error or interrupt); otherwise rows
    # processed after the last multiple of 20 are never written.
    aya_df.to_csv('Symbol_Tuning_Results_All.csv', index=False)
    print("dataframe saved to csv file at end of run")