In [1]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
!pip uninstall bitsandbytes -y
!pip uninstall accelerate -y
!pip install -U bitsandbytes
!pip install -U accelerate

[0mFound existing installation: accelerate 0.32.1
Uninstalling accelerate-0.32.1:
  Successfully uninstalled accelerate-0.32.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.33.0


In [3]:
import bitsandbytes as bnb
import accelerate

# Report the versions the kernel actually imported after the reinstall, one per line.
print(
    f"bitsandbytes version: {bnb.__version__}\n"
    f"accelerate version: {accelerate.__version__}"
)

bitsandbytes version: 0.43.3
accelerate version: 0.33.0


In [4]:
import os
import sys
import torch
import warnings
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Silence every Python warning for the rest of the session (this also hides deprecation
# warnings raised by pandas / transformers).
warnings.filterwarnings('ignore')
# Turn off the memory-efficient and flash kernels of PyTorch's scaled-dot-product attention.
# NOTE(review): presumably done to work around a kernel incompatibility on the target GPU — confirm.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [5]:
def get_k_most_similar_texts_by_tfidf(df, target_text, texts=None, k=5):
    """Return the ``k`` candidates most similar to ``target_text`` by TF-IDF cosine similarity.

    Args:
        df: DataFrame providing the candidate pool when ``texts`` is not given. The columns
            at positions 1, 2 and 4 are read; position 1 is the text that gets vectorized and
            position 4 is the tag handed to ``get_label``.
        target_text: The query string the candidates are compared against.
        texts: Optional pre-built candidate pool: an iterable of tuples whose first element
            is the text to vectorize and whose last element is the tag. When provided,
            ``df`` is not read.
        k: Maximum number of results to return.

    Returns:
        A list of ``(text, label, similarity)`` tuples ordered from most to least similar,
        where ``label`` is the result of ``get_label`` on the candidate's tag. The list is
        empty when there are no candidates.
    """
    if texts is None:
        # Select the columns by position explicitly. Indexing a string-labelled row with
        # integer keys (row[1]) relies on a deprecated positional fallback in pandas.
        texts = list(df.iloc[:, [1, 2, 4]].itertuples(index=False, name=None))
    else:
        texts = list(texts)

    # Nothing to compare against: avoid fitting/comparing on an empty candidate matrix.
    if not texts:
        return []

    # The target is appended last so it shares the vocabulary with the candidates.
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    text_vectors = vectorizer.fit_transform([text[0] for text in texts] + [target_text])

    # cosine_similarity returns a 1 x n matrix; keep its single row.
    cosine_similarities = cosine_similarity(text_vectors[-1], text_vectors[:-1])[0]

    # Indices of the k highest similarities, best first.
    top_indices = cosine_similarities.argsort()[::-1][:k]

    return [
        (texts[i][0], get_label(texts[i][-1]), cosine_similarities[i])
        for i in top_indices
    ]

In [6]:
def get_label(text):
    """Translate a binary tag into the symbol string used in the prompts.

    A value equal to 1 maps to "58" and a value equal to 0 maps to "47".
    Any other value yields None.
    """
    if text == 1:
        return "58"
    return "47" if text == 0 else None

In [7]:
class AYA23Generator:
    """Wrapper that loads a causal language model with its tokenizer and generates chat replies.

    The model and tokenizer are loaded eagerly in the constructor.
    """

    def __init__(self, model_name, quantize_4bit=True, use_flash_attention=False):
        """Store the configuration and load the model.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
            quantize_4bit: When True, load the weights with a 4-bit bitsandbytes configuration.
            use_flash_attention: When True, request the ``flash_attention_2`` attention
                implementation.
        """
        self.model_name = model_name
        self.quantize_4bit = quantize_4bit
        self.use_flash_attention = use_flash_attention
        # Device the tokenized prompts are moved to before generation.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.tokenizer = None
        self._load_model()

    def _load_model(self):
        """Instantiate ``self.model`` and ``self.tokenizer`` according to the stored options."""
        quantization_config = None
        if self.quantize_4bit:
            # NF4 weights with double quantization; computation runs in bfloat16.
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )

        attn_implementation = None
        if self.use_flash_attention:
            attn_implementation = "flash_attention_2"

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            attn_implementation=attn_implementation,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        print("Model and tokenizer loaded successfully.")

    def get_message_format(self, prompts):
        """Wrap every prompt string in a single-turn conversation with role ``user``.

        Args:
            prompts: Iterable of prompt strings.

        Returns:
            A list with one conversation (a one-element list of message dicts) per prompt.
        """
        return [[{"role": "user", "content": p}] for p in prompts]

    def generate_responses(self, prompts, temperature=0.3, top_p=0.75, top_k=0, max_new_tokens=1024):
        """Sample one reply per prompt and return the decoded texts.

        Args:
            prompts: Iterable of prompt strings; each is sent as a single user turn.
            temperature: Sampling temperature forwarded to ``generate``.
            top_p: Nucleus sampling threshold forwarded to ``generate``.
            top_k: Top-k sampling value forwarded to ``generate``.
            max_new_tokens: Upper bound on the number of generated tokens per prompt.

        Returns:
            A list of decoded replies (special tokens removed), in the order of ``prompts``.
            An empty list is returned when no prompts are given.
        """
        messages = self.get_message_format(prompts)
        if not messages:
            # Nothing to generate; the tokenizer cannot build a batch from zero conversations.
            return []

        # return_dict=True also yields the attention mask. Without it, padded positions of
        # shorter prompts in a batch would be attended to as if they were real tokens.
        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            padding=True,
            return_tensors="pt",
            return_dict=True,
        ).to(self.device)
        input_ids = inputs["input_ids"]
        prompt_padded_len = input_ids.shape[1]

        gen_tokens = self.model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"],
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            max_new_tokens=max_new_tokens,
            do_sample=True,
        )
        # generate() returns prompt + continuation; drop the (padded) prompt part.
        gen_tokens = [gt[prompt_padded_len:] for gt in gen_tokens]
        return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

In [8]:
# Load the test split, the training split (passed to the TF-IDF retrieval as the pool of
# few-shot examples) and the results table that this run's predictions are written into.
test_df = pd.read_csv('/kaggle/input/news-dataset/test.csv')
train_df = pd.read_csv('/kaggle/input/news-dataset/train.csv')
aya_df = pd.read_csv('/kaggle/input/news-dataset/Symbol_Tuning_Results_Version2.csv')
# Column of aya_df filled by this run; the number after the second underscore is parsed
# by the generation loop as the shot count k.
column_name_to_write = 'predicted_k_50'

In [9]:
# nan_indices = aya_df.index[pd.isna(aya_df[column_name_to_write])].tolist()
# if nan_indices:
#     start_row = nan_indices[0]
#     print(f"The first NaN value is at index: {start_row}")
# else:
#     print(f"No NaN values found in column {column_name_to_write}")

In [10]:
# columns_to_update = ['predicted_k_0', 'predicted_k_1', 'predicted_k_2', 'predicted_k_3',
#                      'predicted_k_4', 'predicted_k_5', 'predicted_k_10', 'predicted_k_20',
#                      'predicted_k_50']
# aya_df[columns_to_update] = aya_df[columns_to_update].applymap(lambda x: np.nan if x in [0, 1] else x)

In [11]:
# Preview the first rows of the training split to check the column layout.
train_df.head(3)

Unnamed: 0,link,title,text,tags,tag,title_tr,text_tr,tags_tr,summary_english,summary_persian
0,https://www.khabaronline.ir/news/1686919/این-و...,این ورزش سیاسی است!,ریحانه اسکندری: علی‌رغم این که سیاست‌مداران جه...,"['مدیریت ورزشی', 'تنیس', 'بسکتبال', 'کاراته', ...",0,"""This is political exercise!""",Rihaneh Eskandari: Despite politicians around ...,"['Sports management', 'tennis', 'basketball', ...",The impact of political decisions on sports is...,تیم ملی بسکتبال سه نفره زنان ایران به دلیل تاخ...
1,https://www.tabnak.ir/fa/news/1164909/جزئیات-ن...,جزئیات نشست غیرعلنی امروز مجلس/آیا اولویت ، مو...,تابناک _ دلهره و اضطراب مردم از نابسامانی بازا...,"['مجلس شورای اسلامی', 'نمایندگان مجلس', 'مجلس ...",0,The details of today's closed session of the p...,Tehran - People's anxiety and distress over th...,"['Islamic Consultative Assembly', 'Members of ...",People's anxiety and distress over the instabi...,نمایندگان مجلس شورای اسلامی با ارسال گزارش است...
2,https://www.hamshahrionline.ir/news/745272/کاه...,کاهش قیمت طلا و سکه در بازار؛ سکه طرح قدیم چند...,به گزارش همشهری آنلاین، امروز سه شنبه ۹ اسفند ...,"['خبر مهم', 'قیمت طلا و فلزات گرانبها - ایران'...",0,Decrease in gold and coin prices in the market...,"According to Hamshahri Online, as of today, We...","['Important news', 'gold and precious metals p...","As of today, Wednesday, 9th of Esfand, up to t...",قیمت هر قطعه سکه تمام بهار آزادی طرح قدیم امرو...


In [12]:
# Preview the results table; NaN cells are predictions that have not been produced yet.
aya_df.head(10)

Unnamed: 0,text,text_type,real_tag,predicted_k_0,predicted_k_1,predicted_k_2,predicted_k_3,predicted_k_4,predicted_k_5,predicted_k_10,predicted_k_20,predicted_k_50
0,واکنش کنسولگری ایران در استانبول به ریجکت شدن ...,only_title,0,47.0,58.0,,,,47.0,,47.0,
1,برگزاری دادگاه پرونده کثیرالشاکی شرکت کاغذی «آ...,only_title,0,58.0,47.0,,,,47.0,,47.0,
2,خبر جدید وزیر بهداشت درباره بازگشایی مدارس در ...,only_title,1,58.0,58.0,,,,58.0,,58.0,
3,شکایت باشگاه استقلال از عیسی آل کثیر,only_title,1,58.0,47.0,,,,47.0,,47.0,
4,دولت فرانسه مسئول عواقب اهانت بی‌شرمانه علیه م...,only_title,0,58.0,58.0,,,,58.0,,58.0,
5,تامین روشنایی کنارگذر نواب,only_title,0,58.0,58.0,,,,58.0,,47.0,
6,شما نظر دهید/ ریشه و پیامدهای خشونت‌های اخیر د...,only_title,0,47.0,47.0,,,,58.0,,47.0,
7,۲۰ استان کشور متاثر از شرایط جوی / رهاسازی ۲۶۸...,only_title,0,58.0,58.0,,,,58.0,,58.0,
8,همه علیه کنعانی زادگان، حتی پرسپولیسی ها!,only_title,0,58.0,47.0,,,,47.0,,47.0,
9,رضا فیاضی به دلیل ابتلا به کرونا بستری شد,only_title,0,58.0,58.0,,,,58.0,,47.0,


In [13]:
# Resume point: index of the first row whose prediction in the target column is still missing.
# When every row is already filled, start past the end so the generation loop does nothing
# (indexing [0] on an empty list would raise IndexError).
pending_rows = aya_df.index[pd.isna(aya_df[column_name_to_write])].tolist()
start_row = pending_rows[0] if pending_rows else len(aya_df)
print(start_row)

0


In [14]:
# NOTE(review): '/content' looks like a Colab working directory; nothing visible in this
# notebook imports from it — confirm it is still needed.
sys.path.append('/content')
MODEL_NAME = "CohereForAI/aya-23-8B"
# Constructing the generator loads the model and tokenizer immediately
# (4-bit quantized, since quantize_4bit defaults to True).
generator = AYA23Generator(MODEL_NAME)

config.json:   0%|          | 0.00/640 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.5M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model and tokenizer loaded successfully.


In [15]:
# Prompt template used when no examples are inserted (k = 0).
with open('/kaggle/input/news-prompt/base_prompt_version2.txt', 'r', encoding='utf-8') as f:
      base_prompt = f.read()

# Prompt template used for k > 0; the generation loop substitutes its SAMPLES_HERE marker.
with open('/kaggle/input/news-prompt/kshot_prompt_version2.txt', 'r', encoding='utf-8') as f:
      kshot_prompt = f.read()

In [16]:
# Directory that receives one prompt<i>.txt file per processed row (written by the generation loop).
os.makedirs('Prompt', exist_ok=True)

In [17]:
# Fill `column_name_to_write` of aya_df row by row, starting at `start_row`.
# For each row a prompt is built (optionally with k retrieved examples), written to
# Prompt/prompt<i>.txt, sent to the model, and the numeric answer is stored in aya_df.

# Loop-invariant settings: the shot count k is the number after the second underscore of
# the target column name (e.g. 'predicted_k_50' -> 50) and selects the prompt template.
k_shot = int(column_name_to_write.split('_')[2])
prompt_fa_kshot = base_prompt if k_shot == 0 else kshot_prompt
checkpoint_path = 'Symbol_Tuning_Results_Version2.csv'

try:
    for i in range(start_row, len(aya_df)):
        # aya_df can be longer than test_df; wrap around to find the matching test row.
        test_df_counter = i % len(test_df)
        print(f"test_df_counter is {test_df_counter}")

        target_text = aya_df['text'][i]
        # Very long inputs are cut down to their first 8000 characters.
        if len(target_text) > 10000:
            target_text = target_text[:8000]

        new_prompt = prompt_fa_kshot.replace("^^body^^", target_text)

        if k_shot != 0:
            similar_texts = get_k_most_similar_texts_by_tfidf(
                train_df,
                test_df['title'][test_df_counter] + '\n' + test_df['text'][test_df_counter],
                k=k_shot,
            )
            # Build the examples block directly instead of going through str.format():
            # the prompt already contains article text at this point, so any '{' or '}' in it
            # would break format(), as would receiving fewer than k examples.
            sample_str = ''.join(
                'متن نمونه: ' + text[0] + ' خروجی: ' + text[1] + '\n\n'
                for text in similar_texts
            )
            new_prompt = new_prompt.replace('SAMPLES_HERE', sample_str)

        # Keep a copy of the exact prompt for later inspection.
        with open(f'Prompt/prompt{i}.txt', 'w', encoding='utf-8') as f:
            f.write(new_prompt)

        raw_answer = generator.generate_responses([new_prompt])[0]
        try:
            result = int(raw_answer)
        except ValueError:
            # The model answered with something that is not a bare integer. Leave the cell
            # empty (NaN) so the row can be retried later instead of aborting the whole run.
            print(f"could not parse answer of row {i}: {raw_answer!r}")
            result = np.nan

        torch.cuda.empty_cache()
        aya_df.at[i, column_name_to_write] = result

        print(f"answer of row {i} is {result} and k is {k_shot}.     Text type: {aya_df['text_type'][i]}  Real tag: {aya_df['real_tag'][i]}")

        # Periodic checkpoint; the run stops deliberately after row 400.
        if i % 20 == 0:
            aya_df.to_csv(checkpoint_path, index=False)
            print(f"dataframe saved to csv file at iteration {i}")
            if i == 400:
                print('Finished.')
                break
finally:
    # Always persist what has been computed so far: covers the rows after the last
    # multiple-of-20 checkpoint as well as an interruption or error in the middle of the run.
    aya_df.to_csv(checkpoint_path, index=False)
    print("dataframe saved to csv file at end of run")

test_df_counter is 0


2024-08-04 11:03:12.018562: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 11:03:12.018715: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 11:03:12.131222: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


answer of row 0 is 47 and k is 50.     Text type: only_title  Real tag: 0
dataframe saved to csv file at iteration 0
test_df_counter is 1
answer of row 1 is 47 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 2
answer of row 2 is 47 and k is 50.     Text type: only_title  Real tag: 1
test_df_counter is 3
answer of row 3 is 47 and k is 50.     Text type: only_title  Real tag: 1
test_df_counter is 4
answer of row 4 is 58 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 5
answer of row 5 is 47 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 6
answer of row 6 is 47 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 7
answer of row 7 is 58 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 8
answer of row 8 is 47 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 9
answer of row 9 is 47 and k is 50.     Text type: only_title  Real tag: 0
test_df_counter is 10
answer