In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# ჩამოვტვირთოთ ჩემი არჩეული დატასეტი
dataset = load_dataset('valurank/News_Articles_Categorization')

# გავყოთ დატასეტი სატრენინგო და ვალიდაციის დატასეტებად
train_data, val_data = train_test_split(dataset['train'], test_size=0.2, random_state=42)

Downloading readme:   0%|          | 0.00/857 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3722 [00:00<?, ? examples/s]

In [2]:
# ვნახულობ როგორ გამოიყურება
print("Category:", train_data["Category"][0])
print("Text:", train_data["Text"][0][:200])

print("\n --- All Categories --- \n", list(set(train_data["Category"])))

Category: Tech
Text: Credit...Jason Henry for The New York TimesJune 28, 2018SAN FRANCISCO California has passed a digital privacy law granting consumers more control over and insight into the spread of their personal inf

 --- All Categories --- 
 ['Business', 'Tech', 'Entertainment', 'Health', 'science', 'Sports', 'Politics', 'World']


In [3]:
# დავალების ამოცანის label-ების მაპინგი
LABELS = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sports': 3,
    'technology': 4,
    'undefined': 5,
}

# დატასეტში არსებული label-ების მაპინგი ამ დავალების ამოცანის label-ებზე.
LABEL_MAP = {
    'Business': LABELS["business"],
    "science": LABELS["technology"],
    'Sports': LABELS["sports"],
    "Health": LABELS["undefined"],
    "Entertainment": LABELS["entertainment"],
    "Tech": LABELS["technology"],
    "World": LABELS["politics"],
    "Politics": LABELS["politics"],
}


In [4]:
import pandas as pd

# პანდას დატაფრეიმებს ვიყენებ რომ უკეთესად დავინახო მონაცემები
train_data = pd.DataFrame.from_dict(train_data)
val_data = pd.DataFrame.from_dict(val_data)

In [5]:
print(train_data)
print(val_data)

# იმის მაგივრად რომ კატეგორიაში ჰქონდეს სტრინგი კატეგორიის, ამას ვანაცვლებ 0-4-ის ჩათვლით შესაბამისი რიცხვით
train_data["Category"] = train_data["Category"].map(LABEL_MAP)
val_data["Category"] = val_data["Category"].map(LABEL_MAP)

# წავიკითხოთ რომ ნაღდად თუ შეიცვალა
print(train_data)
print(val_data)


                                                   Text  Category
0     Credit...Jason Henry for The New York TimesJun...      Tech
1     Science Times at 40Many women in science thoug...   science
2     Researchers compared 10 cases in Italy with pr...    Health
3     Credit...Walt Disney Studios Motion PicturesDe...  Business
4     Credit...Miguel Medina/Agence France-Presse Ge...     World
...                                                 ...       ...
2972  RoundupFeb. 8, 2014Melvin Ejim scored a Big 12...    Sports
2973  Credit...Drew Angerer for The New York TimesDe...  Business
2974  Dec. 14, 2015Credit...Fred R. Conrad/The New Y...  Business
2975  Behind the Masks, a Mystery: How Often Do the ...    Health
2976  Politics|Prosecutors mull charges for theft of...  Politics

[2977 rows x 2 columns]
                                                  Text       Category
0    Roddy Piper Would Be 'Damn Proud' of Rousey .....  Entertainment
1    Jen Selter Drink it All In ... Not Jus

In [6]:
print(len(train_data))
print(len(val_data))

# ამით ვყრი ისეთ მონაცემებს რომლებსაც ვერ ვუსაბამებ ამ დავალებაში რომელიმე კატეგორიას
# ასეთი კატეგორიაა Health.
# კი, ეს წესით უნდა იყოს Science/Technology კატეგორიაში, მაგრამ რომ გადავხედე მონაცემებს მთლად არ იყო ასე და ზოგი არტიკლი არ ჯდებოდა ამ კატეგორიაში
train_data = train_data.drop(train_data[train_data["Category"] == LABELS["undefined"]].index)
val_data = val_data.drop(val_data[val_data["Category"] == LABELS["undefined"]].index)

# Preview the dataset
print(len(train_data))
print(len(val_data))

2977
745
2612
643


In [7]:
import pandas as pd
from transformers import AutoTokenizer
import torch

# გადავწყვიტე რომ გამოვიყენო bert მოდელი
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# აქ ვუკეთებ ტოკენიზირებას ჩემს დატას
def tokenize_data(df, tokenizer):
    return tokenizer(df['Text'].tolist(), padding=True, truncation=True, return_tensors='pt')

train_encodings = tokenize_data(train_data, tokenizer)
val_encodings = tokenize_data(val_data, tokenizer)

# ვაქცევ ტენზორებად
train_labels = torch.tensor(train_data['Category'].values)
val_labels = torch.tensor(val_data['Category'].values)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# შევამოწმოთ რომ ნამდვილად სწორად გავაკეთე ზედა ნაწილი
print(len(train_labels))

2612


In [9]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# აქ ვქმნი უკვე დატასეტს ენკოდინგებით და ლეიბელებით
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

print(train_dataset)
print(val_dataset)

<__main__.NewsDataset object at 0x7c5534cfa3b0>
<__main__.NewsDataset object at 0x7c5534cfa410>


In [10]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb

# wandb ლოგირებას ვრთავ
wandb.init(project="news-classification")

2024-06-30 12:43:07.886237: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 12:43:07.886369: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 12:43:08.050380: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ·································


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 33
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [11]:
# მოდელად, როგორც წინაზე ვთქვი ვიყენებ bert-ს
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# აქ დასატრენინგებელ ჰიპერპარამეტრებს ვსეტავ, უბრალოდ რანდომად ავიღე ყველაზე ნორმალური რაც მომაფიქრდა.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="wandb",
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [13]:
# გავუშვათ დატრენინგება : )
trainer.train()



Step,Training Loss
10,1.7109
20,1.6813
30,1.6475
40,1.5158
50,1.4257
60,1.3438
70,1.205
80,1.0943
90,0.9741
100,0.8596


TrainOutput(global_step=246, training_loss=0.7172756786268901, metrics={'train_runtime': 397.3353, 'train_samples_per_second': 19.721, 'train_steps_per_second': 0.619, 'total_flos': 2061793764347904.0, 'train_loss': 0.7172756786268901, 'epoch': 3.0})

In [14]:
model.save_pretrained("./news_classification_model")
tokenizer.save_pretrained("./news_classification_tokenizer")

('./news_classification_tokenizer/tokenizer_config.json',
 './news_classification_tokenizer/special_tokens_map.json',
 './news_classification_tokenizer/vocab.txt',
 './news_classification_tokenizer/added_tokens.json',
 './news_classification_tokenizer/tokenizer.json')

In [15]:
გავუშვათ ევალუაცია და დავლოგოთ ეს wandb-ში
eval_results = trainer.evaluate()
print(eval_results)

wandb.log(eval_results)

{'eval_loss': 0.09831658750772476, 'eval_runtime': 9.9862, 'eval_samples_per_second': 64.389, 'eval_steps_per_second': 0.601, 'epoch': 3.0}


### EVALUATE MODEL ON RANDOM NEWS ARTICLE I GOT FROM THE INTERNET

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model
model_path = "./news_classification_model"
tokenizer_path = "./news_classification_tokenizer"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model_eval = AutoModelForSequenceClassification.from_pretrained(model_path)

model_eval.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
def predict_category(article, tokenizer, model):
    inputs = tokenizer(article, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    category_map = {
        0: 'Business',
        1: 'Entertainment',
        2: 'Politics',
        3: 'Sports',
        4: 'Technology'
    }
    
    return category_map[predicted_class]

In [10]:
# რანდომული არტიკლი რაც გადმოვაკოპირე ინტერნეტიდან, რამდენიმე ვარიანტზე ვქენი და ხელით გატესტვით ნორმალური ჩანს ეს მოდელი
article = """India’s Ganges River shifted abruptly due to a distant yet massive earthquake around 2,500 years ago, new geologic evidence suggests.
Such changes have been observed in other rivers in recent times but only far upstream. This ancient Ganges shift occurred in the delta, about 200 kilometers from where the river empties into the Bay of Bengal. Flooding from similar shifts of other rivers today could threaten hundreds of millions of people who live in the modern cities built on river deltas worldwide.
As rivers chew their way across the landscape, they naturally wander — especially in their relatively flat deltas, where sediments can pile up and divert the river one way or another (SN: 4/1/14). Course changes can unfold gradually over time spans ranging from years to centuries, says Elizabeth Chamberlain, a geophysicist at Wageningen University & Research in the Netherlands. But channel-jumping triggered by an earthquake can occur in weeks or days, she says.
"""

# უნდა დაბეჭდოს "Technology"

# წინასწარმეტყველება
predicted_category = predict_category(article, tokenizer, model_eval)
print(f"The predicted category for the article is: {predicted_category}")

The predicted category for the article is: Technology


In [11]:
# დაზიპვა ყველაფრის working directory-ში

import shutil

shutil.make_archive("submission", 'zip', root_dir=".", base_dir=".")

'/kaggle/working/submission.zip'