In [1]:
!pip install transformers
!pip install mecab-python3
!pip install unidic-lite

Collecting mecab-python3
  Downloading mecab_python3-1.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.7/581.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.9
Collecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658818 sha256=6f0c8817e141ee37c162730736f3ade83f89095fc0fc11f8b09dfa4400f9b227
  Stored in directory: /root/.cache/pip/wheels/89/e8/68/f9ac36b8cc6c8b3c96888cd57434abed96595d444f42243853
Successfully built 

In [4]:
!pip install --upgrade torch torchvision torchaudio
!pip install --upgrade transformers

Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m953.5 kB/s[0m eta [36m0:00:00[0m
Collecting torchvision
  Downloading torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
Collecting torchaudio
  Downloading torchaudio-2.3.1-cp310-cp310-manylinux1_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Usi

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.42.4


In [1]:
import pandas as pd
import re
import MeCab

from tqdm import tqdm
from transformers import pipeline
import torch

from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Загрузка файла
file_path = '/content/full_yahoo_news.xlsx'
data = pd.read_excel(file_path)

# Иcпользование MeCab - токенизатор для японского языка
mecab = MeCab.Tagger('-Owakati')

def preprocess(text):
    # Токенизация
    tokens = mecab.parse(text).strip().split()
    text = ' '.join(tokens)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\n', ' ', text)
    return text

# Сохранение токенизированного текста
data['cleaned_text'] = data['text'].apply(preprocess)

# Модель классификатора
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli", device=device)

# Категории: "международные новости", "экономика", "политика", "спорт"
labels = ['国際', '経済', '政治', 'スポーツ']

def classify_with_progress(text):
    return classifier(text, candidate_labels=labels)['labels'][0]

# Шкала прогресса
tqdm.pandas(desc="Classifying")

# Функция для сохранения данных по частям
def save_at_intervals(df, interval, path):
    for start in range(0, len(df), interval):
        end = min(start + interval, len(df))
        df_chunk = df.iloc[start:end].copy()
        df_chunk['label'] = df_chunk['cleaned_text'].progress_apply(classify_with_progress)
        df.loc[start:end-1, 'label'] = df_chunk['label']
        df.to_excel(path, index=False)
        print(f"Saved {end} articles")

# Классификация и сохранение полученных данных каждые 100 статей
save_at_intervals(data, 100, '/content/classified_yahoo_news.xlsx')

# Печать первых 4 статей
print(data[['text', 'cleaned_text', 'label']].head())

# Сохранение всех данных в один файл
data.to_excel('/content/classified_yahoo_news_final.xlsx', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Classifying: 100%|██████████| 100/100 [38:49<00:00, 23.30s/it]


Saved 100 articles


Classifying: 100%|██████████| 100/100 [38:36<00:00, 23.17s/it]


Saved 200 articles


Classifying: 100%|██████████| 100/100 [39:39<00:00, 23.80s/it]


Saved 300 articles


Classifying: 100%|██████████| 100/100 [38:39<00:00, 23.20s/it]


Saved 400 articles


Classifying: 100%|██████████| 100/100 [38:04<00:00, 22.84s/it]


Saved 500 articles


Classifying: 100%|██████████| 100/100 [38:28<00:00, 23.08s/it]


Saved 600 articles


Classifying: 100%|██████████| 100/100 [38:30<00:00, 23.10s/it]


Saved 700 articles


Classifying: 100%|██████████| 100/100 [39:19<00:00, 23.60s/it]


Saved 800 articles


Classifying: 100%|██████████| 100/100 [38:43<00:00, 23.23s/it]


Saved 900 articles


Classifying: 100%|██████████| 100/100 [38:20<00:00, 23.00s/it]


Saved 1000 articles


Classifying: 100%|██████████| 68/68 [26:21<00:00, 23.26s/it]


Saved 1068 articles
                                                text  \
0  ［モスクワ ２４日 ロイター］ - ロシア国防省は、捕虜のウクライナ兵士６５人を乗せた軍輸送...   
1  弾道ミサイル攻撃に備え、東京都が都営地下鉄・麻布十番駅の構内に住民らが避難・滞在できる「地下...   
2   3月18日に開幕するセンバツ高校野球の出場校が26日、発表されました。2024年から、東海...   
3  （週刊大阪日日新聞 論説委員 畑山博史）\n\n パレスチナ自治区ガザ地区を統治する「ハマス...   
4  ロシアによるウクライナ侵略で、露国防省は２９日、露軍がウクライナ東部ハリコフ州の集落タバエフ...   

                                        cleaned_text label  
0   モスクワ ２４ 日 ロイター   ロシア 国防 省 は  捕虜 の ウクライナ 兵士 ６５...    政治  
1  弾道 ミサイル 攻撃 に 備え  東京 都 が 都営 地下 鉄  麻布十番 駅 の 構内 に...  スポーツ  
2  3 月 18 日 に 開幕 する センバツ 高校 野球 の 出場 校 が 26 日  発表 ...  スポーツ  
3   週刊 大阪 日日 新聞 論説 委員 畑山 博史  パレスチナ 自治 区 ガザ 地区 を 統...    政治  
4  ロシア に よる ウクライナ 侵略 で  露 国防 省 は ２９ 日  露軍 が ウクライナ...    政治  


In [3]:
# Загрузка файла с классификацией модели и классификацией, выполненной вручную (столбец 'human_label')
df = pd.read_excel('/content/classified_yahoo_news_model and human labelled.xlsx')

# Кодировка категорий
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
df['human_label_encoded'] = label_encoder.transform(df['human_label'])

# Оценка работы модели - сравнение оценок модели и оценок, сделанных человеком
print("Classification Report:\n")
print(classification_report(df['human_label_encoded'], df['label_encoded'], target_names=label_encoder.classes_, zero_division=0))

Classification Report:

              precision    recall  f1-score   support

        スポーツ       0.80      0.76      0.78       290
          国際       0.72      0.38      0.50       574
          政治       0.19      0.57      0.29       122
          経済       0.52      0.80      0.63        82

    accuracy                           0.54      1068
   macro avg       0.56      0.63      0.55      1068
weighted avg       0.66      0.54      0.56      1068

