In [1]:
import os
import pandas as pd
import chardet
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Путь к папке с данными
data_path = r"C:\Users\epg_F\Homework_6\bbc"

# Список категорий (названия папок)
categories = ["business", "entertainment", "politics", "sport", "tech"]

# Создаем списки для хранения данных
texts = []
labels = []

# Функция для определения кодировки
def detect_encoding(file_path):
    with open(file_path, "rb") as f:
        result = chardet.detect(f.read())
    return result["encoding"]

# Читаем файлы из каждой категории
for category in categories:
    category_path = os.path.join(data_path, category)
    for filename in os.listdir(category_path):
        file_path = os.path.join(category_path, filename)
        encoding = detect_encoding(file_path)  # Определяем кодировку
        with open(file_path, "r", encoding=encoding, errors="replace") as file:
            texts.append(file.read())
            labels.append(category)

# Создаем DataFrame
df = pd.DataFrame({"text": texts, "label": labels})

# Разделение на train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# Функция предобработки текста
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Удаляем все, кроме букв и пробелов
    text = text.lower().strip()  # Приводим к нижнему регистру
    return text

# Применяем предобработку
train_texts = train_texts.apply(preprocess_text)
test_texts = test_texts.apply(preprocess_text)

# Создание и обучение SVM-модели с TF-IDF
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Преобразуем текст в векторы
    ('clf', SVC(probability=True))  # Классифицируем SVM
])

pipeline.fit(train_texts, train_labels)  # Обучаем модель
predictions = pipeline.predict(test_texts)  # Делаем предсказания

# Выводим метрики качества
print(classification_report(test_labels, predictions))


               precision    recall  f1-score   support

     business       0.99      0.98      0.99       101
entertainment       1.00      0.97      0.99        77
     politics       0.98      0.98      0.98        84
        sport       1.00      1.00      1.00       103
         tech       0.96      1.00      0.98        80

     accuracy                           0.99       445
    macro avg       0.99      0.99      0.99       445
 weighted avg       0.99      0.99      0.99       445



In [3]:
import os
import pandas as pd
import chardet

# Путь к папке с данными
data_path = r"C:\Users\epg_F\Homework_6\bbc"

# Список категорий (названия папок)
categories = ["business", "entertainment", "politics", "sport", "tech"]

# Создаем списки для хранения данных
texts = []
labels = []

# Функция для определения кодировки
def detect_encoding(file_path):
    with open(file_path, "rb") as f:
        result = chardet.detect(f.read())
    return result["encoding"]

# Читаем файлы из каждой категории
for category in categories:
    category_path = os.path.join(data_path, category)
    for filename in os.listdir(category_path):
        file_path = os.path.join(category_path, filename)
        encoding = detect_encoding(file_path)  # Определяем кодировку
        with open(file_path, "r", encoding=encoding, errors="replace") as file:
            texts.append(file.read())
            labels.append(category)

# Создаем DataFrame
df = pd.DataFrame({"text": texts, "label": labels})

# Проверяем загруженные данные
print(df.head())



                                                text     label
0  Ad sales boost Time Warner profit\n\nQuarterly...  business
1  Dollar gains on Greenspan speech\n\nThe dollar...  business
2  Yukos unit buyer faces loan claim\n\nThe owner...  business
3  High fuel prices hit BA's profits\n\nBritish A...  business
4  Pernod takeover talk lifts Domecq\n\nShares in...  business


In [5]:
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

In [7]:
import re

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Удаляем все, кроме букв и пробелов
    text = text.lower().strip()  # Приводим к нижнему регистру
    return text

train_texts = train_texts.apply(preprocess_text)
test_texts = test_texts.apply(preprocess_text)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Преобразуем текст в векторы
    ('clf', SVC(probability=True))  # Классифицируем SVM
])

pipeline.fit(train_texts, train_labels)  # Обучаем модель
predictions = pipeline.predict(test_texts)  # Делаем предсказания

print(classification_report(test_labels, predictions))  # Выводим метрики


               precision    recall  f1-score   support

     business       0.99      0.98      0.99       101
entertainment       1.00      0.97      0.99        77
     politics       0.98      0.98      0.98        84
        sport       1.00      1.00      1.00       103
         tech       0.96      1.00      0.98        80

     accuracy                           0.99       445
    macro avg       0.99      0.99      0.99       445
 weighted avg       0.99      0.99      0.99       445



In [11]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Проверяем доступность GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Кодируем лейблы в числовые значения
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Загружаем токенизатор
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Функция токенизации
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Создаем датасеты Hugging Face
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels_encoded})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels_encoded})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Загружаем модель DistilBERT
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(categories)).to(device)

# Аргументы для тренировки
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs= 0.1,  ####################### иначе работает 5 часов, было 3
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.05,   ################# иначе работает 5 часов. было 0,01
    logging_steps=10,
    load_best_model_at_end=True
)

# Trainer API Hugging Face
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Обучаем модель
trainer.train()

# Делаем предсказания
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
pred_labels_text = label_encoder.inverse_transform(pred_labels)

# Выводим метрики
print(classification_report(test_labels, pred_labels_text))


Using device: cpu


Map:   0%|          | 0/1776 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
0,1.6055,1.594041


               precision    recall  f1-score   support

     business       0.38      0.33      0.35       101
entertainment       0.06      0.04      0.05        77
     politics       0.48      0.76      0.59        84
        sport       0.23      0.39      0.29       103
         tech       0.00      0.00      0.00        80

     accuracy                           0.31       445
    macro avg       0.23      0.30      0.25       445
 weighted avg       0.24      0.31      0.27       445



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib

# Заново создаем и обучаем векторизатор
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_texts)

# Создаем и обучаем SVM
svm_model = SVC(kernel="linear")
svm_model.fit(X_train_tfidf, train_labels)

# Сохраняем обученную модель и векторизатор
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(svm_model, "svm_model.pkl")

print("Обучение завершено, модель и векторизатор сохранены!")


Обучение завершено, модель и векторизатор сохранены!


In [54]:
import joblib

# Сохранение модели SVM и векторизатора
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(svm_model, "svm_model.pkl")
print("SVM модель и TF-IDF векторизатор сохранены.")


SVM модель и TF-IDF векторизатор сохранены.


In [56]:
import os

if os.path.exists("tfidf_vectorizer.pkl") and os.path.exists("svm_model.pkl"):
    vectorizer = joblib.load("tfidf_vectorizer.pkl")
    svm_model = joblib.load("svm_model.pkl")
    print("SVM модель и TF-IDF векторизатор загружены.")
else:
    print("Ошибка: файлы моделей не найдены! Проверь, что они сохранены.")


SVM модель и TF-IDF векторизатор загружены.


In [58]:
torch.save(model.state_dict(), "distilbert_bbc.pth")
print("Модель DistilBERT сохранена!")

Модель DistilBERT сохранена!


In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib

# Обучаем заново (если файлы потеряны)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_texts)
svm_model = SVC()
svm_model.fit(X_train_tfidf, train_labels)

# Сохраняем
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(svm_model, "svm_model.pkl")

print("SVM и TF-IDF векторизатор сохранены!")




SVM и TF-IDF векторизатор сохранены!


In [91]:
def get_bbc_news():
    options = Options()
    options.add_argument("--headless")  # Запуск без интерфейса браузера
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    # Инициализация драйвера
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get("https://www.bbc.com/news")

        # Ждем появления заголовков (например, в div с классом "gs-c-promo-heading__title")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".gs-c-promo-heading__title"))
        )

        # Парсим контент после загрузки
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Поиск заголовков
        headlines = [h.text.strip() for h in soup.find_all("a", class_="gs-c-promo-heading__title") if h.text.strip()]
        return headlines[:10]  # Берем 10 заголовков

    finally:
        driver.quit()  # Закрываем браузер

# Проверяем
bbc_news = get_bbc_news()
print("BBC News Headlines:", bbc_news)


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x002BB5A3+24387]
	(No symbol) [0x00245904]
	(No symbol) [0x00120753]
	(No symbol) [0x00168BA9]
	(No symbol) [0x00168EFB]
	(No symbol) [0x001B19C2]
	(No symbol) [0x0018D894]
	(No symbol) [0x001AF138]
	(No symbol) [0x0018D646]
	(No symbol) [0x0015C59F]
	(No symbol) [0x0015D8E4]
	GetHandleVerifier [0x005BD883+3179043]
	GetHandleVerifier [0x005D6CF9+3282585]
	GetHandleVerifier [0x005D167C+3260444]
	GetHandleVerifier [0x00354330+650448]
	(No symbol) [0x0024ED0D]
	(No symbol) [0x0024BAF8]
	(No symbol) [0x0024BC99]
	(No symbol) [0x0023E530]
	BaseThreadInitThunk [0x756FFCC9+25]
	RtlGetAppContainerNamedObjectPath [0x778D82AE+286]
	RtlGetAppContainerNamedObjectPath [0x778D827E+238]
	(No symbol) [0x00000000]


In [81]:
import requests
from bs4 import BeautifulSoup
import joblib
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import numpy as np

# Загрузка моделей
vectorizer = joblib.load("tfidf_vectorizer.pkl")
svm_model = joblib.load("svm_model.pkl")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5).to(device)
bert_model.load_state_dict(torch.load("distilbert_bbc.pth", map_location=device))
bert_model.eval()

# Функция парсинга заголовков с BBC
def get_bbc_news():
    url = "https://www.bbc.com/news"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    headlines = [h.text.strip() for h in soup.find_all("h3")][:10]  # Берем 10 заголовков
    return headlines

# Функция предсказания SVM
def predict_svm(texts):
    vectors = vectorizer.transform(texts)
    predictions = svm_model.predict(vectors)
    return predictions

# Функция предсказания DistilBERT
def predict_bert(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()
    return predictions

# Классы новостей
categories = ["business", "entertainment", "politics", "sport", "tech"]

# Запуск предсказаний
 

bbc_news = get_bbc_news()
print("\nЗаголовки BBC:", bbc_news)  # Проверяем, загружены ли новости

if not bbc_news:
    raise ValueError("Ошибка: не удалось получить заголовки новостей! Проверь разметку сайта.")


svm_preds = predict_svm(bbc_news)
bert_preds = predict_bert(bbc_news)

# Вывод результатов
print("\n--- BBC News Classification ---")
for i, text in enumerate(bbc_news):
    print(f"\nNews: {text}")
    print(f"SVM Prediction: {categories[svm_preds[i]]}")
    print(f"BERT Prediction: {categories[bert_preds[i]]}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModuleNotFoundError: No module named 'selenium'

In [63]:
import joblib

# Сохранение TF-IDF векторизатора
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Сохранение SVM модели
joblib.dump(svm_model, "svm_model.pkl")

['svm_model.pkl']

In [65]:
import os
print(os.listdir())  # Покажет файлы в текущей директории


['.ipynb_checkpoints', '0.26.0', 'bbc', 'bbc.zip', 'distilbert_bbc.pth', 'HomeWork.pptx', 'logs', 'results', 'svm_model.pkl', 'tfidf_vectorizer.pkl', 'Untitled.ipynb', 'Untitled1.ipynb', 'Untitled2.ipynb']


In [67]:
import os
print(os.getcwd())  # Покажет текущую рабочую папку

C:\Users\epg_F\Homework_6


In [69]:
import joblib
import torch
import numpy as np
import requests
from bs4 import BeautifulSoup
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn.functional as F

# Загрузка SVM модели и TF-IDF векторизатора
vectorizer = joblib.load("tfidf_vectorizer.pkl")
svm_model = joblib.load("svm_model.pkl")

# Загрузка DistilBERT модели и токенизатора
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = DistilBertForSequenceClassification.from_pretrained("./distilbert_model").to(device)

def predict_svm(text):
    """Классификация новости с помощью SVM."""
    text_vectorized = vectorizer.transform([text])
    prediction = svm_model.predict(text_vectorized)[0]
    probabilities = svm_model.decision_function(text_vectorized)
    probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities))  # Softmax-вероятности
    return prediction, probabilities

def predict_distilbert(text):
    """Классификация новости с помощью DistilBERT."""
    inputs = distilbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    prediction = np.argmax(probabilities)
    return prediction, probabilities

def scrape_bbc_news():
    """Собираем свежие новости с BBC."""
    categories = {
        "business": "https://www.bbc.com/news/business",
        "entertainment": "https://www.bbc.com/news/entertainment_and_arts",
        "politics": "https://www.bbc.com/news/politics",
        "sport": "https://www.bbc.com/sport",
        "tech": "https://www.bbc.com/news/technology"
    }
    news_data = []
    
    for category, url in categories.items():
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all("a", class_="gs-c-promo-heading")[:5]  # Берем 5 новостей
        
        for article in articles:
            title = article.text.strip()
            link = "https://www.bbc.com" + article["href"]
            news_data.append((category, title, link))
    
    return news_data

# Скрапим новости
bbc_news = scrape_bbc_news()

# Прогоняем через пайплайны
for category, title, link in bbc_news:
    svm_pred, svm_prob = predict_svm(title)
    bert_pred, bert_prob = predict_distilbert(title)
    print(f"Category: {category}")
    print(f"Title: {title}")
    print(f"Link: {link}")
    print(f"SVM Prediction: {svm_pred}, Probabilities: {svm_prob}")
    print(f"DistilBERT Prediction: {bert_pred}, Probabilities: {bert_prob}")
    print("-" * 80)

# Пример использования
text = "The government has announced a new economic policy to boost business."
print("SVM Prediction:", predict_svm(text))
print("DistilBERT Prediction:", predict_distilbert(text))


OSError: Incorrect path_or_model_id: './distilbert_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

### 2я часть


In [None]:
!pip install --no-cache-dir gensim


In [118]:
import os
import re
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from collections import defaultdict

# Загружаем английскую модель SpaCy для лемматизации
nlp = spacy.load("en_core_web_sm")

# Путь к данным
bbc_path = r"C:\Users\epg_F\Homework_6\bbc"

# Функция для загрузки и предобработки текстов
def load_and_preprocess_data(path):
    data = []
    labels = []
    for category in os.listdir(path):
        category_path = os.path.join(path, category)
        if os.path.isdir(category_path):
            for file_name in os.listdir(category_path):
                file_path = os.path.join(category_path, file_name)
                with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                    text = file.read().lower()
                    text = re.sub(r'[^a-z\s]', '', text)  # Удаление пунктуации
                    doc = nlp(text)
                    lemmatized_text = " ".join([token.lemma_ for token in doc if not token.is_stop])
                    data.append(lemmatized_text)
                    labels.append(category)
    return pd.DataFrame({"text": data, "category": labels})

# Загружаем данные
df = load_and_preprocess_data(bbc_path)

# Функция для построения тематической модели LDA
def train_lda(df, category, num_topics=5):
    category_texts = df[df['category'] == category]['text'].tolist()
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(category_texts)
    
    # Преобразуем в формат gensim
    dictionary = corpora.Dictionary([text.split() for text in category_texts])
    corpus = [dictionary.doc2bow(text.split()) for text in category_texts]
    
    # Обучаем LDA
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    
    return lda_model, corpus, dictionary

# Визуализация wordcloud для тем
def plot_wordcloud(lda_model, dictionary, num_topics):
    for i in range(num_topics):
        words = lda_model.show_topic(i, 20)
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(dict(words))
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Тема {i+1}")
        plt.show()

# Визуализация распределения тем
def visualize_lda(lda_model, corpus, dictionary):
    vis = gensimvis.prepare(lda_model, corpus, dictionary)
    pyLDAvis.display(vis)

# Запуск тематического моделирования для каждой категории
categories = df['category'].unique()
num_topics = 5  # Количество тем для каждой категории

for category in categories:
    print(f"Обрабатываем категорию: {category}")
    lda_model, corpus, dictionary = train_lda(df, category, num_topics)
    plot_wordcloud(lda_model, dictionary, num_topics)
    visualize_lda(lda_model, corpus, dictionary)

RuntimeError: Compiled extensions are unavailable. If you've installed from a package, ask the package maintainer to include compiled extensions. If you're building Gensim from source yourself, install Cython and a C compiler, and then run `python setup.py build_ext --inplace` to retry. 