# Распаковать файлы и загрузить данные.

In [5]:
import gzip
import pandas as pd

def load_data_from_gz(gz_path):
    with gzip.open(gz_path, 'rt', encoding='utf-8') as gz_file:
        file_content = gz_file.read().strip()
    
    data = [line.split('\t') for line in file_content.splitlines() if len(line.split('\t')) == 3]
    df = pd.DataFrame(data, columns=['category', 'title', 'content'])
    return df


df = load_data_from_gz(r"D:\Projects\Pycharm_projects\NLP\news.txt.gz")

# Предобработка текстовых данных.

In [8]:
from pymorphy2 import MorphAnalyzer
from gensim.utils import simple_preprocess
import re

morph = MorphAnalyzer()

# 定义一个函数来进行词形还原和去除停用词
def preprocess(text, stopwords=None):
    text_cleaned = re.sub(r'[^а-яА-Я]', ' ', text.lower())
    words = simple_preprocess(text_cleaned, deacc=True)
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]

    if stopwords:
        lemmatized_words = [word for word in lemmatized_words if word not in stopwords]
    return lemmatized_words


russian_stopwords = set(['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 
                         'так', 'его', 'но', 'для', 'около', 'же', 'теперь', 'быть', 'бывать', 'этот', 'вот', 
                         'чем', 'еще', 'мочь', 'тот', 'когда', 'другой', 'первыи', 'ж', 'там', 'себя'])


df['preprocessed_content'] = df['content'].apply(lambda x: preprocess(x, russian_stopwords))

# Обучение модели Word2Vec с использованием предобработанных данных.

In [9]:
from gensim.models import Word2Vec

# 使用预处理后的内容训练Word2Vec模型
model = Word2Vec(sentences=df['preprocessed_content'], vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# Разделение набора данных

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_content'], df['category'], test_size=0.2, random_state=42)

# Векторизация документа.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

raw_texts_train = [' '.join(doc) for doc in X_train]
raw_texts_test = [' '.join(doc) for doc in X_test]

vectorizer = TfidfVectorizer()
tfidf_matrix_train = vectorizer.fit_transform(raw_texts_train)
tfidf_matrix_test = vectorizer.transform(raw_texts_test)

feature_names = vectorizer.get_feature_names_out()

# 创建一个字典，用于快速查找词和其对应的TF-IDF值
def create_tfidf_weight_dict(tfidf_matrix, feature_names):
    docs_num = tfidf_matrix.shape[0]
    tfidf_weights = {}
    for doc_index in range(docs_num):
        feature_index = tfidf_matrix[doc_index, :].nonzero()[1]
        tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[doc_index, x] for x in feature_index])
        tfidf_weights[doc_index] = dict(tfidf_scores)
    return tfidf_weights

tfidf_weights_train = create_tfidf_weight_dict(tfidf_matrix_train, feature_names)
tfidf_weights_test = create_tfidf_weight_dict(tfidf_matrix_test, feature_names)

def doc_vector_tfidf_weighted(doc, model, tfidf_weights_doc, model_vector_size):
    weights = tfidf_weights_doc
    weighted_words = [word for word in doc if word in model.wv and word in weights]
    
    if not weighted_words:
        return np.zeros(model_vector_size)
    
    weighted_vectors = [model.wv[word] * weights.get(word, 0) for word in weighted_words]
    return np.mean(weighted_vectors, axis=0)

X_train_vectors_tfidf = np.array([doc_vector_tfidf_weighted(doc, model, tfidf_weights_train[i], model.vector_size) for i, doc in enumerate(X_train)])
X_test_vectors_tfidf = np.array([doc_vector_tfidf_weighted(doc, model, tfidf_weights_test[i], model.vector_size) for i, doc in enumerate(X_test)])


##  Применение метода SMOTE для увеличения выборки, балансировка данных, оптимизация параметров SVM с использованием кросс-валидации и обучение модели SVM.

In [16]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectors_tfidf, y_train)

# 使用交叉验证优化SVM参数
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5)
grid.fit(X_train_resampled, y_train_resampled)
print("Best parameters found: ", grid.best_params_)

# 步骤 8: 训练优化后的SVM模型
svm_classifier_optimized = grid.best_estimator_
svm_classifier_optimized.fit(X_train_resampled, y_train_resampled)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  24.5s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  18.7s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  13.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  13.7s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  13.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  12.8s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  13.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  12.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  13.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  13.0s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=  13.6s
[CV] END ....................C=0.1, gamma=0.1, 

In [17]:
#Использование случайного леса.使用随机森林模型进行分类
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)


# Оценка модели.

In [18]:
from sklearn.metrics import classification_report
y_pred_svm = svm_classifier_optimized.predict(X_test_vectors_tfidf)
y_pred_rf = rf_classifier.predict(X_test_vectors_tfidf)

print("Classification report for optimized SVM:")
print(classification_report(y_test, y_pred_svm))

print("Classification report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

Classification report for optimized SVM:
              precision    recall  f1-score   support

    business       0.42      0.59      0.49        79
     culture       0.86      0.80      0.83       279
   economics       0.81      0.73      0.77       266
      forces       0.70      0.88      0.78       149
        life       0.73      0.73      0.73       288
       media       0.82      0.74      0.78       299
     science       0.88      0.80      0.84       288
       sport       0.95      0.96      0.96       276
       style       0.84      0.84      0.84        38
      travel       0.41      0.68      0.51        38

    accuracy                           0.79      2000
   macro avg       0.74      0.78      0.75      2000
weighted avg       0.80      0.79      0.79      2000

Classification report for Random Forest:
              precision    recall  f1-score   support

    business       0.44      0.41      0.42        79
     culture       0.84      0.81      0.83       