# Задачи категоризации текста

## 1. Загрузка и предварительная обработка данных

In [1]:
import gzip
import pandas as pd
import re
import spacy
import pickle  # 用于保存和加载DataFrame

# 加载并解压数据
def load_data_from_gz(gz_path):
    with gzip.open(gz_path, 'rt', encoding='utf-8') as gz_file:
        file_content = gz_file.read().strip()
    data = [line.split('\t') for line in file_content.splitlines() if len(line.split('\t')) == 3]
    df = pd.DataFrame(data, columns=['category', 'title', 'content'])
    return df

df = load_data_from_gz(r"D:\MyProject\SPBU Course\NLP\nlp_task_2\news.txt.gz")

# 加载spaCy的俄语模型
nlp = spacy.load("ru_core_news_sm")

# 预处理函数：词形还原和去除停用词
def preprocess(text, stopwords=None):
    text_cleaned = re.sub(r'[^а-яА-Я]', ' ', text.lower())
    doc = nlp(text_cleaned)
    lemmatized_words = [
        token.lemma_ for token in doc 
        if not token.is_stop and not token.is_punct
    ]
    if stopwords:
        lemmatized_words = [word for word in lemmatized_words if word not in stopwords]
    return lemmatized_words

russian_stopwords = set(['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она',
                         'так', 'его', 'но', 'для', 'около', 'же', 'теперь', 'быть', 'бывать', 'этот', 'вот',
                         'чем', 'еще', 'мочь', 'тот', 'когда', 'другой', 'первыи', 'ж', 'там', 'себя'])

df['preprocessed_content'] = df['content'].apply(lambda x: preprocess(x, russian_stopwords))

# 保存预处理后的DataFrame到本地
with open('preprocessed_df.pkl', 'wb') as f:
    pickle.dump(df, f)

# 查看结果
print(df[['content', 'preprocessed_content']].head())

                                             content  \
0  Парусная гонка Giraglia Rolex Cup пройдет в Ср...   
1  Шведский хоккеист Матс Сундин назначен советни...   
2  Гран-при конкурса "Брэнд года/EFFIE" получил г...   
3  Цена американской нефти WTI на лондонской бирж...   
4  Сбербанк выставил на продажу долги по 21,4 тыс...   

                                preprocessed_content  
0  [парусный, гонка,                    , пройти,...  
1  [шведский, хоккеист, матс, сундин, назначить, ...  
2  [гран, конкурса,  , брэнд, год,        , получ...  
3  [цена, американский, нефть,     , лондонский, ...  
4  [сбербанк, выставить, продажа, долг,      , ты...  


## 2. Обучение модели векторного представления слов

In [2]:
import pickle
from gensim.models import Word2Vec

# 加载预处理后的DataFrame
with open('preprocessed_df.pkl', 'rb') as f:
    df = pickle.load(f)

# 使用预处理后的内容训练Word2Vec模型
model = Word2Vec(sentences=df['preprocessed_content'], vector_size=100, window=5, min_count=1, workers=4)

# 保存训练好的Word2Vec模型到本地
model.save("word2vec.model")

## 3. Сегментация набора данных

In [3]:
import pickle
from sklearn.model_selection import train_test_split

# 加载预处理后的DataFrame
with open('preprocessed_df.pkl', 'rb') as f:
    df = pickle.load(f)

# 分割数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_content'], df['category'], test_size=0.2, random_state=42
)

## 4. Векторизация документов

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

raw_texts_train = [' '.join(doc) for doc in X_train]
raw_texts_test = [' '.join(doc) for doc in X_test]

vectorizer = TfidfVectorizer()
tfidf_matrix_train = vectorizer.fit_transform(raw_texts_train)
tfidf_matrix_test = vectorizer.transform(raw_texts_test)

feature_names = vectorizer.get_feature_names_out()

# 创建一个字典，用于快速查找词和其对应的TF-IDF值
def create_tfidf_weight_dict(tfidf_matrix, feature_names):
    docs_num = tfidf_matrix.shape[0]
    tfidf_weights = {}
    for doc_index in range(docs_num):
        feature_index = tfidf_matrix[doc_index, :].nonzero()[1]
        tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[doc_index, x] for x in feature_index])
        tfidf_weights[doc_index] = dict(tfidf_scores)
    return tfidf_weights

tfidf_weights_train = create_tfidf_weight_dict(tfidf_matrix_train, feature_names)
tfidf_weights_test = create_tfidf_weight_dict(tfidf_matrix_test, feature_names)

def doc_vector_tfidf_weighted(doc, model, tfidf_weights_doc, model_vector_size):
    weights = tfidf_weights_doc
    weighted_words = [word for word in doc if word in model.wv and word in weights]
    
    if not weighted_words:
        return np.zeros(model_vector_size)
    
    weighted_vectors = [model.wv[word] * weights.get(word, 0) for word in weighted_words]
    return np.mean(weighted_vectors, axis=0)

X_train_vectors_tfidf = np.array([doc_vector_tfidf_weighted(doc, model, tfidf_weights_train[i], model.vector_size) for i, doc in enumerate(X_train)])
X_test_vectors_tfidf = np.array([doc_vector_tfidf_weighted(doc, model, tfidf_weights_test[i], model.vector_size) for i, doc in enumerate(X_test)])

## 5. Алгоритм классификации - SVM

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectors_tfidf, y_train)

# 使用交叉验证优化SVM参数
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5)
grid.fit(X_train_resampled, y_train_resampled)
print("Best parameters found: ", grid.best_params_)

svm_classifier_optimized = grid.best_estimator_
svm_classifier_optimized.fit(X_train_resampled, y_train_resampled)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   2.2s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   2.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   2.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   2.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   2.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.2s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   2.3s
[CV] END ....................C=0.1, gamma=0.1, 

## 6. Альтернативные методы - предложение альтернативного подхода

Простое усреднение векторов слов обычно не является хорошим отображением отношений между документами и векторами. Здесь мы пытаемся использовать подход взвешенного среднего, где веса определяются значениями TF-IDF.
Эта альтернатива уже была реализована ранее, а именно функция `doc_vector_tfidf_weighted`.

## 7. Переобучение классификатора с помощью альтернативных методов

In [6]:
from sklearn.metrics import classification_report

y_pred_svm = svm_classifier_optimized.predict(X_test_vectors_tfidf)
print("Classification report for optimized SVM using TF-IDF weighted average:")
print(classification_report(y_test, y_pred_svm))

Classification report for optimized SVM using TF-IDF weighted average:
              precision    recall  f1-score   support

    business       0.38      0.57      0.46        79
     culture       0.88      0.82      0.85       279
   economics       0.78      0.72      0.75       266
      forces       0.71      0.87      0.78       149
        life       0.72      0.72      0.72       288
       media       0.81      0.74      0.77       299
     science       0.87      0.80      0.83       288
       sport       0.94      0.95      0.94       276
       style       0.83      0.79      0.81        38
      travel       0.50      0.68      0.58        38

    accuracy                           0.79      2000
   macro avg       0.74      0.77      0.75      2000
weighted avg       0.80      0.79      0.79      2000



### Для сравнения мы также можем использовать классификатор случайного леса

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_rf = rf_classifier.predict(X_test_vectors_tfidf)
print("Classification report for Random Forest using TF-IDF weighted average:")
print(classification_report(y_test, y_pred_rf))

Classification report for Random Forest using TF-IDF weighted average:
              precision    recall  f1-score   support

    business       0.49      0.42      0.45        79
     culture       0.84      0.81      0.83       279
   economics       0.74      0.80      0.77       266
      forces       0.68      0.82      0.74       149
        life       0.71      0.74      0.72       288
       media       0.77      0.71      0.74       299
     science       0.87      0.77      0.81       288
       sport       0.93      0.97      0.95       276
       style       0.83      0.79      0.81        38
      travel       0.54      0.55      0.55        38

    accuracy                           0.78      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.78      0.78      0.78      2000

