In [16]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [17]:
RANDOM_STATE = 42

In [18]:
df = pd.read_csv('./lenta_df.csv')

In [19]:
nltk.download('stopwords')
nltk.download('punkt_tab')
russian_stopwords = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gulfik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Gulfik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
def preprocess_text(text):
    tokens = word_tokenize(text, language='russian')
    filtered_tokens = [word for word in tokens if word.isalnum() and word.lower() not in russian_stopwords]
    return ' '.join(filtered_tokens)


df['text'] = df['text'].dropna().apply(preprocess_text)
cleaned_df = df.copy()
cleaned_df.shape

(36000, 3)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cleaned_df['text'], cleaned_df['bloc'], test_size=0.2,
                                                    random_state=RANDOM_STATE)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model_lg = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=RANDOM_STATE)
model_lg.fit(X_train_tfidf, y_train)
model_lg_pred = model_lg.predict(X_test_tfidf)

In [24]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(X_train_tfidf, y_train)
model_nb_pred = model_nb.predict(X_test_tfidf)

In [25]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=200, max_depth=15)
model_rf.fit(X_train_tfidf, y_train)
model_rf_pred = model_rf.predict(X_test_tfidf)

In [29]:
from sklearn.svm import LinearSVC

model_svm = LinearSVC(random_state=RANDOM_STATE, C=1)
model_svm.fit(X_train_tfidf, y_train)
model_svm_pred = model_svm.predict(X_test_tfidf)

In [30]:
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=10)
model_tree.fit(X_train_tfidf, y_train)
model_tree_pred = model_tree.predict(X_test_tfidf)

In [31]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, eval_metric='mlogloss', n_estimators=200,
                      max_depth=10)
model_xgb.fit(X_train_tfidf, y_train)
model_xgb_pred = model_xgb.predict(X_test_tfidf)

Parameters: { "use_label_encoder" } are not used.



In [32]:
print(f'Logistic regression report: {classification_report(y_test, model_lg_pred)}')
print(f'NB report: {classification_report(y_test, model_nb_pred)}')
print(f'Random forest report: {classification_report(y_test, model_rf_pred)}')
print(f'SVM report: {classification_report(y_test, model_svm_pred)}')
print(f'Decision tree report: {classification_report(y_test, model_tree_pred)}')
print(f'XGB classifier report: {classification_report(y_test, model_xgb_pred)}')

Logistic regression report:               precision    recall  f1-score   support

           0       0.85      0.84      0.84       803
           1       0.83      0.88      0.85       763
           2       0.93      0.95      0.94       774
           3       0.91      0.93      0.92       834
           4       0.99      0.99      0.99       794
           5       0.99      1.00      0.99       813
           6       0.86      0.77      0.81       806
           7       0.98      0.98      0.98       805
           8       0.94      0.95      0.95       808

    accuracy                           0.92      7200
   macro avg       0.92      0.92      0.92      7200
weighted avg       0.92      0.92      0.92      7200

NB report:               precision    recall  f1-score   support

           0       0.81      0.67      0.74       803
           1       0.74      0.79      0.76       763
           2       0.81      0.96      0.88       774
           3       0.82      0.93      

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC(random_state=RANDOM_STATE))
])

param_grid = {
    # 'tfidf__max_features': [5000, 10000, 20000],
    # 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # 'tfidf__min_df': [1, 2, 5],
    # 'tfidf__max_df': [0.8, 0.9, 1.0],
    'svm__C': [0.01, 0.1, 1, 10, 100],
    'svm__loss': ['hinge', 'squared_hinge'],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)
print("Лучшая точность на кросс-валидации:", grid_search.best_score_)

best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 20 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Gulfik\Desktop\petprojects\hseContest\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Gulfik\Desktop\petprojects\hseContest\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Gulfik\Desktop\petprojects\hseContest\.venv\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_pa

Лучшие параметры: {'svm__C': 1, 'svm__dual': True, 'svm__loss': 'hinge'}
Лучшая точность на кросс-валидации: 0.9338194444444445
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       803
           1       0.87      0.88      0.88       763
           2       0.94      0.95      0.95       774
           3       0.92      0.94      0.93       834
           4       0.99      1.00      1.00       794
           5       0.99      1.00      0.99       813
           6       0.87      0.84      0.85       806
           7       0.98      0.99      0.98       805
           8       0.95      0.96      0.96       808

    accuracy                           0.94      7200
   macro avg       0.93      0.93      0.93      7200
weighted avg       0.93      0.94      0.93      7200



In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=RANDOM_STATE))
])

param_grid = {
    # 'tfidf__max_features': [5000, 10000, 20000],
    # 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # 'tfidf__min_df': [1, 2, 5],
    # 'tfidf__max_df': [0.8, 0.9, 1.0],
    'clf__C': [0.1, 1, 10, 100],
    'clf__solver': ['lbfgs', 'liblinear'],
    'clf__penalty': ['l2'],
    'clf__max_iter': [500, 1000, 1500]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

best_lg_model = grid_search.best_estimator_
best_lg_pred = best_lg_model.predict(X_test)
print(classification_report(y_test, best_lg_pred))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Лучшие параметры: {'clf__C': 100, 'clf__max_iter': 500, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       803
           1       0.86      0.88      0.87       763
           2       0.93      0.96      0.95       774
           3       0.92      0.94      0.93       834
           4       0.99      0.99      0.99       794
           5       0.99      1.00      0.99       813
           6       0.86      0.80      0.83       806
           7       0.98      0.99      0.98       805
           8       0.96      0.96      0.96       808

    accuracy                           0.93      7200
   macro avg       0.93      0.93      0.93      7200
weighted avg       0.93      0.93      0.93      7200



In [10]:
print("Shape of X_train_tfidf:", X_train_tfidf.shape)
print("Length of y_train:", len(y_train))

Shape of X_train_tfidf: (25600, 5000)
Length of y_train: 25600


In [16]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_tfidf, y_train)

y_pred_xgb = xgb_model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.87      0.89      0.88       785
           1       0.94      0.94      0.94       819
           2       0.97      0.96      0.96       761
           3       0.93      0.95      0.94       810
           4       0.99      0.98      0.99       782
           5       0.98      0.99      0.99       785
           6       0.99      0.97      0.98       798
           7       0.97      0.96      0.97       860

    accuracy                           0.95      6400
   macro avg       0.96      0.95      0.96      6400
weighted avg       0.96      0.95      0.95      6400



In [11]:
import torch

torch.cuda.is_available()

True

In [42]:
test_df = pd.read_csv('./test_news.csv')
test_df['cleaned_text'] = test_df['content'].apply(preprocess_text)
y_pred_test = best_pipeline.predict(test_df['cleaned_text'])
pd.DataFrame({'topic': y_pred_test}).reset_index().to_csv('submission.csv', index=False)

In [40]:
test_df = pd.read_csv('./test_news.csv')
test_df['cleaned_text'] = test_df['content'].apply(preprocess_text)

X_test_tfidf = vectorizer.transform(test_df['cleaned_text'])
y_pred_test = model.predict(X_test_tfidf)

pd.DataFrame({'topic': y_pred_test}).reset_index().to_csv('submission.csv', index=False)

AttributeError: 'csr_matrix' object has no attribute 'lower'

In [41]:
test_df = pd.read_csv('./test_news.csv')
test_df['cleaned_text'] = test_df['content'].apply(preprocess_text)

X_test_tfidf = vectorizer.transform(test_df['cleaned_text'])
y_pred_test = xgb_model.predict(X_test_tfidf)

pd.DataFrame({'topic': y_pred_test}).reset_index().to_csv('submission.csv', index=False)