In [18]:
import pandas as pd

df_train = pd.read_csv('cleaned_news.csv')
df_train = df_train.fillna('')
df_test = pd.read_csv('cleaned_test_news.csv')
df_test = df_test.fillna('')

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = df_train['content']
y_train = df_train['theme']

X_test = df_test['content']

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


model = RandomForestClassifier()

parameters = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=parameters, cv=5)
grid_search.fit(X_train, y_train)
print("Лучшие параметры: ", grid_search.best_params_)

model = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], max_depth=grid_search.best_params_['max_depth'], \
min_samples_split=grid_search.best_params_['min_samples_split'], min_samples_leaf=grid_search.best_params_['min_samples_leaf'])

model.fit(X_train, y_train)

y_pred = model.predict(X_train)

print(classification_report(y_train, y_pred))

Лучшие параметры:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 400}


ValueError: Classification metrics can't handle a mix of continuous-multioutput and multiclass targets

In [22]:
y_pred = model.predict(X_test)
answer = pd.DataFrame()
answer['topic'] = y_pred
answer['index'] = range(len(answer))

answer.to_csv('answer.csv', index=False)

In [24]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_train)

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.36      0.52      2000
           1       0.88      0.94      0.91      2000
           2       0.81      0.96      0.88      2000
           3       0.69      0.93      0.79      2000
           4       0.98      0.97      0.97      2000
           5       0.92      1.00      0.95      2000
           6       0.97      0.98      0.98      1998
           7       0.97      0.97      0.97      2000
           8       0.97      0.90      0.93      2000

    accuracy                           0.89     17998
   macro avg       0.90      0.89      0.88     17998
weighted avg       0.90      0.89      0.88     17998



In [36]:
model1 = RandomForestClassifier(n_estimators=200, max_depth=grid_search.best_params_['max_depth'], \
min_samples_split=grid_search.best_params_['min_samples_split'], min_samples_leaf=grid_search.best_params_['min_samples_leaf'])

model1.fit(X_train, y_train)

y_pred = model1.predict(X_train)

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.32      0.47      2000
           1       0.89      0.94      0.91      2000
           2       0.82      0.95      0.88      2000
           3       0.67      0.93      0.78      2000
           4       0.97      0.97      0.97      2000
           5       0.90      1.00      0.95      2000
           6       0.97      0.98      0.98      1998
           7       0.97      0.97      0.97      2000
           8       0.96      0.91      0.93      2000

    accuracy                           0.88     17998
   macro avg       0.89      0.88      0.87     17998
weighted avg       0.89      0.88      0.87     17998



In [38]:
from catboost import CatBoostClassifier

# Определите модель
model2 = CatBoostClassifier()

# Определите параметры для поиска
parameters = {
    'iterations': [1, 2, 3],
    'depth': [2, 4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

# Определите поиск по сетке
grid_search = GridSearchCV(estimator=model2, param_grid=parameters, cv=5)

# Выполните поиск по сетке
grid_search.fit(X_train, y_train)

# Выведите лучшие параметры
print("Лучшие параметры: ", grid_search.best_params_)

# Обучите модель с лучшими параметрами
model2 = CatBoostClassifier(iterations=grid_search.best_params_['iterations'], 
                           depth=grid_search.best_params_['depth'], 
                           learning_rate=grid_search.best_params_['learning_rate'], 
                           l2_leaf_reg=grid_search.best_params_['l2_leaf_reg'])

model2.fit(X_train, y_train)

# Предсказание
y_pred = model2.predict(X_train)

# Отчет о классификации
print(classification_report(y_train, y_pred))


0:	learn: 2.1868852	total: 425ms	remaining: 0us
0:	learn: 2.1861179	total: 324ms	remaining: 0us
0:	learn: 2.1865105	total: 374ms	remaining: 0us
0:	learn: 2.1866869	total: 359ms	remaining: 0us
0:	learn: 2.1853856	total: 322ms	remaining: 0us
0:	learn: 2.1466957	total: 366ms	remaining: 0us
0:	learn: 2.1429447	total: 352ms	remaining: 0us
0:	learn: 2.1448633	total: 370ms	remaining: 0us
0:	learn: 2.1457243	total: 396ms	remaining: 0us
0:	learn: 2.1393645	total: 352ms	remaining: 0us
0:	learn: 2.0996540	total: 358ms	remaining: 0us
0:	learn: 2.0923992	total: 380ms	remaining: 0us
0:	learn: 2.0961071	total: 379ms	remaining: 0us
0:	learn: 2.0977633	total: 354ms	remaining: 0us
0:	learn: 2.0854804	total: 352ms	remaining: 0us
0:	learn: 2.1870716	total: 368ms	remaining: 0us
0:	learn: 2.1863012	total: 413ms	remaining: 0us
0:	learn: 2.1866974	total: 448ms	remaining: 0us
0:	learn: 2.1868710	total: 414ms	remaining: 0us
0:	learn: 2.1855794	total: 425ms	remaining: 0us
0:	learn: 2.1475828	total: 408ms	remaini

KeyboardInterrupt: 