In [4]:
import spacy
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from spacy.lang.ru.stop_words import STOP_WORDS 

In [5]:
# Load the `ru_core_news_md` spaCy pipeline once at module level; `lemmatizer`
# below reuses this shared `nlp` object on every call instead of reloading it.
nlp = spacy.load("ru_core_news_md")

# Text preprocessing
def preprocess(text: str) -> str:
    """Normalize raw text before stop-word removal and lemmatization.

    Steps, in order: lowercase, remove HTML-like tags, replace every character
    that is neither a word character nor whitespace with a space, collapse
    whitespace runs into a single space, and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string with single spaces between tokens and
        no leading or trailing whitespace.
    """
    text = text.lower()
    # Remove HTML-like tags; the non-greedy match handles each <...> pair separately.
    text = re.sub(r'<.*?>', '', text)
    # Replace punctuation and symbols (',', '"', '#', ...) with a space.
    # `\w` also matches digits and '_', so those characters are kept.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Collapse runs of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)
    # Strip last: the punctuation replacement above can leave a space at either
    # end (e.g. "мир!" -> "мир "), which stripping up front would not remove.
    return text.strip()

# Removes noise (stop) words such as "этом", "пока", "одном", etc.
def stopword(string):
    """Return `string` without the tokens found in spaCy's Russian STOP_WORDS.

    The input is split on whitespace; a token is dropped only when it is an
    exact member of STOP_WORDS. The remaining tokens are joined back with
    single spaces, preserving their order.
    """
    kept_tokens = []
    for token in string.split():
        if token in STOP_WORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Reduces every word to its base (dictionary) form
def lemmatizer(string):
    """Return `string` with each token replaced by its lemma.

    The text is run through the module-level spaCy pipeline `nlp`, and the
    `lemma_` of every resulting token is joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(string))

In [16]:
# Load the dataset CSV (path is relative to the notebook's working directory)
# and print a per-column summary of non-null counts and dtypes.
# NOTE(review): `new_text` is presumably the output of the
# preprocess -> stopword -> lemmatizer chain defined above — confirm against
# the notebook that produced this file.
df = pd.read_csv('./data/lenta-ru-processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306331 entries, 0 to 306330
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  306331 non-null  int64 
 1   index       306331 non-null  int64 
 2   text        306331 non-null  object
 3   tags        306331 non-null  object
 4   tag_code    306331 non-null  int64 
 5   new_text    306331 non-null  object
dtypes: int64(3), object(3)
memory usage: 14.0+ MB


In [30]:
# Build the training frame without the 'index' column; `drop` returns a new
# frame, so `df` itself is left untouched.
# NOTE(review): the 'Unnamed: 0' column reported by df.info() is not dropped
# here — it looks like a leftover saved index; confirm whether it should go too.
# NOTE(review): this cell's execution count is higher than that of the split
# cell that consumes `df_train`; verify the notebook with Restart & Run All.
df_train = df.drop(columns=['index'])

In [8]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffled split — and therefore every metric computed from it — reproducible
# across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(df_train["new_text"],
                                                  df_train["tag_code"].values,
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=42)

In [9]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the validation texts, so the validation data does
# not take part in fitting the features.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)

In [53]:
# Train a logistic regression classifier on the TF-IDF features
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

# Predicted labels for the validation set
y_predict = lr_tfidf.predict(X_val_vectors_tfidf)
# Keeps only column 1 of predict_proba; y_prob is not used again in this cell.
# NOTE(review): column 1 is "the positive-class probability" only for a binary
# target, while the manual test cell further down prints many more probability
# columns — confirm which target this cell is meant for.
y_prob = lr_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]
 
# Per-class precision / recall / F1 on the validation set.
# NOTE(review): the saved report lists 2 classes and 13391 rows, which does not
# match a 20% split of the 306331-row frame loaded above — the stored output
# looks stale; re-run from a fresh kernel.
print(classification_report(y_val,y_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8085
           1       0.99      0.99      0.99      5306

    accuracy                           0.99     13391
   macro avg       0.99      0.99      0.99     13391
weighted avg       0.99      0.99      0.99     13391



In [11]:
# Save the trained classifier together with the fitted TF-IDF vectorizer.
# The classifier only accepts vectors produced by this exact vectorizer (same
# vocabulary and IDF weights), so the model file alone cannot be used for
# inference in a fresh process.
with open('./models/model_pl_sp.pkl','wb') as f:
    pickle.dump(lr_tfidf,f)

with open('./models/tfidf_vectorizer_pl_sp.pkl','wb') as f:
    pickle.dump(tfidf_vectorizer,f)

In [12]:
# Try the model on a piece of text typed in by the user
test = input('enter text')

# Apply the text-cleaning chain step by step: normalize, drop stop words, lemmatize
test_processed = preprocess(test)
test_processed = stopword(test_processed)
test_processed = lemmatizer(test_processed)

# The vectorizer expects an iterable of documents, hence the one-element list
X_test = [test_processed]
X_vector = tfidf_vectorizer.transform(X_test)

y_predict = lr_tfidf.predict(X_vector)
y_prob = lr_tfidf.predict_proba(X_vector)
print(y_prob, y_predict)

# Map the predicted numeric code back to its tag name via the first matching row
print(df_train.loc[df_train['tag_code'] == y_predict.item(), 'tags'].iloc[0])

[[0.00158733 0.03724978 0.0192563  0.11170433 0.12804773 0.00642478
  0.03285391 0.00681216 0.01701266 0.08143012 0.0055336  0.03633922
  0.0361661  0.02027863 0.0032269  0.0708937  0.00888181 0.03157345
  0.01112665 0.00460091 0.12082311 0.03638318 0.00258667 0.00701539
  0.01235337 0.00312557 0.01355248 0.02096655 0.0015987  0.01626132
  0.00212446 0.0010546  0.00687281 0.00969734 0.0024643  0.00678842
  0.00299095 0.00564704 0.00503647 0.00888442 0.00765735 0.00396522
  0.00580133 0.00145387 0.01466924 0.00517697 0.00404881]] [5]
Происшествия
