In [36]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
from deeppavlov import build_model, configs
from deeppavlov.core.data.utils import download
from sklearn.model_selection import train_test_split

Загрузка данных

In [2]:
data = pd.read_csv(
    'datasets/toxic_comments.csv', 
    sep=',', 
    quoting=1, 
    on_bad_lines='skip',
    engine='python',
    na_values=['', ' ', 'NA']
)

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [4]:
data.rename(columns = {'toxic' : 'label'}, inplace = True)
data = data.drop('Unnamed: 0', axis=1)

In [5]:
data.head()

Unnamed: 0,text,label
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [6]:
data.isna().sum()

text     0
label    0
dtype: int64

In [7]:
data.duplicated().sum()

0

In [8]:
data.text

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
113076              (UTC)\nUpdated:  19:55, 8 February 2006
113077    I am Lucy Lennon 4me. I only made a comment he...
113078                    It's alright, didn't even notice.
113079    Aisha Azzouzi is a clever girl and really piff...
113080    "\n\nI shall ignore your stupid personal insul...
Name: text, Length: 113081, dtype: object

Лемматизация

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
def clear_and_lemmatize_series(series: pd.Series) -> pd.Series:
    return series.apply(_clear_and_lemmatize_text)

def _clear_and_lemmatize_text(text: str) -> str:
    return _clear_text(_lemmatize_text(text))

def _clear_text(text: str) -> str:
    return re.sub(r'[^\w\s]', ' ', text)

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

def _lemmatize_text(text: str) -> str:
    # Лемматизация и объединение списка слов в строку
    lemmatized_words = [lemmatizer.lemmatize(word.lower(), pos=penn2morphy(tag))
                        for word, tag in pos_tag(word_tokenize(text))]
    return ' '.join(lemmatized_words)

In [18]:
data['prepared_text'] = clear_and_lemmatize_series(data['text'])

Векторизация

In [21]:
vectorizer = TfidfVectorizer()

In [25]:
def vectorize_series(series: pd.Series) -> List[str]:
    # Преобразуем серию в матрицу и возвращаем список строк
    return list(vectorizer.fit_transform(series))

In [26]:
data['vectorized_text'] = vectorize_series(data['text'])

In [27]:
data.head()

Unnamed: 0,text,label,prepared_text,vectorized_text
0,Explanation\nWhy the edits made under my usern...,0,explanation why the edits make under my userna...,"(0, 3446)\t0.16769576946881884\n (0, 4373)\..."
1,D'aww! He matches this background colour I'm s...,0,d aww he match this background colour i m s...,"(0, 142295)\t0.16412463608924402\n (0, 2613..."
2,"Hey man, I'm really not trying to edit war. It...",0,hey man i m really not try to edit war it...,"(0, 69411)\t0.20448767581830993\n (0, 8935)..."
3,"""\nMore\nI can't make any real suggestions on ...",0,more i ca n t make any real suggestion on i...,"(0, 137467)\t0.16225009518550423\n (0, 5929..."
4,"You, sir, are my hero. Any chance you remember...",0,you sir be my hero any chance you rememb...,"(0, 146787)\t0.18634179087764524\n (0, 1136..."


In [44]:
print("\n=== DeepPavlov for English ===")


=== DeepPavlov for English ===


In [40]:
model = build_model('insults_kaggle_bert', download=True, install=True)

2025-02-21 23:09:10.550 INFO in 'deeppavlov.core.data.utils'['utils'] at line 97: Downloading from http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_torch_bert_v5.tar.gz to C:\Users\Home\.deeppavlov\models\insults_kaggle_torch_bert_v5.tar.gz
100%|██████████| 1.09G/1.09G [01:42<00:00, 10.7MB/s] 
2025-02-21 23:10:53.437 INFO in 'deeppavlov.core.data.utils'['utils'] at line 284: Extracting C:\Users\Home\.deeppavlov\models\insults_kaggle_torch_bert_v5.tar.gz archive into C:\Users\Home\.deeppavlov\models\classifiers
2025-02-21 23:11:05.982 INFO in 'deeppavlov.core.data.utils'['utils'] at line 97: Downloading from http://files.deeppavlov.ai/datasets/insults_data.tar.gz to C:\Users\Home\.deeppavlov\insults_data.tar.gz
100%|██████████| 682k/682k [00:00<00:00, 1.53MB/s]
2025-02-21 23:11:06.893 INFO in 'deeppavlov.core.data.utils'['utils'] at line 284: Extracting C:\Users\Home\.deeppavlov\insults_data.tar.gz archive into C:\Users\Home\.deeppavlov\downloads
To support symlinks

In [41]:
data['result'] = model(data['prepared_text'].tolist())

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 22232629248 bytes.

In [42]:
def unify_result_series(series):
    series = series.str.replace('Not Insult', '0').str.replace('Insult', '1').astype(int)

In [43]:
data['result'] = unify_result_series(data['result'])

KeyError: 'result'

In [None]:
print("\n=== DeepPavlov for Russian ===")

In [None]:
data = pd.read_csv(
    'datasets/rusentitweet_full.csv', 
    sep=',', 
    quoting=1, 
    on_bad_lines='skip',
    engine='python',
    na_values=['', ' ', 'NA']
)

In [None]:
data = data.drop('Unnamed: 0', axis=1)

In [None]:
data.isna().sum()

In [None]:
data.duplicated().sum()

In [None]:
data['label'] = data['label'].str.replace('skip', '0').str.replace('neutral', '0').str.replace('speech', '0').str.replace('positive', '0').str.replace('negative', '1').astype(int)

In [None]:
lemmatizer = pymorphy2.MorphAnalyzer()

In [None]:
def clear_and_lemmatize_series(series: pd.Series) -> pd.Series:
    return series.apply(_clear_and_lemmatize_text)


def _clear_and_lemmatize_text(text: str) -> str:
    return _clear_text(_lemmatize_text(text))
    

def _clear_text(text: str) -> str:
    return re.sub(r'[^\w\s]', ' ', text)


def _lemmatize_text(text: str) -> str:
    return ' '.join(lemmatizer.parse(word)[0].normal_form for word in text.lower().split())


def vectorize_series(series: pd.Series) -> List[str]:
    return list(vectorizer.fit_transform(series))

In [None]:
data['prepared_text'] = clear_and_lemmatize_series(data['text'])

In [None]:
data['vectorized_text'] = vectorize_series(data['text'])

In [None]:
df.head()

In [None]:
model = build_model('rusentiment_convers_bert', download=True, install=True)

In [None]:
data['result'] = model(data['prepared_text'].tolist())

In [None]:
data['result'] = unify_result_series(data['result'])