# Проект для «Викишоп»

Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.  

#### Необходимо

Обучить модель классифицировать комментарии на позитивные и негативные. В распоряжении имеется набор данных с разметкой о токсичности правок.
Построить модель со значением метрики качества F1 не меньше 0.75.

In [26]:
!pip install pymystem3

Collecting pymystem3
  Downloading pymystem3-0.2.0-py3-none-any.whl (10 kB)
Installing collected packages: pymystem3
Successfully installed pymystem3-0.2.0


In [53]:
import numpy as np
import pandas as pd
import nltk
import re
from pymystem3 import Mystem
from nltk.corpus import wordnet
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from tqdm import notebook
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [55]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/iuser24/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iuser24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/iuser24/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/iuser24/nltk_data...


True

In [49]:
lemmatizer = WordNetLemmatizer()

In [2]:
df = pd.read_csv('https://code.s3.yandex.net/datasets/toxic_comments.csv', index_col='Unnamed: 0')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159292 entries, 0 to 159450
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159292 non-null  object
 1   toxic   159292 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.6+ MB


In [58]:
df.head()

Unnamed: 0,text,toxic,lemm_text
0,Explanation\nWhy the edits made under my usern...,0,explanation why the edits make under my userna...
1,D'aww! He matches this background colour I'm s...,0,d aww he match this background colour i m seem...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man i m really not try to edit war it s ju...
3,"""\nMore\nI can't make any real suggestions on ...",0,more i ca n t make any real suggestion on impr...
4,"You, sir, are my hero. Any chance you remember...",0,you sir be my hero any chance you remember wha...


In [28]:
def lemmatize(text):
    m = Mystem()
    lemm_list = m.lemmatize(text)
    lemm_text = "".join(lemm_list)
    return lemm_text

In [36]:
def lemmatize_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags
    ]
    lemm_text = " ".join(lemmatized_tokens)
    cleared_text = re.sub(r'[^a-zA-Z]', ' ', lemm_text)
    return " ".join(cleared_text.split())

In [51]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [29]:
def clear_text(text):
    clr_txt = re.sub(r'[^а-яА-ЯёЁ ]', ' ', text)
    clr_txt_list = clr_txt.split() 
    return ' '.join(clr_txt_list)

In [56]:
df['lemm_text'] = df['text'].apply(lemmatize_text)

In [59]:
df = df.drop(['text'], axis=1)

In [60]:
df

Unnamed: 0,toxic,lemm_text
0,0,explanation why the edits make under my userna...
1,0,d aww he match this background colour i m seem...
2,0,hey man i m really not try to edit war it s ju...
3,0,more i ca n t make any real suggestion on impr...
4,0,you sir be my hero any chance you remember wha...
...,...,...
159446,0,and for the second time of ask when your view ...
159447,0,you should be ashamed of yourself that be a ho...
159448,0,spitzer umm theres no actual article for prost...
159449,0,and it look like it be actually you who put on...


In [64]:
nltk.download('stopwords')
stopwords = list(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/iuser24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
train, test = train_test_split(df, test_size=0.1)

In [81]:
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf_train = count_tf_idf.fit_transform(train['lemm_text'])
tf_idf_test = count_tf_idf.transform(test['lemm_text'])

In [82]:
print("Размер матрицы train:", tf_idf_train.shape)
print("Размер матрицы test:", tf_idf_test.shape)

Размер матрицы train: (143362, 148652)
Размер матрицы test: (15930, 148652)


In [83]:
lr = LogisticRegression(random_state=42)

In [None]:
lr = lr.fit(tf_idf_train, train['toxic'], sample_weight=None)

In [76]:
toxic_pred = lr.predict(tf_idf_train)

In [77]:
accuracy_score(train['toxic'], toxic_pred)

0.9434330752598172

In [78]:
toxic_pred = lr.predict(tf_idf_test)

In [79]:
accuracy_score(test['toxic'], toxic_pred)

0.9309478970495919