# Classification of comments

## Preparation

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgbm

In [2]:
df = pd.read_csv('../../datasets/toxic_comments.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [4]:
df.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
df.duplicated().sum()

0

In [6]:
df.isna().sum()

text     0
toxic    0
dtype: int64

The file contains 159571 objects, no duplicates or missing values.

In [7]:
df['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

The sample is unbalanced, with almost 9 times fewer toxic comments than non-toxic comments.

Let's remove punctuation marks, break it up into separate words, and lemmatize each review.

In [8]:

def tokenization(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('^a-zA-Z', '', text)
    text = re.split('\W+', text)
    return text

In [9]:
def lemmatizer(text):
    l = WordNetLemmatizer()
    text = [l.lemmatize(word, pos='v') for word in text]
    return text
    



In [10]:
def comma_remove(text):
    text  = " ".join([char for char in text if char not in string.punctuation])
    return text

In [11]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ROKO000000109\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
df['lemmatized'] = df['text'].apply(tokenization).apply(lemmatizer).apply(comma_remove)

In [13]:
df.head()

Unnamed: 0,text,toxic,lemmatized
0,Explanation\nWhy the edits made under my usern...,0,Explanation Why the edit make under my usernam...
1,D'aww! He matches this background colour I'm s...,0,Daww He match this background colour Im seemin...
2,"Hey man, I'm really not trying to edit war. It...",0,Hey man Im really not try to edit war Its just...
3,"""\nMore\nI can't make any real suggestions on ...",0,More I cant make any real suggestions on impro...
4,"You, sir, are my hero. Any chance you remember...",0,You sir be my hero Any chance you remember wha...


The lemmatization looks correct, you can divide the dataset into training and test samples.

In [14]:
test = df.sample(frac=0.2,random_state=42).copy()
train = df[~df.index.isin(test.index)].copy()

In [15]:
test.shape

(31914, 3)

In [16]:
train.shape

(127657, 3)

The models will be checked using cross validation, we will not create a validation sample.

In [17]:
X = train.drop(['toxic', 'text'], axis = 1)
y = train['toxic']
X_test = test.drop(['toxic', 'text'], axis = 1)
y_test = test['toxic']

In [18]:
corpus = X['lemmatized'].values.astype('U')
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf = count_tf_idf.fit_transform(corpus)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROKO000000109\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Training

The data is prepared, let's train some models.

### Logistic regression

In [19]:
parameters = {'C': [3,4],
              'penalty': ['l1'],
              'max_iter':[200]}
clf = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, scoring=make_scorer(f1_score))
clf.fit(tf_idf, y)

GridSearchCV(cv=3, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [3, 4], 'max_iter': [200], 'penalty': ['l1']},
             scoring=make_scorer(f1_score))

In [20]:
clf.best_params_

{'C': 3, 'max_iter': 200, 'penalty': 'l1'}

In [21]:
logreg_f1_cv = clf.best_score_
logreg_f1_cv

0.7704664669269269

In [22]:
corpus_test = X_test['lemmatized'].values.astype('U')
tf_idf_test = count_tf_idf.transform(corpus_test)

In [23]:
logreg_f1_test = f1_score(y_test, clf.predict(tf_idf_test))
logreg_f1_test

0.7841945288753799

### Decision Tree

In [24]:
parameters = {'max_depth': [60,70,100],
             'random_state':[42]}
dtc = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, scoring=make_scorer(f1_score))
dtc.fit(tf_idf, y)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [60, 70, 100], 'random_state': [42]},
             scoring=make_scorer(f1_score))

In [25]:
dtc.best_params_

{'max_depth': 100, 'random_state': 42}

In [26]:
dtc_f1_cv = dtc.best_score_
dtc_f1_cv

0.713704885092208

In [27]:
dtc_f1_test = f1_score(y_test, dtc.predict(tf_idf_test))
dtc_f1_test

0.7306648575305291

### LightGBM

In [28]:
parameters = {'boosting_type':['gbdt'],
        'objective':['binary'],
        'learning_rate':[0.02,0.05, 0.1],
        'num_leaves':[50,75],
        'n_estimators':[120]
        }

lgb = GridSearchCV(lgbm.LGBMClassifier(), parameters, cv=3, scoring=make_scorer(f1_score))

In [29]:
lgb.fit(tf_idf,y)

GridSearchCV(cv=3, estimator=LGBMClassifier(),
             param_grid={'boosting_type': ['gbdt'],
                         'learning_rate': [0.02, 0.05, 0.1],
                         'n_estimators': [120], 'num_leaves': [50, 75],
                         'objective': ['binary']},
             scoring=make_scorer(f1_score))

In [30]:
lgb.best_params_

{'boosting_type': 'gbdt',
 'learning_rate': 0.1,
 'n_estimators': 120,
 'num_leaves': 75,
 'objective': 'binary'}

In [31]:
lgb_f1_cv = lgb.best_score_
lgb_f1_cv

0.7623186660329315

In [32]:
lgb_f1_test = f1_score(y_test, lgb.predict(tf_idf_test))
lgb_f1_test

0.7769056340484912

In [33]:
result = pd.DataFrame({'Модель': ['LogisticRegression', 'DecisionTreeClassifier', 'LightgbmClassifier'],
                        'F1_valid':[logreg_f1_cv.round(3), dtc_f1_cv.round(3), lgb_f1_cv.round(3)],
                       'F1_test':[logreg_f1_test.round(3), dtc_f1_test.round(3),lgb_f1_test.round(3)]})

In [34]:
result

Unnamed: 0,Модель,F1_valid,F1_test
0,LogisticRegression,0.77,0.784
1,DecisionTreeClassifier,0.714,0.731
2,LightgbmClassifier,0.762,0.777


## Выводы

- Prepared data: tokenization and lemmatization of feedback texts were performed. 
- Three models were trained and tested: Logistic regression, decision tree and LightGBM
- The best result was obtained when using Logistic regression and LighGBM models, the *F1* metric values on the test sample were 0.784 and 0.777, respectively. 