In [5]:
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import itertools

# sklearn
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Матрица ошибок',
                          cmap=plt.cm.Blues,
                          plot_place=[0, 0, 0]):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    plt.subplot(*plot_place)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Истина')
    plt.xlabel('Прогноз')

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

SPLIT_NUMBER = 4

def confusionMatrices(estimator, classes = []):
    classes_data = data['class'].apply(lambda x: int(x))
    splits = SPLIT_NUMBER
    kf = KFold(n_splits=splits)
    n = 1
    plt.figure(figsize=(10, 10))
    for train, test in kf.split(vectorized_data):
        estimator.fit(vectorized_data[train], classes_data.iloc[train].values.ravel())
        predicted = estimator.predict(vectorized_data[test])

        matrix = confusion_matrix(classes_data.iloc[test], predicted)
        plot_confusion_matrix(matrix, classes, plot_place=[int(splits / 2) + splits % 2, 2, n])
        n += 1

def crossScores(estimator):
    classes_data = data['class'].apply(lambda x: int(x))
    crossScoreAccuracy = cross_val_score(estimator, scoring='accuracy', X=vectorized_data, y=classes_data.tolist(), cv=SPLIT_NUMBER)
    crossScoreF = cross_val_score(estimator, scoring='f1', X=vectorized_data, y=classes_data.tolist(), cv=SPLIT_NUMBER)
    crossScoreRocAuc = cross_val_score(estimator, scoring='roc_auc', X=vectorized_data, y=classes_data.tolist(), cv=SPLIT_NUMBER)

    fig, axs = plt.subplots(SPLIT_NUMBER, figsize=(10, 20))
    
    axs[0].plot(crossScoreAccuracy)
    axs[0].set_title("Точность")
    axs[0].set(xlabel='Порядковый номер разбиения', ylabel='Значение метрики')

    axs[1].plot(crossScoreF)
    axs[1].set_title("F1")
    axs[1].set(xlabel='Порядковый номер разбиения', ylabel='Значение метрики')
    
    axs[2].plot(crossScoreRocAuc)
    axs[2].set_title("ROC AUC")
    axs[2].set(xlabel='Порядковый номер разбиения', ylabel='Значение метрики')


from sklearn.model_selection import GridSearchCV

def gridSearch(vectorized_data, estimator, paramGrid):
    classes_data = data['class'].apply(lambda x: int(x))
    
    xTrain, xTest, yTrain, yTest = train_test_split(vectorized_data, classes_data, test_size=1 / SPLIT_NUMBER)

    searchCV = GridSearchCV(estimator, param_grid=paramGrid, n_jobs=-1, scoring='f1', refit=False)
    searchCV.fit(xTrain, yTrain)
    print(searchCV.best_params_)
    searchCV.cv_results_

In [7]:
import pandas as pd
from sklearn.utils import shuffle

suicidal = pd.read_csv("PreparedDatasets/suicidal.csv")
non_suicidal = pd.read_csv("PreparedDatasets/non_suicidal.csv").head(1500)

data = suicidal.append(non_suicidal)

data = shuffle(data)
data

  data = suicidal.append(non_suicidal)


Unnamed: 0.1,Unnamed: 0,text,class
220,220,почему в этой жизни у всех всё есть кроме меня...,1
1075,1358,щас бы умывашкой для лица за 2к мыть ноги,0
816,1034,"которую вообще-то должна же делать я, а это зн...",0
624,783,"Хотите узнать, насколько я лох по жизни? После...",0
1471,1860,сегодня был такой ужасный день. он начаться с ...,0
...,...,...,...
1004,1277,ахахахха как он не хотел спасать стариков но с...,0
1234,1565,"у меня появилось мыло для бровей, поэтому я ул...",0
431,542,меня сфоткали на планерке...,0
715,715,Я только угроза для других...,1


In [8]:
from pymorphy3 import MorphAnalyzer
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')

an = MorphAnalyzer(lang='ru')
stops = stopwords.words('russian')

def getClearSentences(sentences):
    return " ".join(str(s) + "" for s in (an.normal_forms(y)[0] for y in filter(lambda x: x not in stops, nltk.word_tokenize(str(sentences)))))

data['text'] = data['text'].apply(lambda x: getClearSentences(x))
corpus = data['text']
corpus

220           почему жизнь всё кроме ? проказить родиться
1075                    сейчас умывашка лицо 2к мыть нога
816     который вообще-то должный делать , это значит ...
624     хотеть узнать , насколько лох жизнь ? после 12...
1471    сегодня ужасный день . начаться делать закончи...
                              ...                        
1004      ахахахха хотеть спасать старик сердце приказать
1234    появиться мыло бровь , поэтому уложить волос н...
431                                 сфоткать планёрка ...
715                                   я угроза другой ...
502     жизнь наладиться , светить кроме учёба какой-н...
Name: text, Length: 2499, dtype: object

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorized_data_bag = vectorizer.fit_transform(corpus)
vectorized_data_bag

<2499x9470 sparse matrix of type '<class 'numpy.int64'>'
	with 48142 stored elements in Compressed Sparse Row format>

In [10]:
from transformers import BertTokenizer, BertModel
from pandas import DataFrame
from scipy.sparse import csr_matrix

tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')

bert_tokenized = corpus.apply(lambda ser: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ser)))
bert_list = bert_tokenized.tolist()

nRows = len(bert_list)
nCols = max(max(row) if (len(row) > 0) else 0 for row in bert_list) + 1

dataIn = []
indices = []
indptr = [0]

for row in bert_list:
    indices.extend(row)
    dataIn.extend([1] * len(row))
    indptr.append(len(indices))

vectorized_data_bert = csr_matrix((dataIn, indices, indptr), shape=(nRows, nCols))
vectorized_data_bert

<2499x83818 sparse matrix of type '<class 'numpy.int64'>'
	with 95770 stored elements in Compressed Sparse Row format>

# Градиентный бустинг

## Bag

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier()
gridSearch(vectorized_data_bag, estimator, paramGrid={'learning_rate': [0.1, 0.5, 1, 2], 'min_samples_split': [2, 3, 6], 'n_estimators': [20, 40, 60]})

{'learning_rate': 1, 'min_samples_split': 3, 'n_estimators': 60}


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(learning_rate=0.5, min_samples_split=6, n_estimators=60)
confusionMatrices(estimator=estimator, classes=["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

In [None]:
estimator.predict(vectorizer.transform([getClearSentences("Ща сдохну от смеха")]))

## BERT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier()
gridSearch(vectorized_data_bert, estimator, paramGrid={'learning_rate': [0.1, 0.5, 1, 2], 'min_samples_split': [2, 3, 6], 'n_estimators': [20, 40, 60]})

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(learning_rate=0.5, min_samples_split=6, n_estimators=60)
confusionMatrices(estimator=estimator, classes=["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

In [None]:
estimator.predict(vectorizer.transform([getClearSentences("Ща сдохну от смеха")]))

# Random Forest

## Bag

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()
gridSearch(vectorized_data_bag, estimator, paramGrid={'n_jobs': [-1],'n_estimators': [50, 100, 150], 'max_depth':[50, 100, 150], 'class_weight': [None, 'balanced', 'balanced_subsample'], 'max_features': ['sqrt', 'log2', None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]})

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(class_weight=None, max_depth=100, max_features='sqrt', min_samples_leaf=1, n_estimators=50, n_jobs=-1)
plt.figure(figsize=(20, 10))
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

## BERT

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()
gridSearch(vectorized_data_bert, estimator, paramGrid={'n_jobs': [-1],'n_estimators': [50, 100, 150], 'max_depth':[50, 100, 150], 'class_weight': [None, 'balanced', 'balanced_subsample'], 'max_features': ['sqrt', 'log2', None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]})

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(class_weight=None, max_depth=100, max_features='sqrt', min_samples_leaf=1, n_estimators=50, n_jobs=-1)
plt.figure(figsize=(20, 10))
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

# SVC

## Bag

In [None]:
from sklearn.svm import SVC

estimator = SVC()
gridSearch(vectorized_data_bag, estimator, paramGrid={'C': [1.0, 2.0, 10.0, 100.0], 'degree': [3, 4, 6, 7], 'kernel': ['linear', 'poly', 'rbf']})

In [None]:
estimator = SVC(C=1.0, degree=3, kernel='linear')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

## BERT

In [None]:
from sklearn.svm import SVC

estimator = SVC()
gridSearch(vectorized_data_bert, estimator, paramGrid={'C': [1.0, 2.0, 10.0, 100.0], 'degree': [3, 4, 6, 7], 'kernel': ['linear', 'poly', 'rbf']})

In [None]:
estimator = SVC(C=1.0, degree=3, kernel='linear')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

# KNN

## Bag

In [None]:
from sklearn.neighbors import KNeighborsClassifier

estimator = KNeighborsClassifier()
print(gridSearch(vectorized_data_bag, estimator, paramGrid={'n_neighbors': [3, 5, 10], 'weights': ['uniform', 'distance'], 'leaf_size': [20, 30, 40], 'p': [1, 2, 4], 'metric': ['euclidean', 'manhattan']}))

In [None]:
estimator = KNeighborsClassifier(n_neighbors=3, weights='distance', leaf_size=20, p=1, metric='euclidean')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

## BERT

In [None]:
from sklearn.neighbors import KNeighborsClassifier

estimator = KNeighborsClassifier()
print(gridSearch(vectorized_data_bert, estimator, paramGrid={'n_neighbors': [3, 5, 10], 'weights': ['uniform', 'distance'], 'leaf_size': [20, 30, 40], 'p': [1, 2, 4], 'metric': ['euclidean', 'manhattan']}))

In [None]:
estimator = KNeighborsClassifier(n_neighbors=3, weights='distance', leaf_size=20, p=1, metric='euclidean')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

# Logistic Regression

## Bag

In [None]:
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
gridSearch(vectorized_data_bag, estimator, paramGrid={'penalty': ['l2'], 'C': [2.1, 2.2, 2.3, 2.4], 'class_weight': [{0: 1, 1: 2}, {0: 2, 1: 1}, 'balanced', None], 'solver': ['lbfgs', 'liblinear', 'newton-ct']})

In [None]:
estimator = LogisticRegression(penalty='l2', C= 2.2, class_weight={0:1, 1:2}, solver='liblinear')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

## BERT

In [None]:
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
gridSearch(vectorized_data_bert, estimator, paramGrid={'penalty': ['l2'], 'C': [2.1, 2.2, 2.3, 2.4], 'class_weight': [{0: 1, 1: 2}, {0: 2, 1: 1}, 'balanced', None], 'solver': ['lbfgs', 'liblinear', 'newton-ct']})

In [None]:
estimator = LogisticRegression(penalty='l2', C= 2.2, class_weight={0:1, 1:2}, solver='liblinear')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

# Perceptron

## Bag

In [None]:
from sklearn.linear_model import Perceptron

estimator = Perceptron()
gridSearch(vectorized_data_bag, estimator, paramGrid={'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.0005, 0.001], 'n_jobs': [-1], 'max_iter': [500, 1000, 1500], 'class_weight': [{0:1, 1:2}, {0:2, 1:1}, 'balanced', None]})

In [None]:
estimator = Perceptron(alpha=0.0001, class_weight={0:2, 1:1}, max_iter=500, penalty='l1', n_jobs=-1)
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

## BERT

In [None]:
from sklearn.linear_model import Perceptron

estimator = Perceptron()
gridSearch(vectorized_data_bert, estimator, paramGrid={'penalty': ['l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.0005, 0.001], 'n_jobs': [-1], 'max_iter': [500, 1000, 1500], 'class_weight': [{0:1, 1:2}, {0:2, 1:1}, 'balanced', None]})

In [None]:
estimator = Perceptron(alpha=0.0001, class_weight={0:2, 1:1}, max_iter=500, penalty='l1', n_jobs=-1)
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)