In [82]:
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import itertools

# sklearn
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          plot_place=[0, 0, 0]):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    plt.subplot(*plot_place)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [83]:
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv("presuicidal_signals_dataset_twitter.csv",delimiter="|")
non_suicidal = data.loc[data['label'] == 5].head(5000)[['text', 'label']].rename(columns={'label': 'class'}).replace(5, 0)
non_suicidal

Unnamed: 0,text,class
0,встаём завтра в 8 утра и делаем все дела,0.0
1,меня позвали на суши. и боюсь и хочу. согласил...,0.0
2,валя <emoji>Skull</emoji>,0.0
3,то есть вы не пишете на столько бессмысленную ...,0.0
4,Не знаю почему никто не сделал или я не нашëл ...,0.0
...,...,...
5952,у нас ещё сильнее снег пошёл<emoji>Grinning fa...,0.0
5953,Я пытаюсь в этих ваших китайских новеллах\nНаз...,0.0
5954,Я сейчас живу одна и вы видели счастье на моём...,0.0
5955,у меня уже спрашивают с сарказмом я говорю или...,0.0


In [76]:
data = pd.read_csv("own_shit.csv")
suicidal = data[['text', 'class']]

data = non_suicidal.append(suicidal)

  data = non_suicidal.append(suicidal)


In [77]:
data['text'] = data['text'].apply(lambda string: " ".join(remove_emojis(string)).split("\n"))
data.to_csv("suicidal.csv")

In [86]:
non_suicidal['text'] = non_suicidal['text'].apply(lambda it: remove_emojis(it))
non_suicidal.to_csv("non_suicidal.csv")

In [79]:
data = data.sample(frac=1)
data

Unnamed: 0,text,class
413,[Н е т н и о д н о й п р и ч и н ы ж и...,1.0
350,"[м ы в с е л ю д и н и , у н а с у ...",0.0
984,[Н е в и ж у с м ы с л а в с в о е й ...,1.0
584,"[П о м о г и т е , к а к и з б а в и т ь с...",1.0
152,"[Я н е в и ж у с м ы с л ж и з н и ,...",1.0
...,...,...
864,[О т н о ш е н и я Ч у и и Ж е н и н а...,0.0
671,[М о ж е т е л и в ы п р е д с т а в и т...,1.0
143,"[Г о с п о д и , п р о с т и м н е м о е...",1.0
709,"[С е г о д н я т о т с а м ы й д е н ь ,...",1.0


In [None]:
from pymorphy3 import MorphAnalyzer
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')

an = MorphAnalyzer(lang='ru')
stops = stopwords.words('russian')

def getClearSentences(sentences):
    return " ".join(str(s) + "" for s in (an.normal_forms(y)[0] for y in filter(lambda x: x not in stops, nltk.word_tokenize(str(sentences)))))

data['text'] = data['text'].apply(lambda x: getClearSentences(x))
corpus = data['text']
corpus

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorized_data = vectorizer.fit_transform(corpus)
vectorized_data

In [None]:
classes_data = data['class'].apply(lambda x: int(x))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

def confusionMatrices(estimator, classes = []):
    splits = 3
    kf = KFold(n_splits=splits)
    n = 1
    plt.figure(figsize=(20, 10))
    for train, test in kf.split(vectorized_data):
        estimator.fit(vectorized_data[train], classes_data.iloc[train].values.ravel())
        predicted = estimator.predict(vectorized_data[test])

        matrix = confusion_matrix(classes_data.iloc[test], predicted)
        plot_confusion_matrix(matrix, classes, plot_place=[splits, 1, n])
        n += 1

def crossScores(estimator):
    crossScoreF = cross_val_score(estimator, scoring='f1_micro', X=vectorized_data, y=classes_data.tolist(), cv=3)

    plt.plot(crossScoreF)
    plt.title("f1 micro")


from sklearn.model_selection import GridSearchCV

def gridSearch(estimator, paramGrid):

    xTrain, xTest, yTrain, yTest = train_test_split(vectorized_data, classes_data, test_size=0.2)

    searchCV = GridSearchCV(estimator, param_grid=paramGrid)
    searchCV.fit(xTrain, yTrain)
    print(searchCV.best_params_)
    searchCV.cv_results_

# Градиентный бустинг

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier()
gridSearch(estimator, paramGrid={'learning_rate': [0.1, 0.5, 1, 2], 'min_samples_split': [2, 3, 6], 'n_estimators': [20, 40, 60]})

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

estimator = GradientBoostingClassifier(learning_rate=0.4, min_samples_split=3, n_estimators=60)
confusionMatrices(estimator=estimator, classes=["суицидальное", "обычное"])

In [None]:
crossScores(estimator)

In [None]:
estimator.predict(vectorizer.transform([getClearSentences("Впизду все это, заебало нахуй. Зачем я здесь? Что мне уготовлено? проще вздернуться")]))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()
gridSearch(estimator, paramGrid={'n_jobs': [-1],'n_estimators': [1, 10, 100, 500], 'max_depth':[None, 10, 100], 'class_weight': [None, 'balanced', 'balanced_subsample']})

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(class_weight='balanced_subsample', n_jobs=-1, n_estimators=500, max_depth=100)
plt.figure(figsize=(20, 10))
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
estimator = RandomForestClassifier(n_jobs=-1, n_estimators=500)
crossScores(estimator)

# SVC

In [None]:
from sklearn.svm import SVC

estimator = SVC()
gridSearch(estimator, paramGrid={'C': [1.0, 2.0, 10.0, 100.0], 'degree': [3, 4, 6, 7], 'kernel': ['linear', 'poly', 'rbf']})

In [None]:
estimator = SVC(C=1.0, degree=3, kernel='linear')
confusionMatrices(estimator, ["суицидальное", "обычное"])

In [None]:
crossScores(estimator)