# Обнаружение событий, связанных с компрометацией корпоративной электронной почты, в процессе мониторинга исходящей корреспонденции на основе методов машинного обучения

## Подготовка датасетов

Используем [открытый датасет компании Enron](https://www.cs.cmu.edu/~enron/). Разархивируем его и преобразуем в нужные нам датасеты.

In [None]:
!pip install -r requirements.txt

In [2]:
import os
import mailparser
import re
import pandas as pd

Как выглядит типичное письмо из данных.

In [None]:
# Print one raw message from a "sent" folder to show the on-disk format.
sample_email_path = "enron_mail_20150507/maildir/delainey-d/sent/1."
with open(sample_email_path) as email_file:
    sample_email = email_file.read()
print(sample_email)

Письма раскиданы по папкам. Берём все папки, в которых встречается слово sent - папки с исходящей корреспонденцией.

In [4]:
# List the mailbox sub-folders of a single user.
user_folders = os.listdir("enron_mail_20150507/maildir/delainey-d")
print(user_folders)

['deleted_items', 'discussion_threads', 'sent', 'notes_inbox', '_sent_mail', 'sent_items', 'all_documents', 'inbox']


In [5]:
def parse_emails_by_sender(path="enron_mail_20150507"):
    """Collect the bodies of outgoing e-mails, grouped by sender address.

    Walks ``path`` recursively and parses every file that lives in a
    directory whose path *relative to* ``path`` contains the substring
    ``"sent"``. Files that cannot be parsed are counted and skipped.

    Args:
        path: Root directory of the unpacked mail archive.

    Returns:
        A dict mapping the lower-cased sender address to the set of message
        bodies sent from it. An empty dict is returned when ``path`` does
        not exist, so callers can iterate the result unconditionally.
    """
    outgoing_emails_by_user = {}
    if not os.path.exists(path):
        return outgoing_emails_by_user

    # "[A-Za-z]" instead of "[A-Z|a-z]": the latter also accepted a literal "|".
    address_pattern = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )
    unsuccessful = 0
    total_emails = 0

    for root, _dirs, files in os.walk(path):
        # Only outgoing mail is processed. The check uses the path relative
        # to the archive root, so a root directory that itself contains
        # "sent" in its name does not make every folder match.
        if "sent" not in os.path.relpath(root, path):
            continue

        for file_name in files:
            # Count every attempted file so the printed ratio is
            # "failures out of all processed files".
            total_emails += 1
            try:
                msg = mailparser.parse_from_file(os.path.join(root, file_name))
                sender = msg.headers.get("From")
                body = msg.body

                if not sender:
                    continue
                match = address_pattern.search(sender)
                if match is None:
                    continue
                address = match.group().lower()
                outgoing_emails_by_user.setdefault(address, set()).add(body)
            except Exception:
                # Best effort: a malformed message must not stop the scan.
                unsuccessful += 1

    print(f"Total unsuccessful: {unsuccessful}/{total_emails}")
    return outgoing_emails_by_user


def extract_reply_from_email(text: str, sender_email: str):
    """Reduce an e-mail body to the text written by the sender.

    The body is cut at the first triple newline, then at the first
    "----- Forwarded" marker, then at the first "Original Message" marker.
    Afterwards runs of dashes, parts of the sender's address local part,
    a short trailing paragraph, "thanks" and short "Word," / "Word:"
    lead-ins are removed.

    Args:
        text: Raw message body.
        sender_email: Address of the sender; its local part (split on ".")
            supplies the name fragments that are stripped from the text.

    Returns:
        The cleaned text.
    """
    # Keep only what precedes each marker, applied one after another.
    for marker in ("\n\n\n", "----- Forwarded", "Original Message"):
        text = text.partition(marker)[0]

    text = re.sub("-{3,50}", "", text)

    # The second fragment of the local part is always removed; the first one
    # only when it is longer than two characters.
    local_parts = sender_email.split("@")[0].split(".")
    fragments = []
    if len(local_parts) > 1:
        fragments.append(local_parts[1])
    if len(local_parts[0]) > 2:
        fragments.append(local_parts[0])
    for fragment in fragments:
        text = re.sub(re.escape(fragment), "", text, flags=re.IGNORECASE)

    text = text.rstrip(" \n")

    cleanup_steps = (
        # Short last paragraph (3-50 characters after a blank line).
        (r'(\n\n.{3,50})$', 0),
        ("thanks.?", re.IGNORECASE),
        # Short word followed by "," or ":" at a line/text start or line end.
        (r"(\w{3,10}[,:](\n|\n\n)|^\w{3,10}[,:]|\n\w{3,10}[,:])", 0),
    )
    for pattern, flags in cleanup_steps:
        text = re.sub(pattern, "", text, flags=flags)
    return text

In [None]:
# Parse the archive, clean every body and store (sender, text) pairs in raw_emails.csv.
emails_dict = parse_emails_by_sender(path="enron_mail_20150507")
skip_emails = {"no.address@enron.com", "40enron@enron.com"}
senders, emails = [], []
for sender, raw_bodies in emails_dict.items():
    if sender in skip_emails:
        continue
    # Cleaning can map different raw bodies to the same text, hence the set.
    unique_emails = set()
    for raw_body in raw_bodies:
        cleaned = extract_reply_from_email(raw_body, sender)
        if len(cleaned) > 10 and "Outlook Migration Team" not in cleaned:
            unique_emails.add(cleaned)
    for cleaned in unique_emails:
        senders.append(sender)
        emails.append(cleaned)
rows = list(zip(senders, emails))
df = pd.DataFrame(rows, columns=['sender', 'text'])
df.to_csv("raw_emails.csv")

In [None]:
import random

In [None]:
df = pd.read_csv('raw_emails.csv')
# print(df["sender"].value_counts())  # senders with the most and the fewest e-mails
d = df.to_dict()
senders = list(d["sender"].values())
texts = list(d["text"].values())

# Positive examples: each sender's own e-mails, labelled 1.
datasets = dict()
for sender, text in zip(senders, texts):
    datasets.setdefault(sender, []).append([text, 1])
numbers_of_owned_emails = {sender: len(rows) for sender, rows in datasets.items()}

# Negative examples: as many randomly chosen e-mails of other senders, labelled 0.
for sender, rows in datasets.items():
    other_texts = [text for author, text in zip(senders, texts) if sender != author]
    for _ in range(numbers_of_owned_emails[sender]):
        rows.append([random.choice(other_texts), 0])

if not os.path.exists("datasets"):
    os.mkdir("datasets")

# One CSV per sender, named after the address local part with "." replaced by "_".
for sender, rows in datasets.items():
    df = pd.DataFrame(rows, columns=['text', 'label'])
    file_stem = sender.split('@')[0].replace('.', '_')
    df.to_csv(os.path.join("datasets", f"{file_stem}.csv"))

В итоге, получаем папку datasets с датасетом для каждого работника компании, в котором размечены письма (письма от этого человека - 1, другие письма - 0). Другие письма выбираются случайным образом.

In [None]:
# Show the generated per-sender dataset files.
dataset_files = os.listdir("datasets")
print(dataset_files)

Самые распространенные слова в сообщениях.

In [None]:
from collections import Counter

# Accumulate word frequencies over every generated dataset; for each text only
# its 100 most frequent words contribute.
words_stat = dict()
for dataset in os.listdir("datasets"):
    df = pd.read_csv(f"datasets/{dataset}")
    for text in df["text"].values:
        # str.split() already splits on any whitespace, newlines included.
        # Deleting "\n" beforehand glued the last word of a line to the first
        # word of the next one, so the text is split as is.
        for word, count in Counter(text.split()).most_common(100):
            words_stat[word] = words_stat.get(word, 0) + count
print(sorted(words_stat.items(), key=lambda x: x[1], reverse=True)[:100])

## Выбор датасетов для обучения

Выберем 5 человек, у которых больше всего исходящих писем и отсутствуют подписи к письмам.

In [3]:
# Dataset files of the five senders selected for the experiments.
chosen_datasets = [
    'kate_symes.csv',
    'sally_beck.csv',
    'carol_clair.csv',
    'michelle_cash.csv',
    'chris_germany.csv',
]

In [None]:
# Print every selected dataset.
for dataset in chosen_datasets:
    dataset_frame = pd.read_csv(f"datasets/{dataset}")
    print(dataset_frame)

## Обучение моделей

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, \
precision_recall_curve, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

Будем использовать 2 способа векторизации текстов - TF-IDF и CountVectorizer

In [None]:
def tfidf(df: pd.DataFrame):
    """Return the dense TF-IDF matrix of ``df['text']``.

    A new ``TfidfVectorizer`` is fitted on all rows of ``df`` on every call.
    """
    vectorizer = TfidfVectorizer()
    sparse_features = vectorizer.fit_transform(df['text'])
    return sparse_features.toarray()


def vectorize(df: pd.DataFrame):
    """Return the dense token-count matrix of ``df['text']``.

    A new ``CountVectorizer`` is fitted on all rows of ``df`` on every call.
    """
    vectorizer = CountVectorizer()
    sparse_counts = vectorizer.fit_transform(df['text'])
    return sparse_counts.toarray()

Определяем класс для обучения и тестирования моделей. Обучающая выборка - 75% датасета (test_size=0.25). to_vec_func - выбранный способ векторизации.

In [None]:
class ModelTrainer:
    """Train classifiers on one per-sender dataset and collect their metrics.

    The dataset is read from ``datasets/<csv_path>``. Metrics of every
    ``train`` call are kept in ``train_info`` as
    ``{model_name: {vectorizer_name: metrics}}`` and can be pickled into
    ``train_dir_path``.
    """

    def __init__(self, csv_path, train_dir_path):
        """Load the dataset.

        Args:
            csv_path: File name of the dataset inside the "datasets" folder.
            train_dir_path: Directory where the metrics pickle is written.
        """
        self.dataset = pd.read_csv(os.path.join("datasets", csv_path))
        self.dataset_name = csv_path.split(".")[0]
        self.train_info = {}
        self.train_dir_path = train_dir_path

    def update_train_info(self, info, model_name, vec_name):
        """Store ``info`` under ``train_info[model_name][vec_name]``."""
        self.train_info.setdefault(model_name, {})[vec_name] = info

    def save_train_info_into_pickle(self):
        """Write ``train_info`` to ``<train_dir_path>/<dataset_name>.pickle``."""
        # makedirs also creates missing parent directories.
        os.makedirs(self.train_dir_path, exist_ok=True)
        with open(os.path.join(self.train_dir_path, f"{self.dataset_name}.pickle"), "wb") as f:
            pickle.dump(self.train_info, f)

    def count_acc(self, pred_scores, scores):
        """Return the running accuracy after each sample.

        Element ``i`` is the share of correct predictions among the first
        ``i + 1`` samples.
        """
        pred_scores = list(pred_scores)
        scores = list(scores)
        correct = 0
        res_acc = []
        for seen, (predicted, expected) in enumerate(zip(pred_scores, scores), start=1):
            if predicted == expected:
                correct += 1
            res_acc.append(correct / seen)
        return res_acc

    def plot_accuracy(self, train_pred_scores, train_scores, test_pred_scores, test_scores):
        """Plot the running accuracy on the train and test samples."""
        plt.figure(figsize=(15, 8))
        plt.title('Accuracy')
        plt.xticks([])
        plt.plot(self.count_acc(train_pred_scores, train_scores), color='r', label="Train")
        plt.plot(self.count_acc(test_pred_scores, test_scores), color='b', label="Test")
        plt.legend()
        plt.show()

    def plot_precision_recall_curve(self, y_test, y_pred):
        """Plot the precision-recall curve.

        Args:
            y_test: True labels.
            y_pred: Scores of the positive class (hard labels also work but
                give a curve with a single operating point).
        """
        precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
        plt.figure(figsize=(15, 8))
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-recall curve')
        plt.xticks(np.arange(0, 1.1, 0.1))
        plt.yticks(np.arange(0, 1.1, 0.1))
        plt.grid()
        plt.plot(recall, precision)
        plt.show()

    def plot_confusion_matrix(self, conf_matrix):
        """Display the given confusion matrix."""
        cm = ConfusionMatrixDisplay(conf_matrix)
        cm.plot()
        plt.show()

    @staticmethod
    def _positive_class_scores(model, X, hard_labels):
        """Return continuous scores of the positive class for ``X``.

        Uses ``predict_proba`` (second column of a two-column result), then
        ``decision_function``; falls back to ``hard_labels`` when the model
        offers neither.
        """
        if hasattr(model, "predict_proba"):
            proba = np.asarray(model.predict_proba(X))
            if proba.ndim == 2 and proba.shape[1] == 2:
                return proba[:, 1]
        if hasattr(model, "decision_function"):
            return model.decision_function(X)
        return hard_labels

    def train(self, model, to_vec_func):
        """Fit ``model``, plot diagnostics and record test metrics.

        Args:
            model: Unfitted estimator with ``fit`` / ``predict``.
            to_vec_func: Callable turning the dataset frame into a feature
                matrix; its ``__name__`` is used as the vectorizer key.
        """
        model_name = model.__class__.__name__
        # NOTE(review): to_vec_func receives the whole dataset, so the
        # vectorizer sees the test rows before the split — confirm whether
        # this leakage is acceptable for the experiment.
        X = to_vec_func(self.dataset)
        y = self.dataset["label"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
        model = model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_pred = model.predict(X_test)
        self.plot_accuracy(y_train_pred, y_train, y_pred, y_test)
        # The PR curve needs scores, not thresholded labels.
        self.plot_precision_recall_curve(
            y_test, self._positive_class_scores(model, X_test, y_pred)
        )
        conf_matrix = confusion_matrix(y_test, y_pred)
        self.plot_confusion_matrix(conf_matrix)
        self.update_train_info({
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            # NOTE(review): macro-averaged F1 next to binary precision/recall.
            'F1 score': f1_score(y_test, y_pred, average="macro"),
            'Confusion matrix': conf_matrix
        }, model_name, to_vec_func.__name__)

In [None]:
# Train every classifier with both vectorizers on each selected dataset.
models = [
    MultinomialNB,
    LogisticRegression,
    SVC,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
]
for dataset in chosen_datasets:
    model_trainer = ModelTrainer(dataset, train_dir_path="train")
    print(f"Start processing {dataset}")
    for position, model in enumerate(models, start=1):
        # A fresh estimator instance is created for each vectorizer.
        for to_vec_func in (vectorize, tfidf):
            model_trainer.train(model(), to_vec_func)
        print(f"Processed model {position}/{len(models)}")
    model_trainer.save_train_info_into_pickle()

In [6]:
import copy

In [None]:
def analyze_train_data(dir_path):
    """Average the stored metrics over all datasets and report the best model.

    Every ``*.pickle`` file in ``dir_path`` is expected to hold a dict of the
    form ``{model_name: {vectorizer_name: {metric: value}}}``. For each
    model/vectorizer pair the metric values are averaged across files; the
    "Confusion matrix" entry is left out. Prints the averaged metrics, the
    highest average accuracy and the name of the model that achieved it.

    Args:
        dir_path: Directory containing the metrics pickles.
    """
    collected = {}
    for file_name in os.listdir(dir_path):
        # Ignore anything that is not a metrics file (e.g. stray folders).
        if not file_name.endswith(".pickle"):
            continue
        with open(os.path.join(dir_path, file_name), 'rb') as f:
            # NOTE: pickle.load executes arbitrary code from the file; only
            # use this on pickles produced by this project.
            train_info = pickle.load(f)
        for name, per_vectorizer in train_info.items():
            for vec_name, metrics in per_vectorizer.items():
                # Vectorizer names are taken from the data, not hard-coded.
                bucket = collected.setdefault(name, {}).setdefault(vec_name, {})
                for key, value in metrics.items():
                    if key == "Confusion matrix":
                        continue
                    bucket.setdefault(key, []).append(value)

    res_data = {}
    max_score = 0
    model_name = ""
    for name, per_vectorizer in collected.items():
        res_data[name] = {}
        for vec_name, metrics in per_vectorizer.items():
            averaged = {key: np.average(values) for key, values in metrics.items()}
            res_data[name][vec_name] = averaged
            if "Accuracy" in averaged and averaged["Accuracy"] > max_score:
                max_score = averaged["Accuracy"]
                model_name = name
    print(res_data)
    print(f"Best accuracy: {max_score}")
    print(f"Best model: {model_name}")


# Aggregate and print the metrics pickled in the "train" directory.
analyze_train_data("train")

## Препроцессинг текстов

In [None]:
import nltk
from nltk.corpus import stopwords
from string import punctuation

# Download the NLTK stop-word corpus so that stopwords.words() can load it.
nltk.download("stopwords")

# English stop-word list.
stop_words = stopwords.words("english")

In [None]:
# Translation table mapping every ASCII punctuation character to a space.
_PUNCTUATION_TABLE = str.maketrans(punctuation, ' ' * len(punctuation))

# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# many non-emoji characters (e.g. CJK) — confirm that dropping them is intended.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F1E0-\U0001F1FF"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def remove_punctuation(text):
    """Replace every ASCII punctuation character in ``text`` with a space."""
    return text.translate(_PUNCTUATION_TABLE)


def text_preprocessing(text):
    """Normalise an e-mail text before vectorisation.

    Replaces URLs, ``<...>`` tags, ``{...}`` groups, e-mail addresses, digits,
    dashes/guillemets and all other non-word characters with spaces, removes
    the characters matched by the emoji pattern, lower-cases the text and
    strips surrounding whitespace. Inner runs of spaces are not collapsed.

    Args:
        text: Raw text.

    Returns:
        The normalised text.
    """
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)
    # Each "{...}" group is removed on its own. The previous class "[^>]"
    # let one match run from the first "{" to the last "}" and swallowed
    # all the text in between.
    text = re.sub(r'\{[^}]+\}', ' ', text)
    text = _EMOJI_PATTERN.sub(r'', text)
    text = re.sub(r'\S*@\S*\s?', ' ', text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = re.sub(r'-|«|»', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = text.lower()
    # Only "_" can still be affected here: it counts as a word character above.
    text = remove_punctuation(text)
    text = text.strip()
    # Stop-word filtering (with a cap of 1000 tokens) is left disabled.
    return text

In [None]:
def tfidf(df: pd.DataFrame):
    """Return the dense TF-IDF matrix of the preprocessed ``df['text']``.

    Every text goes through ``text_preprocessing`` before a new
    ``TfidfVectorizer`` is fitted on all rows.
    """
    cleaned_texts = list(map(text_preprocessing, df['text'].values))
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(cleaned_texts).toarray()


def vectorize(df: pd.DataFrame):
    """Return the dense token-count matrix of the preprocessed ``df['text']``.

    Every text goes through ``text_preprocessing`` before a new
    ``CountVectorizer`` is fitted on all rows.
    """
    cleaned_texts = list(map(text_preprocessing, df['text'].values))
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(cleaned_texts).toarray()

In [None]:
# Repeat the training with the preprocessing vectorizers; metrics go to "train_preprocessing".
models = [
    MultinomialNB,
    LogisticRegression,
    SVC,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
]
for dataset in chosen_datasets:
    model_trainer = ModelTrainer(dataset, "train_preprocessing")
    print(f"Start processing {dataset}")
    for position, model in enumerate(models, start=1):
        # A fresh estimator instance is created for each vectorizer.
        for to_vec_func in (vectorize, tfidf):
            model_trainer.train(model(), to_vec_func)
        print(f"Processed model {position}/{len(models)}")
    model_trainer.save_train_info_into_pickle()

In [None]:
# Aggregate and print the metrics pickled in the "train_preprocessing" directory.
analyze_train_data("train_preprocessing")

Исследование препроцессинга на лучшей модели SVM

In [None]:
# Raw 'Accuracy': 0.864054582489543
# Preprocessing without lemmatization 'Accuracy': 0.8566528760292869
# Save stop words + without lemmatization 0.8605192309990312
# Lemmatization + save stop words 0.8605192309990312
# Only Lower text 0.8605459869746198

В итоге, лучше всего модель работает на обычном тексте.

## Нейросети

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Query the GPU devices visible to TensorFlow.
tf.config.list_physical_devices('GPU')

In [None]:
def print_metrics(history, epochs=2):
    """Plot train/test accuracy and loss per epoch.

    Args:
        history: Object whose ``history`` dict holds the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series.
        epochs: Number of epochs used for the x axis; it has to match the
            length of the recorded series.
    """
    # Read all series up front so a missing key fails before any plotting.
    series_names = ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    curves = {name: history.history[name] for name in series_names}
    epochs_range = range(epochs)

    plt.figure(figsize=(15, 8))
    panels = (
        ('Accuracy', 'accuracy', 'val_accuracy'),
        ('Loss', 'loss', 'val_loss'),
    )
    for position, (title, train_key, test_key) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.plot(epochs_range, curves[train_key], label='Train')
        plt.plot(epochs_range, curves[test_key], label='Test')
        plt.legend()
        plt.grid()
        plt.title(title)

    plt.show()


def print_conf_matrix(labels, pred_labels):
    """Compute the confusion matrix of the two label sequences and display it."""
    matrix = confusion_matrix(labels, pred_labels)
    ConfusionMatrixDisplay(matrix).plot()
    plt.show()

In [None]:
# SimpleRNN classifier: train and evaluate on each selected dataset.
accuracy = []
precision = []
recall = []
for dataset in chosen_datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"].values, df["label"].values, test_size=0.25, random_state=1
    )

    # The vocabulary and the padding length come from the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    new_sequences = tokenizer.texts_to_sequences(X_test)
    new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length)

    model = Sequential()
    for layer in (
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
        SimpleRNN(32),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 3
    history = model.fit(
        padded_sequences,
        y_train,
        validation_data=(new_padded_sequences, y_test),
        epochs=epochs,
    )
    print_metrics(history, epochs=epochs)

    # Threshold the sigmoid outputs at 0.5 to obtain class labels.
    predictions = model.predict(new_padded_sequences)
    predicted_labels = [1 if score > 0.5 else 0 for score in predictions]
    correct = sum(1 for truth, guess in zip(y_test, predicted_labels) if truth == guess)
    print_conf_matrix(y_test, predicted_labels)

    dataset_accuracy = correct / len(predictions)
    dataset_precision = precision_score(y_test, predicted_labels)
    dataset_recall = recall_score(y_test, predicted_labels)
    accuracy.append(dataset_accuracy)
    print(f"Accuracy: {dataset_accuracy}")
    print(f"Precision: {dataset_precision}")
    precision.append(dataset_precision)
    print(f"Recall: {dataset_recall}")
    recall.append(dataset_recall)
average_accuracy = np.average(accuracy)
average_precision = np.average(precision)
average_recall = np.average(recall)
print(f"Average accuracy: {average_accuracy}")
print(f"Average precision: {average_precision}")
print(f"Average recall: {average_recall}")

In [None]:
from tensorflow.keras.layers import LSTM

In [None]:
# LSTM classifier: train and evaluate on each selected dataset.
accuracy = []
precision = []
recall = []
for dataset in chosen_datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"].values, df["label"].values, test_size=0.25, random_state=1
    )

    # The vocabulary and the padding length come from the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    new_sequences = tokenizer.texts_to_sequences(X_test)
    new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length)

    model = Sequential()
    for layer in (
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
        LSTM(32),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 5
    history = model.fit(
        padded_sequences,
        y_train,
        validation_data=(new_padded_sequences, y_test),
        epochs=epochs,
    )
    print_metrics(history, epochs=epochs)

    # Threshold the sigmoid outputs at 0.5 to obtain class labels.
    predictions = model.predict(new_padded_sequences)
    predicted_labels = [1 if score > 0.5 else 0 for score in predictions]
    correct = sum(1 for truth, guess in zip(y_test, predicted_labels) if truth == guess)
    print_conf_matrix(y_test, predicted_labels)

    dataset_accuracy = correct / len(predictions)
    dataset_precision = precision_score(y_test, predicted_labels)
    dataset_recall = recall_score(y_test, predicted_labels)
    accuracy.append(dataset_accuracy)
    print(f"Accuracy: {dataset_accuracy}")
    print(f"Precision: {dataset_precision}")
    precision.append(dataset_precision)
    print(f"Recall: {dataset_recall}")
    recall.append(dataset_recall)
average_accuracy = np.average(accuracy)
average_precision = np.average(precision)
average_recall = np.average(recall)
print(f"Average accuracy: {average_accuracy}")
print(f"Average precision: {average_precision}")
print(f"Average recall: {average_recall}")

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

In [None]:
# 1-D convolutional classifier: train and evaluate on each selected dataset.
accuracy = []
precision = []
recall = []
for dataset in chosen_datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"].values, df["label"].values, test_size=0.25, random_state=1
    )

    # The vocabulary and the padding length come from the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    new_sequences = tokenizer.texts_to_sequences(X_test)
    new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length)

    model = Sequential()
    for layer in (
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 5
    history = model.fit(
        padded_sequences,
        y_train,
        validation_data=(new_padded_sequences, y_test),
        epochs=epochs,
    )
    print_metrics(history, epochs=epochs)

    # Threshold the sigmoid outputs at 0.5 to obtain class labels.
    predictions = model.predict(new_padded_sequences)
    predicted_labels = [1 if score > 0.5 else 0 for score in predictions]
    correct = sum(1 for truth, guess in zip(y_test, predicted_labels) if truth == guess)
    print_conf_matrix(y_test, predicted_labels)

    dataset_accuracy = correct / len(predictions)
    dataset_precision = precision_score(y_test, predicted_labels)
    dataset_recall = recall_score(y_test, predicted_labels)
    accuracy.append(dataset_accuracy)
    print(f"Accuracy: {dataset_accuracy}")
    print(f"Precision: {dataset_precision}")
    precision.append(dataset_precision)
    print(f"Recall: {dataset_recall}")
    recall.append(dataset_recall)
average_accuracy = np.average(accuracy)
average_precision = np.average(precision)
average_recall = np.average(recall)
print(f"Average accuracy: {average_accuracy}")
print(f"Average precision: {average_precision}")
print(f"Average recall: {average_recall}")

In [None]:
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [None]:
# Embedding + Flatten + Dense classifier: train and evaluate on each selected dataset.
accuracy = []
precision = []
recall = []
for dataset in chosen_datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"].values, df["label"].values, test_size=0.25, random_state=1
    )

    # The vocabulary and the padding length come from the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length)
    new_sequences = tokenizer.texts_to_sequences(X_test)
    new_padded_sequences = pad_sequences(new_sequences, maxlen=max_length)

    model = Sequential()
    for layer in (
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 3
    history = model.fit(
        padded_sequences,
        y_train,
        validation_data=(new_padded_sequences, y_test),
        epochs=epochs,
    )
    print_metrics(history, epochs=epochs)

    # Threshold the sigmoid outputs at 0.5 to obtain class labels.
    predictions = model.predict(new_padded_sequences)
    predicted_labels = [1 if score > 0.5 else 0 for score in predictions]
    correct = sum(1 for truth, guess in zip(y_test, predicted_labels) if truth == guess)
    print_conf_matrix(y_test, predicted_labels)

    dataset_accuracy = correct / len(predictions)
    dataset_precision = precision_score(y_test, predicted_labels)
    dataset_recall = recall_score(y_test, predicted_labels)
    accuracy.append(dataset_accuracy)
    print(f"Accuracy: {dataset_accuracy}")
    print(f"Precision: {dataset_precision}")
    precision.append(dataset_precision)
    print(f"Recall: {dataset_recall}")
    recall.append(dataset_recall)
average_accuracy = np.average(accuracy)
average_precision = np.average(precision)
average_recall = np.average(recall)
print(f"Average accuracy: {average_accuracy}")
print(f"Average precision: {average_precision}")
print(f"Average recall: {average_recall}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, \
    recall_score, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, WeightedRandomSampler

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import os
from tqdm.notebook import tqdm
from collections import defaultdict
from time import time

In [None]:
import random

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every random number generator involved. Seeding only Python's
# `random` leaves PyTorch (weight initialisation, samplers, dropout) and
# NumPy unseeded, so runs would not be repeatable.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

In [None]:
class Bert(nn.Module):
    """Wrap a backbone model with a ReLU and a linear output layer.

    Args:
        bert: Callable backbone; invoked as ``bert(sent_id, mask,
            return_dict=False)``, the first element of its result is used.
        bert_output_dim: Size of the backbone output fed to the linear layer.
        output_dim: Size of the final output.
    """

    def __init__(self, bert, bert_output_dim, output_dim):
        super().__init__()
        self.bert = bert
        self.relu = nn.ReLU()
        self.fc = nn.Linear(bert_output_dim, output_dim)

    def forward(self, sent_id, mask):
        """Run the backbone, then ReLU, then the linear layer."""
        backbone_result = self.bert(sent_id, mask, return_dict=False)
        activated = self.relu(backbone_result[0])
        return self.fc(activated)


def plot_learning_curves(history):
    """Plot per-epoch accuracy and loss for the train and validation parts.

    Args:
        history: Mapping with 'accuracy' and 'loss' entries, each holding the
            'train' and 'val' series.
    """
    sns.set_style(style='whitegrid')
    plt.figure(figsize=(20, 14))

    panels = (('Accuracy', 'accuracy'), ('Loss', 'loss'))
    for position, (title, metric) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.title(title, fontsize=15)
        plt.plot(history[metric]['train'], label='Train')
        plt.plot(history[metric]['val'], label='Test')
        plt.xlabel('epoch', fontsize=15)
        plt.legend()

    plt.show()


def train(
    model,
    criterion,
    optimizer,
    train_loader,
    test_loader,
    num_epochs,
    lr_scheduler,
    device='cpu'
):
    """Train ``model`` for ``num_epochs`` epochs, evaluating after each one.

    Args:
        model: Model called as ``model(sent_id, mask)``; its result must
            expose a ``logits`` attribute.
        criterion: Loss applied to the squeezed logits and the float labels.
        optimizer: Optimizer stepped after every training batch.
        train_loader: Iterable of ``(sent_id, mask, y_batch)`` training batches.
        test_loader: Iterable of ``(sent_id, mask, y_batch)`` evaluation batches.
        num_epochs: Number of epochs to run.
        lr_scheduler: Scheduler stepped once per epoch, after the training pass.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(model, history)`` where ``history[metric][split]`` is the
        per-epoch list for metric in 'loss', 'accuracy', 'precision',
        'recall' and split in 'train', 'val'.
    """
    history = defaultdict(lambda: defaultdict(list))
    for epoch in tqdm(range(num_epochs), total=num_epochs):
        # Train
        start_time = time()
        model.train(True)
        train_loss = 0
        test_loss = 0
        print(f'epoch: {epoch + 1}/{num_epochs}')
        print('Train')
        # Predictions and labels of the whole epoch, used for the metrics below.
        preds, labels = np.array([], dtype=int), np.array([], dtype=int)
        for sent_id, mask, y_batch in train_loader:
            sent_id, mask, y_batch = sent_id.to(device), mask.to(device), y_batch.to(device)
            outputs = model(sent_id, mask).logits.squeeze()
            # squeeze() yields a 0-d tensor for a single-sample batch; restore the batch axis.
            if outputs.size() == torch.Size([]):
                outputs = outputs.unsqueeze(0)
            loss = criterion(outputs, y_batch.float())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_loss += np.sum(loss.detach().cpu().numpy())
            # Sigmoid of the logit thresholded at 0.5 gives the predicted class.
            y_pred = nn.Sigmoid()(outputs).detach().cpu().numpy() > 0.5
            preds = np.concatenate((preds, y_pred))
            labels = np.concatenate((labels, y_batch.detach().cpu().numpy()))
        # Mean of the per-batch losses.
        train_loss /= len(train_loader)
        train_accuracy = accuracy_score(labels, preds)
        train_precision = precision_score(labels, preds)
        train_recall = recall_score(labels, preds)
        history['loss']['train'].append(train_loss)
        history['accuracy']['train'].append(train_accuracy)
        history['precision']['train'].append(train_precision)
        history['recall']['train'].append(train_recall)
        lr_scheduler.step()
        # Evaluation pass on test_loader (reported as "validation" below).
        print('Test')
        model.train(False)
        preds, labels = np.array([], dtype=int), np.array([], dtype=int)
        for sent_id, mask, y_batch in test_loader:
            sent_id, mask, y_batch = sent_id.to(device), mask.to(device), y_batch.to(device)
            with torch.no_grad():
                outputs = model(sent_id, mask).logits.squeeze()
                if outputs.size() == torch.Size([]):
                    outputs = outputs.unsqueeze(0)
                loss = criterion(outputs, y_batch.float())
            test_loss += np.sum(loss.detach().cpu().numpy())
            y_pred = nn.Sigmoid()(outputs).detach().cpu().numpy() > 0.5
            preds = np.concatenate((preds, y_pred))
            labels = np.concatenate((labels, y_batch.detach().cpu().numpy()))
        test_loss /= len(test_loader)
        val_accuracy = accuracy_score(labels, preds)
        val_precision = precision_score(labels, preds)
        val_recall = recall_score(labels, preds)
        history['loss']['val'].append(test_loss)
        history['accuracy']['val'].append(val_accuracy)
        history['precision']['val'].append(val_precision)
        history['recall']['val'].append(val_recall)
        print(f'Epoch {epoch + 1} of {num_epochs} took {time() - start_time:.3f}s')
        print(f'  training loss (in-iteration): \t{train_loss:.6f}')
        print(f'  validation loss (in-iteration): \t{test_loss:.6f}')
        print(f'\n  training accuracy: \t\t\t{train_accuracy * 100:.2f} %')
        print(f'  validation accuracy: \t\t\t{val_accuracy * 100:.2f} %')
        print(f'\n  training precision: \t\t\t{train_precision * 100:.2f} %')
        print(f'  validation precision: \t\t{val_precision * 100:.2f} %')
        print(f'\n  training recall: \t\t\t{train_recall * 100:.2f} %')
        print(f'  validation recall: \t\t\t{val_recall * 100:.2f} %')
        # Curves are drawn only once at least two epochs have been recorded.
        if epoch > 0:
            plot_learning_curves(history)
    return model, history


def predict(model, test_batch_gen, device='cpu'):
    """Evaluate ``model`` on the given batches and report the metrics.

    Prints accuracy, precision and recall and displays the confusion matrix.
    Inference runs in evaluation mode with gradient tracking disabled; the
    model's previous train/eval mode is restored afterwards.

    Args:
        model: Model called as ``model(input_ids, attention_mask)``; its
            result must expose a ``logits`` attribute.
        test_batch_gen: Iterable of ``(input_ids, attention_mask, target)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(accuracy, precision, recall)``.
    """
    res_outputs = np.array([])
    res_targets = np.array([])

    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for inference; this avoids building the graph.
        with torch.no_grad():
            for input_ids, attention_mask, target in test_batch_gen:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                target = target.to(device)
                output = model(input_ids, attention_mask).logits.squeeze()
                # Sigmoid of the logit thresholded at 0.5 gives the predicted class.
                output = torch.sigmoid(output) > 0.5
                res_outputs = np.append(res_outputs, output.cpu().numpy())
                res_targets = np.append(res_targets,
                                        target.view(-1, 1).cpu().numpy())
    finally:
        model.train(was_training)

    # Each metric is computed once and reused for printing and returning.
    acc = accuracy_score(res_targets, res_outputs)
    prec = precision_score(res_targets, res_outputs)
    rec = recall_score(res_targets, res_outputs)
    print(f'accuracy_score: {acc}')
    print(f'precision_score: {prec}')
    print(f'recall_score: {rec}')
    print('confusion matrix')
    # print_conf_matrix draws the matrix itself and returns nothing, so its
    # result is not printed (it used to print "None").
    print_conf_matrix(res_targets, res_outputs)
    return acc, prec, rec

In [None]:
# Fine-tune DistilBERT on each selected dataset and collect the test metrics.
accuracy = []
precision = []
recall = []
for dataset in chosen_datasets:
    df = pd.read_csv(os.path.join("datasets", dataset))

    # A tokenizer and a pretrained model are loaded anew for every dataset.
    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", use_fast=False)
    bert = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased",
        output_attentions=False,
        output_hidden_states=False,
    )
    # The custom Bert wrapper is not used here; the classification head is
    # replaced by a layer with a single output logit instead.
    bert.classifier = nn.Linear(in_features=768, out_features=1, bias=True)
    model = bert

    X_train, X_test, y_train, y_test = train_test_split(
        df["text"].values, df["label"].values, test_size=0.25, random_state=1
    )

    # Tokenize both splits with the same settings.
    max_seq_len = 256
    encode_options = {"max_length": max_seq_len, "padding": True, "truncation": True}
    tokens_train = tokenizer.batch_encode_plus(X_train, **encode_options)
    tokens_test = tokenizer.batch_encode_plus(X_test, **encode_options)

    train_seq = torch.tensor(tokens_train['input_ids'])
    train_mask = torch.tensor(tokens_train['attention_mask'])
    train_y = torch.tensor(y_train)

    test_seq = torch.tensor(tokens_test['input_ids'])
    test_mask = torch.tensor(tokens_test['attention_mask'])
    test_y = torch.tensor(y_test)

    batch_size = 16

    # Training samples are drawn with weights inversely proportional to the
    # frequency of their class.
    train_data = TensorDataset(train_seq, train_mask, train_y)
    train_labels, train_counts = np.unique(y_train, return_counts=True)
    train_class_weights = [sum(train_counts) / count for count in train_counts]
    train_weights = [train_class_weights[label] for label in y_train]
    train_sampler = WeightedRandomSampler(weights=train_weights, num_samples=y_train.shape[0])
    train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

    test_data = TensorDataset(test_seq, test_mask, test_y)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

    output_dim = 1
    lr = 3e-5
    num_epochs = 3

    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.2)
    criterion = nn.BCEWithLogitsLoss()
    lr_scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer,
                                                    gamma=0.9, verbose=True)
    model, history = train(
        model,
        criterion,
        optimizer,
        train_dataloader,
        test_dataloader,
        num_epochs,
        lr_scheduler,
        device,
    )

    acc, prec, rec = predict(model, test_dataloader, device)
    accuracy.append(acc)
    precision.append(prec)
    recall.append(rec)

In [None]:
# Metrics averaged over the datasets processed in the previous cell.
average_accuracy = np.average(accuracy)
average_precision = np.average(precision)
average_recall = np.average(recall)
print(f"Average accuracy: {average_accuracy}")
print(f"Average precision: {average_precision}")
print(f"Average recall: {average_recall}")