In [1]:
import csv
from collections import defaultdict, Counter

import pandas as pd
import numpy as np

from nltk import word_tokenize, TweetTokenizer
from pathlib import Path

import vecto.embeddings

from tqdm import tqdm

In [75]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

In [96]:
import torch
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [66]:
# Root of the data tree, relative to the notebook's working directory.
DATA_DIR = Path('../data/')
# Directory holding the labelled CSV files.
DATASET_DIR = DATA_DIR / 'dataset/'
# Directory the word embeddings are loaded from.
FASTTEXT_DIR = DATA_DIR / 'embeddings/'

In [61]:
# Load both training CSVs as DataFrames.
# The paths are built from DATASET_DIR: the PATH_TO_DATASET name used previously
# is not defined in any visible cell, so this cell failed on a fresh kernel.
df_random = pd.read_csv(DATASET_DIR / 'rusentiment_random_posts.csv')
df_preselected = pd.read_csv(DATASET_DIR / 'rusentiment_preselected_posts.csv')

In [101]:
class SmallDeepNet(torch.nn.Module):
    """Feed-forward classifier: three ELU hidden layers and a linear output layer.

    Parameters
    ----------
    embedding_dim : int
        Size of each input feature vector.
    hidden_size : int
        Width of every hidden layer.
    nb_classes : int
        Number of output classes.
    """

    def __init__(self, embedding_dim, hidden_size, nb_classes):
        super().__init__()

        self.hidden_size = hidden_size
        self.nb_classes = nb_classes
        self.embedding_dim = embedding_dim

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(self.embedding_dim, self.hidden_size),
            torch.nn.ELU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ELU(),
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.ELU(),
            torch.nn.Linear(self.hidden_size, self.nb_classes),
        )

        self.criterion = torch.nn.CrossEntropyLoss()
        # The optimizer has to be created AFTER the layers are registered:
        # before that, self.parameters() is empty and torch.optim.SGD raises
        # "optimizer got an empty parameter list".
        self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01, momentum=0.9)

    def forward(self, inputs):
        """Return class probabilities (softmax over the last dimension)."""
        logits = self.classifier(inputs)
        outputs = F.softmax(logits, dim=-1)

        return outputs

    def backward(self, y_pred, target):
        """Compute the loss for ``y_pred``, back-propagate and take one SGD step.

        Parameters
        ----------
        y_pred : torch.Tensor
            Class probabilities as returned by :meth:`forward`.
        target : torch.Tensor
            Integer class indices.

        Returns
        -------
        torch.Tensor
            The scalar loss computed before the parameter update.
        """
        # forward() already applied softmax, while CrossEntropyLoss applies
        # log_softmax internally. Feeding log-probabilities makes that internal
        # log_softmax a no-op (the probabilities already sum to one), so the
        # result is the true cross-entropy rather than a doubly-softmaxed one.
        # The clamp avoids log(0) for probabilities that underflow.
        eps = torch.finfo(y_pred.dtype).tiny
        loss = self.criterion(torch.log(y_pred.clamp_min(eps)), target)

        # Zero the gradients
        self.optimizer.zero_grad()

        # perform a backward pass (backpropagation)
        loss.backward()

        # Update the parameters
        self.optimizer.step()
        return loss

In [65]:
def load_embeddings(filename):
    """Load word embeddings from ``filename`` via ``vecto.embeddings.load_from_dir``.

    Returns the loaded embeddings object. If loading raises ``EOFError`` a
    message is printed and ``None`` is returned instead; any other exception
    propagates to the caller.
    """
    try:
        return vecto.embeddings.load_from_dir(filename)
    except EOFError:
        print(f'Cannot load: {filename}')
    return None

In [88]:
def create_data_matrix_embeddings(samples, word_embeddings):
    """Turn each sample into the mean of its known word vectors.

    Parameters
    ----------
    samples : sequence of str
        Texts whose tokens are separated by single spaces.
    word_embeddings
        Object exposing ``matrix``, ``has_word(token)`` and ``get_vector(token)``.

    Returns
    -------
    X : np.ndarray, float32, one row per sample
        Mean embedding of the sample's in-vocabulary tokens; the row stays
        all-zero when none of the tokens is in the vocabulary.
    empty_samples : list of list of str
        Token lists of the samples that got no embedding.
    """
    dim = len(word_embeddings.matrix[0])
    X = np.zeros((len(samples), dim), dtype=np.float32)

    empty_samples = []
    for row, text in enumerate(samples):
        words = text.split(' ')
        vectors = [word_embeddings.get_vector(w) for w in words
                   if word_embeddings.has_word(w)]
        if not vectors:
            # No known token: keep the zero row and remember the sample.
            empty_samples.append(words)
            continue
        X[row] = np.mean(vectors, axis=0)

    print(f'Empty samples: {len(empty_samples)}')

    return X, empty_samples

In [38]:
def load_data(filename):
    """Read a labelled CSV file and return tokenised texts with their labels.

    The file must have ``text`` and ``label`` columns. Each text is tokenised
    with NLTK's ``TweetTokenizer`` and re-joined with single spaces.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    samples : list of str
        Space-joined tokenised texts, in file order.
    labels : list of str
        The ``label`` value of each row, in file order.
    """
    tokenizer = TweetTokenizer()

    samples = []
    labels = []
    # newline='' is what the csv module asks for, so that newlines embedded in
    # quoted fields are parsed correctly. The explicit encoding keeps decoding
    # independent of the platform's default locale.
    # NOTE(review): assumes the dataset files are UTF-8 encoded; confirm.
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            samples.append(' '.join(tokenizer.tokenize(row['text'])))
            labels.append(row['label'])

    return samples, labels

In [39]:
def _add_preselected_class(samples_base, labels_base, samples_extra, labels_extra,
                           target_class):
    """Return the base data extended with the extra samples labelled ``target_class``."""
    target_samples = [s for s, l in zip(samples_extra, labels_extra)
                      if l == target_class]
    return (samples_base + target_samples,
            labels_base + [target_class] * len(target_samples))


def create_training_data(mode, labels_mode, random_state=None):
    """Assemble train and test samples/labels from the dataset CSV files.

    Parameters
    ----------
    mode : str
        How the training set is composed:

        * ``'base'``          - random posts only.
        * ``'posneg'``        - random posts plus all preselected posts.
        * ``'pos'`` / ``'neg'`` / ``'neutral'`` - random posts plus the
          preselected posts of that single class.
        * ``'posneg_only'``   - preselected posts only.
        * ``'replace'``       - shuffled random posts with the last
          ``len(preselected)`` entries replaced by the preselected posts.
        * ``'debug'``         - first 2000 random posts.
        * ``'sample'``        - ``len(preselected)`` shuffled random posts.
        * ``'sample_posneg'`` - for every class of the preselected set, up to
          as many shuffled random posts of that class as the preselected set
          contains.
    labels_mode : str
        ``'base'`` keeps labels untouched; ``'neg'`` / ``'pos'`` keep only
        ``'negative'`` / ``'positive'`` and rename every other label to
        ``'rest'`` (in both train and test labels).
    random_state : int, RandomState instance or None, optional
        Forwarded to ``sklearn.utils.shuffle`` in the modes that shuffle.
        ``None`` (the default) keeps the shuffles non-deterministic.

    Returns
    -------
    samples_train, labels_train, samples_test, labels_test : list

    Raises
    ------
    ValueError
        If ``mode`` or ``labels_mode`` is unknown.
    """
    data_base_filename = DATASET_DIR.joinpath('rusentiment_random_posts.csv')
    data_posneg_filename = DATASET_DIR.joinpath('rusentiment_preselected_posts.csv')
    data_test_filename = DATASET_DIR.joinpath('rusentiment_test.csv')

    samples_base_train, labels_base_train = load_data(data_base_filename)
    samples_posneg_train, labels_posneg_train = load_data(data_posneg_filename)
    samples_test, labels_test = load_data(data_test_filename)

    print(f'Data base: {len(samples_base_train)}, {len(labels_base_train)}')
    print(f'Data posneg: {len(samples_posneg_train)},'
          f' {len(labels_posneg_train)}')
    print(f'Data test: {len(samples_test)}, {len(labels_test)}')
    # Distinct label counts for base / preselected / test, in that order.
    print(f'Labels: {len(set(labels_base_train))},'
          f' {len(set(labels_posneg_train))}, {len(set(labels_test))}')

    # Modes that add a single preselected class to the base data.
    single_class_modes = {'pos': 'positive', 'neg': 'negative', 'neutral': 'neutral'}

    if mode == 'base':
        samples_train = samples_base_train
        labels_train = labels_base_train
    elif mode == 'posneg':
        samples_train = samples_base_train + samples_posneg_train
        labels_train = labels_base_train + labels_posneg_train
    elif mode in single_class_modes:
        samples_train, labels_train = _add_preselected_class(
            samples_base_train, labels_base_train,
            samples_posneg_train, labels_posneg_train,
            single_class_modes[mode])
    elif mode == 'posneg_only':
        samples_train = samples_posneg_train
        labels_train = labels_posneg_train
    elif mode == 'replace':
        nb_replace = len(samples_posneg_train)
        samples_base_train, labels_base_train = shuffle(
            samples_base_train, labels_base_train, random_state=random_state)
        # An explicit keep-count avoids the `[:-0]` pitfall, which would drop
        # every base sample when there is nothing to replace.
        nb_keep = max(len(samples_base_train) - nb_replace, 0)
        samples_train = samples_base_train[:nb_keep] + samples_posneg_train
        labels_train = labels_base_train[:nb_keep] + labels_posneg_train
    elif mode == 'debug':
        nb_samples_debug = 2000
        samples_train = samples_base_train[:nb_samples_debug]
        labels_train = labels_base_train[:nb_samples_debug]
    elif mode == 'sample':
        nb_sample = len(samples_posneg_train)
        samples_base_train, labels_base_train = shuffle(
            samples_base_train, labels_base_train, random_state=random_state)
        samples_train = samples_base_train[:nb_sample]
        labels_train = labels_base_train[:nb_sample]
    elif mode == 'sample_posneg':
        nb_samples_by_classes = Counter(labels_posneg_train)

        samples_train = []
        labels_train = []
        for target_class, target_counts in nb_samples_by_classes.most_common():
            base_samples_of_target_class = [
                s for s, l in zip(samples_base_train, labels_base_train)
                if l == target_class]
            if base_samples_of_target_class:
                # sklearn's shuffle returns a shuffled copy and leaves its
                # argument untouched, so the result has to be kept.
                base_samples_of_target_class = shuffle(
                    base_samples_of_target_class, random_state=random_state)
            base_samples_of_target_class = \
                base_samples_of_target_class[:target_counts]

            samples_train.extend(base_samples_of_target_class)
            labels_train.extend([target_class] * len(base_samples_of_target_class))
    else:
        raise ValueError(f'Mode {mode} is unknown')

    # Label modes that collapse everything except one class into 'rest'.
    binary_targets = {'neg': 'negative', 'pos': 'positive'}
    if labels_mode in binary_targets:
        kept = binary_targets[labels_mode]
        labels_train = [lbl if lbl == kept else 'rest' for lbl in labels_train]
        labels_test = [lbl if lbl == kept else 'rest' for lbl in labels_test]
    elif labels_mode != 'base':
        raise ValueError(f'Labels mode {labels_mode} is unknown')

    return samples_train, labels_train, samples_test, labels_test

In [112]:
def score_model(model, X, y_true, labels):
    """Score ``model`` on ``X`` against ``y_true``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Feature matrix passed to ``model.predict``.
    y_true
        Encoded ground-truth labels.
    labels : array-like of str
        Class names indexed by encoded label (e.g. ``LabelEncoder.classes_``).

    Returns
    -------
    tuple of float
        ``(accuracy, f1, precision, recall)``. When ``y_true`` holds exactly two
        classes and exactly one entry of ``labels`` differs from ``'rest'``, the
        metrics are binary with that entry as the positive class; otherwise
        they are weighted averages over all classes.
    """
    y_pred = model.predict(X)

    average, pos_label = 'weighted', 1
    if len(set(y_true)) == 2:
        # np.asarray makes the comparison element-wise for plain lists too.
        # flatnonzero yields 1-D indices, so no multi-dimensional array is
        # converted to a scalar.
        non_rest = np.flatnonzero(np.asarray(labels) != 'rest')
        if len(non_rest) == 1:
            average, pos_label = 'binary', int(non_rest[0])
        # Otherwise the positive class is ambiguous: keep weighted averaging
        # instead of failing on the index conversion.

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average=average, pos_label=pos_label)
    precision = precision_score(y_true, y_pred, average=average, pos_label=pos_label)
    recall = recall_score(y_true, y_pred, average=average, pos_label=pos_label)

    return accuracy, f1, precision, recall

In [54]:
# Train on the random posts plus all preselected posts, with the original labels.
samples_train, labels_train, samples_test, labels_test = create_training_data(
    mode='posneg',
    labels_mode='base',
)

Data base: 21268, 21268
Data posneg: 6950, 6950
Data test: 2967, 2967
Labels: 5, 5, 5


In [56]:
# Sanity check: number of training samples returned by create_training_data.
len(samples_train)

28218

In [59]:
# Training-set size and per-class label counts.
print(f'Data train: {len(samples_train)}')
print(f'Labels train: {Counter(labels_train)}')

Data train: 28218
Labels train: Counter({'neutral': 11300, 'positive': 6110, 'skip': 4094, 'negative': 3654, 'speech': 3060})


In [72]:
embeddings = load_embeddings(str(FASTTEXT_DIR))
# load_embeddings returns None when loading fails with EOFError; stop here with
# an explicit message instead of an AttributeError on the next line.
if embeddings is None:
    raise RuntimeError(f'Word embeddings could not be loaded from {FASTTEXT_DIR}')
print(f'Word embeddings: {len(embeddings.vocabulary.lst_words)}')

Word embeddings: 507470


In [76]:
# Map the string labels seen in training to integer class indices.
label_encoder = LabelEncoder().fit(labels_train)
print(f'Labels: {label_encoder.classes_}')

Labels: ['negative' 'neutral' 'positive' 'skip' 'speech']


In [89]:
# Mean-embedding features and encoded labels for the training set.
X_train, empty_samples = create_data_matrix_embeddings(
    samples=samples_train,
    word_embeddings=embeddings,
)
y_train = label_encoder.transform(labels_train)
print(f'Train data: {X_train.shape}, {y_train.shape}')

Empty samples: 191
Train data: (28218, 300), (28218,)


In [121]:
# Mean-embedding features and encoded labels for the test set.
X_test, empty_samples_test = create_data_matrix_embeddings(
    samples=samples_test,
    word_embeddings=embeddings,
)
y_test = label_encoder.transform(labels_test)
print(f'Test data: {X_test.shape}, {y_test.shape}')

Empty samples: 19
Test data: (2967, 300), (2967,)


In [91]:
# Fit the feature scaler on the training features only.
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [92]:
# NOTE(review): this overwrites X_train in place, so the cell is not idempotent;
# re-running it scales already-scaled features. Run it exactly once per kernel.
X_train = scaler.transform(X_train)

In [123]:
# NOTE(review): this overwrites X_test in place, so the cell is not idempotent;
# re-running it scales already-scaled features. Run it exactly once per kernel.
X_test = scaler.transform(X_test)

In [114]:
# Candidate classifiers, all constructed with their default arguments.
models = [
    LogisticRegression(),
    LinearSVC(),
    GradientBoostingClassifier(),
    # net,
]

# Fit every model on the training features and score it on that same data.
# fit() returns the estimator itself, so it can be passed straight on.
results = [
    score_model(model.fit(X_train, y_train),  # , sample_weight=sample_weight
                X_train, y_train, label_encoder.classes_)
    for model in models
]

print('===== RESULTS =====')
for model, (accuracy_train, f1_train, precision_train, recall_train) in zip(models, results):
    model_name = type(model).__name__
    print(f'{model_name}: F1 train {f1_train:.3f}')



===== RESULTS =====
LogisticRegression: F1 train 0.631
LinearSVC: F1 train 0.623
GradientBoostingClassifier: F1 train 0.680


In [124]:
# Score the already-fitted models on the held-out test features.
results_test = []
for model in models:
    result = score_model(model, X_test, y_test, label_encoder.classes_)
    results_test.append(result)

print('===== RESULTS TEST =====')
# The metrics are unpacked into *_test names so that they are not mislabelled
# as train metrics and do not overwrite the train-cell variables.
for model, (accuracy_test, f1_test, precision_test, recall_test) in zip(models, results_test):
    model_name = model.__class__.__name__
    print(f'{model_name}: F1 test {f1_test:.3f}')

===== RESULTS TEST =====
LogisticRegression: F1 test 0.688
LinearSVC: F1 test 0.674
GradientBoostingClassifier: F1 test 0.687
