This notebook contains neural network approach. 
It works like simple AutoEncoder: encodes TF-IDF vectors with `char_wb` analyzer. After encoding result codes are fitted into simple `LogisticRegression` model with balanced weights. We filter a lot of non-expert sentences to make training process more stable and dataset less imbalanced.


#TODO
*  Try to use TripletLoss with hard samples mining
*  Try to use TripletLoss with multiple positive and negative samples and one anchor for more stable training
*  Experiment with architecture and losses (maybe -MSE(anchore, negative) will work better, because it is possible that we don't need clusters) 

In [1]:
import pandas as pd

import numpy as np
import re, nltk
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer

from torch.utils.data import Dataset, DataLoader
import random

import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score



# Preprocessing data

What was done for preprocessing?
- found patterns for expert questions
- tried to use deeppavlov pretrained model for NER, but understood that this method is too time-consuming
- tried to use spellchecker, but there were corrected too many word including good ones

In [2]:
path = '/content/data.csv'
data = pd.read_csv(path, sep='\t',  delimiter=';') 

data = data.astype({"Question": "string"})

path_l = '/content/train.csv'
y = pd.read_csv(path_l,  delimiter=';')

train = pd.merge(data, y, on='ID')

Find most popular non-experts words

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
X = np.array(train["Question"].values).reshape(-1, 1)
X, y1 = ros.fit_resample(X, train["Answer"].values.reshape(-1, 1)) 
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = X_train[:,0], X_test[:,0], y_train, y_test


N = 6
pipeline = Pipeline([
    ('features', TfidfVectorizer(ngram_range=(1, N), max_features=10000)),
    ('clf', LogisticRegression())
])

pipeline.fit(X_train, y_train)
features = {v: k for k, v in pipeline.named_steps['features'].vocabulary_.items()}
k = 300
candidates = {}
candidates[0] = [features[i] for i in np.argsort(pipeline.named_steps['clf'].coef_[0])[:k]]
candidates[1] = [features[i] for i in np.argsort(pipeline.named_steps['clf'].coef_[0])[-k:]][::-1]


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [4]:
print(candidates[0])

['это', 'назовите', 'будет', 'почему', 'сколько', 'он', 'клевер', 'клевера', 'формула', 'скольки', 'самая', 'куда', 'жизнь', 'кем', 'находиться', 'номер', 'русский', 'дата', 'пушкин', 'не', 'столица', 'зачем', 'самый', 'можно', 'сталин', 'то', 'вконтакте', 'так', 'всех', 'что', 'числа', 'чего', 'мы', 'имя', 'сейчас', 'нужно', 'автомобиля', 'нашей', 'продолжите', 'месте', 'по', 'на земле', 'на', 'какую', 'год', 'или', 'кого', 'они', 'кто', 'когда', 'брат', 'петр', 'мая', 'когда родился', 'появилось', 'ниже', 'родина', 'праздник', 'со', 'как', 'про', 'песни', 'ленин', 'свою', 'дом', 'их', 'тока', 'где', 'который', 'лучший', 'фамилия', 'популярная', 'нет', 'петербург', 'ног', 'может', 'главный', 'остров', 'картину', 'часто', 'вратарь', 'кто первым', 'белый', 'страна', 'мультфильм', 'самое', 'аниме', 'мультике', 'если', 'одном', 'бы', 'об', 'xx', 'выйграл', 'высокая гора', 'какого', 'снимался', 'певица', 'богов', 'принято', 'крещение руси', 'какой самый большой', 'именно', 'вопрос', 'телеф

In [5]:
def get_preprocess_data(X):
    """
    We found some patterns usual for expert questions:
    - Question answer at the end
    - Capital letter at the start
    - 'это' only with long dash before
    - No special symbols like '...%!+'
    - More than 6 words in question
    - Right using of comma like 'word, word'
    - No frequent non-expert words such as \['почему', 'будет', 'клевер', 'назовите', 'клевера'\] 

    Args: 
        X - dataframe with columns [id, questions]
    """

    no_expert_words = ['почему', 'будет', 'клевер', 'назовите', 'клевера', 'нельзя', 'нет', 'то', 'метро']
    regex_tokenizer = nltk.RegexpTokenizer("\w+")
    
    def normalize_text(text):
        # lowercase text
        text = str(text).lower()
        # remove non-UTF
        text = text.encode("utf-8", "ignore").decode()
        # remove punktuation symbols
        text = " ".join(regex_tokenizer.tokenize(text))
        return text
    
    questions = X.iloc[:, 1]
    vals = questions.values
    
    end_question = []
    fst_capital = []
    no_expert_eto = []
    special_signs = []
    good_special_words= []
    not_enough_words = []
    no_expert = []
    bad_comma = []

    for i in tqdm(range(len(vals))):
        x = vals[i]
        end_question.append(int(bool(re.search(r'.*?\?$', x))))
        fst_capital.append(int(bool(re.search(r'^[А-Я]', x))))
        no_expert_eto.append(int(bool(re.search(r'(\- это|[^\–] это)[^а-яА-Я]', x))))
        special_signs.append(int(bool(re.search(r'([\'?_:$!%^&*+\"”<>]|\.{1,}|,{2,}).*?\?$', x))))
        good_special_words.append(int(bool(re.search(r'[а-яА-Яa-zA-Z]\.|\'[а-яА-Яa-zA-Z]|[a-zA-Z]+:\s?[a-zA-Z]+|«.*»', x))))
        not_enough_words.append(int(bool(re.search(r'([а-яА-Яa-zA-Z]+[\s]?){6,}', x)))) 
        bad_comma.append(int(not bool(re.search(r'[а-яА-Яa-zA-Z]+\, [а-яА-Яa-zA-Z]+', x)))*int(',' in x))
        normal_x = normalize_text(x)
        no_expert.append(int(bool(any(w in normal_x.split() for w in no_expert_words[:6])))) 
     
    X['end_question'] = pd.Series(end_question)
    X['fst_capital'] = pd.Series(fst_capital)
    X['no_expert_eto'] = pd.Series(no_expert_eto)
    X['special_signs'] = pd.Series(special_signs)
    X['good_special_words'] = pd.Series(good_special_words)
    X['not_enough_words'] = pd.Series(not_enough_words)
    X['no_expert'] = pd.Series(no_expert)
    X['bad_comma'] = pd.Series(bad_comma)
    
    X.drop(X[X['end_question'] == 0].index, inplace=True)
    X.drop(X[X['fst_capital'] == 0].index, inplace=True)
    X.drop(X[(X['special_signs'] == 1) & (X['good_special_words'] == 0)].index, inplace=True)
    X.drop(X[X['bad_comma'] == 1].index, inplace=True)
    X.drop(X[X['no_expert_eto'] == 1].index, inplace=True)
    X.drop(X[X['not_enough_words'] == 0].index, inplace=True)
    X.drop(X[X['no_expert'] == 1].index, inplace=True)
    
    #X['Question'] = list(map(normalize_text, X['Question']))
    
    print(X.shape)

    return X[['ID', 'Question']]

In [6]:
X_train = train.iloc[:, :2] 
y_train = train.iloc[:, 2]


In [7]:
non_labeled_data = get_preprocess_data(X_train) # filter all simple sentences
train = pd.merge(non_labeled_data, y, on='ID')
train.shape

100%|██████████| 30000/30000 [00:01<00:00, 22643.22it/s]


(21014, 10)


(21014, 3)

# AutoEncoder

In [8]:
X, y = train["Question"].values, train["Answer"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X, y = X_train, y_train
X_train = np.array(X_train).reshape(-1, 1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
class QuestionsDataset(Dataset):
    """ 
    This class make anchors, positives, negatives samples for TripletMarginLoss
    """

    def __init__(self, data, labels, transform=None):
        """
        Args:
            data: corpus of TF-IDF vectors
            labels: labels for TF-IDF vectors
        """
        self.questions = data
        
        self.ans = labels
        self.positives = data[np.array(labels) == 1]
        self.negatives = data[np.array(labels) == 0]
        self.pos_idxs = np.array(range(labels.shape[0]))[labels == 1]
        self.neg_idxs = np.array(range(labels.shape[0]))[labels == 0]
        
        self.transform = transform
        self.pos_key = True # this param controls dataset imbalance

    def __len__(self):
        return self.ans.shape[0]

    def __getitem__(self, idx):
        # forget this idx we will make a new one
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        if self.pos_key == True:
            self.pos_key = False
            idx = random.choice(self.pos_idxs) # if self.pos_key == True, we take a expert question
            batch_anchors = torch.from_numpy(self.questions[idx])
            label = self.ans[idx] # for future experiments

            batch_positives = torch.from_numpy(np.array(random.choice(self.positives))) 
            batch_negatives = torch.from_numpy(np.array(random.choice(self.negatives)))
        else:
            self.pos_key = True
            idx = random.choice(self.neg_idxs) # if self.pos_key == False, we take a non-expert question
            batch_anchors = torch.from_numpy(self.questions[idx])
            label = self.ans[idx] # for future experiments

            batch_positives = torch.from_numpy(np.array(random.choice(self.negatives)))
            batch_negatives = torch.from_numpy(np.array(random.choice(self.positives)))

        return batch_anchors, batch_positives, batch_negatives, label

Next cell transform all text data into a TF-IDF vectors 

N, max_features must be tuned. N=10, max_features=1000 will get ~0.78 score after merging with filtered simple data

In [10]:
N = 10 # ngram_range [1;N]
max_features = 1000 
transformer = TfidfVectorizer(ngram_range=(1, N), max_features=max_features, analyzer='char_wb')
X_train1 = transformer.fit_transform(X_train).toarray()
X_test1 = transformer.transform(X_test).toarray()
X_val1 = transformer.transform(X_val).toarray()

## Architecture

In [11]:
# make model deterministic
seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

class AE(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.encoder_hidden_layer = nn.Linear(
            in_features=kwargs["input_shape"], out_features=1024
        )
        self.encoder_output_layer = nn.Linear(
            in_features=1024, out_features=512
        )
        self.dropout = torch.nn.Dropout(p=0.5, inplace=False)
        
        self.decoder_hidden_layer = nn.Linear(
            in_features=512, out_features=1024
        )
        self.decoder_output_layer = nn.Linear(
            in_features=1024, out_features=kwargs["input_shape"]
        )

    def get_codes(self, features):
        activation = self.encoder_hidden_layer(features)
        activation = torch.relu(activation) 
        code = self.encoder_output_layer(activation)
        
        code = torch.relu(code)
        return code

    def forward(self, features):
        activation = self.encoder_hidden_layer(features)
        activation_1 = torch.relu(activation) # make new non-linear representation of TF-IDF vectors
        activation = self.dropout(activation) # add noise to get stable training. Potentially can add resistance to small errors in words (But I don't think that this is good for question classification).  
        
        code = self.encoder_output_layer(activation)
        code = torch.relu(code) # make code 
        
        activation_2 = self.decoder_hidden_layer(code) # restore non-linear representation of TF-IDF vectors for optimization
        return code, activation_1, activation_2


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AE(input_shape=max_features).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

criterion = nn.MSELoss()
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2) #nn.MSELoss()

## Aplication on data

In [12]:
train_dataset = QuestionsDataset(X_train1, y_train)
test_dataset = QuestionsDataset(X_test1, y_test)
val_dataset = QuestionsDataset(X_val1, y_val)

# make dataloaders
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=1000, shuffle=True, num_workers=0, pin_memory=True
)

val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=1000, shuffle=True, num_workers=0, pin_memory=True
)


First stage: Pretrain AutoEncoder without tripletloss (Because I don't want cluster noise vectors) 

In [13]:
for epoch in range(7):
    loss = 0
    for anchor, positive, negative, labels in train_loader:
        
        anchor, positive, negative = anchor.float().to(device), positive.float().to(device), negative.float().to(device)
        
        optimizer.zero_grad()
        
        # compute reconstructions
        anchor_out, rec_anc, rec_anc1 = model(anchor)
        
        positive_out, rec_pos, _ = model(positive)
        negative_out, rec_neg, _ = model(negative)
        
        train_loss =  100 * criterion(rec_anc, rec_anc1)
        
        train_loss.backward()
 
        optimizer.step()
        
        loss += train_loss.item()
    
    val_loss = 0
    loss = loss / len(train_loader)
    
    for anchor, positive, negative, labels in val_loader:
        anchor, positive, negative = anchor.float().to(device), positive.float().to(device), negative.float().to(device)

        # compute reconstructions
        anchor_out, rec_anc, rec_anc1 = model(anchor)

        positive_out, rec_pos, _ = model(positive)
        negative_out, rec_neg, _ = model(negative)

        validation_loss = 100 * criterion(rec_anc, rec_anc1)
        val_loss += validation_loss.item()

    val_loss = val_loss / len(val_loader)
    if abs(val_loss - train_loss) > 0.0005:
        break
    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.6f}, val_loss = {:.6f}".format(epoch + 1, 7, loss, val_loss))

Second stage: Train AutoEncoder with TripletLoss to make expert/non-expert clusters

In [14]:
for epoch in range(5):
    loss = 0
    for anchor, positive, negative, labels in train_loader: 
        anchor, positive, negative = anchor.float().to(device), positive.float().to(device), negative.float().to(device)

        optimizer.zero_grad()
        
        # compute reconstructions
        anchor_out, rec_anc, rec_anc1 = model(anchor)
        positive_out, rec_pos, _ = model(positive)
        negative_out, rec_neg, _ = model(negative)

        # compute training reconstruction loss
        # NB: first triplet for codes, second triplet is experimental (I assumed that if I do clusterization before codes it will improve codes clusterization )
        train_loss =  100 * criterion(rec_anc, rec_anc1) + triplet_loss(anchor_out, positive_out, negative_out) + triplet_loss(rec_anc, rec_pos, rec_neg)
        
        # compute accumulated gradients
        train_loss.backward()
        
        # perform parameter update based on current gradients
        optimizer.step()
        
        # add the mini-batch training loss to epoch loss
        loss += train_loss.item()
    
    # compute the epoch training loss
    val_loss = 0
    loss = loss / len(train_loader)
    
    for anchor, positive, negative, labels in val_loader:
        anchor, positive, negative = anchor.float().to(device), positive.float().to(device), negative.float().to(device)

        # compute reconstructions
        anchor_out, rec_anc, rec_anc1 = model(anchor)
        positive_out, rec_pos, _ = model(positive)
        negative_out, rec_neg, _ = model(negative)

        validation_loss = 100 * criterion(rec_anc, rec_anc1) + triplet_loss(anchor_out, positive_out, negative_out) + triplet_loss(rec_anc, rec_pos, rec_neg)
        val_loss += validation_loss.item()
    
    val_loss = val_loss / len(val_loader)

    #if abs(val_loss - train_loss) > 0.06:
    #  break
    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.6f}, val_loss = {:.6f}".format(epoch + 1, 5, loss, val_loss))

epoch : 1/5, loss = 2.004185, val_loss = 2.001757
epoch : 2/5, loss = 1.999453, val_loss = 1.987203
epoch : 3/5, loss = 1.973693, val_loss = 1.969549
epoch : 4/5, loss = 1.946391, val_loss = 1.934149
epoch : 5/5, loss = 1.927349, val_loss = 1.953159


Apply simple LogisticRegression

In [15]:
# encode data with AutoEncoder for LogisticRegression

X_train_enc = model.get_codes(torch.from_numpy(X_train1).to(device).float()).detach().cpu().numpy()
X_val_enc = model.get_codes(torch.from_numpy(X_val1).to(device).float()).detach().cpu().numpy()
X_test_enc = model.get_codes(torch.from_numpy(X_test1).to(device).float()).detach().cpu().numpy()

print(X_train_enc.shape)

(13448, 512)


Train logisticRegression

In [16]:
clf1 = LogisticRegression(class_weight='balanced', random_state=1)

logisticRegression = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
logisticRegression = logisticRegression.fit(X_train_enc, y_train)


Test logisticRegression on hard samples

In [17]:
y_pred = logisticRegression.predict_proba(X_train_enc)[:,1]

y_true = y_train
print("train", roc_auc_score(y_true, y_pred), accuracy_score(y_true, logisticRegression.predict(X_train_enc)))

y_pred = logisticRegression.predict_proba(X_test_enc)[:,1]
y_true = y_test
print("test", roc_auc_score(y_true, y_pred), accuracy_score(y_true, logisticRegression.predict(X_test_enc)))

y_pred = logisticRegression.predict_proba(X_val_enc)[:,1]
y_true = y_val
print("val", roc_auc_score(y_true, y_pred), accuracy_score(y_true, logisticRegression.predict(X_val_enc)))


train 0.6934183586841189 0.6289411064842356
test 0.6564822806036892 0.6276469188674756
val 0.6537615823235923 0.6155218554861731


In [18]:
from joblib import dump, load
dump(logisticRegression, 'filename.joblib') # save logisticRegression
torch.save(model.state_dict(), 'net') # save AutoEncoder