# SemEval-2020 Task 4: Commonsense Validation and Explanation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rlwPIErsvGcYYgM6s_FT9FDn9lcHzvVU?usp=sharing)

# 0. Getting started

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
%cd /content/gdrive/MyDrive/git/fer-tar

/content/gdrive/.shortcut-targets-by-id/1yfuNPQUT_G0CfNtdGxHhLi96giKyswbZ/git/fer-tar


## 0.1. Imports

In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import scripts.data as data
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


## 0.2. Getting the data

In [9]:
X_train, X_dev, X_test, y_train, y_dev, y_test = data.load_data()

## 0.3. Preprocessing

In [10]:
X_ultimate_train, y_ultimate_train = data.melt_columns(X_train, y_train)
X_ultimate_dev, y_ultimate_dev = data.melt_columns(X_dev, y_dev)
X_ultimate_test, y_ultimate_test = data.melt_columns(X_test, y_test)

In [11]:
vec = TfidfVectorizer(ngram_range=(1, 3), lowercase=True)
vec.fit(X_ultimate_train)
X_tfidf_train = vec.transform(X_ultimate_train)
X_tfidf_test = vec.transform(X_ultimate_test)

# 1. Baseline

In [12]:
clf = LogisticRegression(C=0.02)
clf.fit(X_tfidf_train, y_ultimate_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
sent0_preds = clf.predict_proba(vec.transform(X_test['sent0']))[:, 0]
sent1_preds = clf.predict_proba(vec.transform(X_test['sent1']))[:, 0]
accuracy_score(y_test, sent0_preds > sent1_preds)

0.655

In [16]:
def spy_sparse2torch_sparse(data):
    """

    :param data: a scipy sparse csr matrix
    :return: a sparse torch tensor
    """
    samples=data.shape[0]
    features=data.shape[1]
    values=data.data
    coo_data=data.tocoo()
    indices=torch.LongTensor([coo_data.row,coo_data.col])
    t=torch.sparse.FloatTensor(indices,torch.from_numpy(values).float(),[samples,features])
    return t

In [17]:
s0 = vec.transform(X_train['sent0'])
s1 = vec.transform(X_train['sent1'])
s0 = spy_sparse2torch_sparse(s0)
s1 = spy_sparse2torch_sparse(s1)

s0_test = vec.transform(X_test['sent0'])
s1_test = vec.transform(X_test['sent1'])
s0_test = spy_sparse2torch_sparse(s0_test)
s1_test = spy_sparse2torch_sparse(s1_test)

vec_size = s0.shape[1]

In [37]:
class PairwiseLogisticRegression(nn.Module):
    def __init__(self, in_p):
        super().__init__()
        self.vec_size = in_p
        self.linear1 = nn.Linear(in_p, 1)
        self.linear2 = nn.Linear(in_p, 1)
        self.linear3 = nn.Linear(2, 1)
        self.loss_f = nn.CrossEntropyLoss()

    def forward(self, X1, X2):
        p0 = nn.LeakyReLU(0.1)(self.linear1(X1)).reshape(-1, 1)
        p1 = nn.LeakyReLU(0.01)(self.linear2(X2)).reshape(-1 ,1)
        p = nn.Sigmoid()(self.linear3(torch.cat((p0, p1), dim=1)))
        return p

    def predict(self, X1, X2):
        p = self.forward(X1, X2)
        return p > 0.5

    def loss(self, X1, X2, y):
        p = self.forward(X1, X2)
        return - torch.sum(y * torch.log(p + 1e-13) + (1 - y) * torch.log(1 - p + 1e-13))

In [38]:
model = PairwiseLogisticRegression(vec_size)

In [42]:
opt = optim.Adam(model.parameters(), lr = 0.1, weight_decay=0.5)
y_tensor = torch.tensor(y_train.values)
for i in range(400):
    loss = model.loss(torch.cat((s0, s1)), torch.cat((s1, s0)), torch.cat((y_tensor, 1 - y_tensor)))
    loss.backward()
    if i % 20 == 0:
        print(loss)
    opt.step()
    opt.zero_grad()


tensor(1206.9274, grad_fn=<NegBackward>)
tensor(604.5444, grad_fn=<NegBackward>)
tensor(643.7679, grad_fn=<NegBackward>)
tensor(546.4450, grad_fn=<NegBackward>)
tensor(516.6306, grad_fn=<NegBackward>)
tensor(494.0226, grad_fn=<NegBackward>)
tensor(478.1906, grad_fn=<NegBackward>)
tensor(465.5270, grad_fn=<NegBackward>)
tensor(454.6223, grad_fn=<NegBackward>)
tensor(445.7906, grad_fn=<NegBackward>)
tensor(417.1361, grad_fn=<NegBackward>)
tensor(400.9794, grad_fn=<NegBackward>)
tensor(498.0103, grad_fn=<NegBackward>)
tensor(378.3919, grad_fn=<NegBackward>)
tensor(375.1275, grad_fn=<NegBackward>)
tensor(375.6886, grad_fn=<NegBackward>)
tensor(384.2704, grad_fn=<NegBackward>)
tensor(340.1692, grad_fn=<NegBackward>)
tensor(361.3019, grad_fn=<NegBackward>)
tensor(429.7474, grad_fn=<NegBackward>)


In [46]:
!pip install nlpaug

Collecting nlpaug
[?25l  Downloading https://files.pythonhosted.org/packages/eb/f8/b11caecdd19aa2b1b2cb46c6cbbec692abd621aad884e653e459a8546add/nlpaug-1.1.3-py3-none-any.whl (394kB)
[K     |▉                               | 10kB 18.5MB/s eta 0:00:01[K     |█▋                              | 20kB 9.6MB/s eta 0:00:01[K     |██▌                             | 30kB 7.7MB/s eta 0:00:01[K     |███▎                            | 40kB 6.8MB/s eta 0:00:01[K     |████▏                           | 51kB 4.0MB/s eta 0:00:01[K     |█████                           | 61kB 4.5MB/s eta 0:00:01[K     |█████▉                          | 71kB 4.7MB/s eta 0:00:01[K     |██████▋                         | 81kB 4.9MB/s eta 0:00:01[K     |███████▌                        | 92kB 5.1MB/s eta 0:00:01[K     |████████▎                       | 102kB 5.3MB/s eta 0:00:01[K     |█████████▏                      | 112kB 5.3MB/s eta 0:00:01[K     |██████████                      | 122kB 5.3MB/s eta 0:00

In [47]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [57]:
aug = naw.SynonymAug(aug_min=1, aug_max=10)
X_aug = aug.augment(list(X_ultimate_train.values), n=2)

'He pour out orange succus on his cereal.'

AttributeError: ignored

In [45]:
accuracy_score(y_test, y_pred.numpy())

0.572

In [None]:
print(y_train)

      1
0      
0     0
1     0
2     1
3     1
4     1
...  ..
9995  0
9996  0
9997  1
9998  1
9999  0

[10000 rows x 1 columns]


In [None]:
print(y_test)

      1
0      
1175  0
452   0
275   0
869   0
50    1
...  ..
1114  1
8     1
1945  0
1053  0
1123  0

[1000 rows x 1 columns]


In [None]:
print(X_test)

                                                  sent0                                              sent1
id                                                                                                        
1175        He loves to stroll at the park with his bed       He loves to stroll at the park with his dog.
452       The inverter was able to power the continent.           The inverter was able to power the house
275             The chef put extra lemons on the pizza.         The chef put extra mushrooms on the pizza.
869                   sugar is used to make coffee sour                 sugar is used to make coffee sweet
50    There are beautiful flowers here and there in ...  There are beautiful planes here and there in t...
...                                                 ...                                                ...
1114               If it had rained, you would got wet.           If it is a sunny day, you would got wet.
8                                 ice