# SemEval-2020 Task 4: Commonsense Validation and Explanation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rlwPIErsvGcYYgM6s_FT9FDn9lcHzvVU?usp=sharing)

## Getting the data

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np

In [5]:
X_train = pd.read_csv('/content/gdrive/MyDrive/data/Training/subtaskA_data_all.csv', index_col=0)
X_dev   = pd.read_csv('/content/gdrive/MyDrive/data/Dev/subtaskA_dev_data.csv'     , index_col=0)
X_test  = pd.read_csv('/content/gdrive/MyDrive/data/Test/subtaskA_test_data.csv'   , index_col=0)

y_train = pd.read_csv('/content/gdrive/MyDrive/data/Training/subtaskA_answers_all.csv', index_col=0, header=None)
y_dev   = pd.read_csv('/content/gdrive/MyDrive/data/Dev/subtaskA_gold_answers.csv'    , index_col=0, header=None)
y_test  = pd.read_csv('/content/gdrive/MyDrive/data/Test/subtaskA_gold_answers.csv'   , index_col=0, header=None)

## Not a fan of pandas, so just switch to numpy arrays

In [6]:
X_train = np.array([X_train['sent0'], X_train['sent1']]).T
y_train = np.array(y_train)
X_test = np.array([X_test['sent0'], X_test['sent1']]).T
y_test = np.array(y_test)

In [7]:
# a quick glance at the data
print(X_train[0][0])
print(X_train[0][1])
print(y_train[0][0])

print()

print(X_test[0][0])
print(X_test[0][1])
print(y_test[0][0])

He poured orange juice on his cereal.
He poured milk on his cereal.
0

He loves to stroll at the park with his bed
He loves to stroll at the park with his dog.
0


# A pre-trained LM model, just out of curiosity

In [8]:
!pip install lm-scorer
from lm_scorer.models.auto import AutoLMScorer
import torch

Collecting lm-scorer
  Downloading https://files.pythonhosted.org/packages/c8/89/d86ee877bfa51104b338a67413c76b6fde50a76c7b7e0c55c546effe97e9/lm_scorer-0.4.2-py3-none-any.whl
Collecting pip>=20.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 5.2MB/s 
Collecting transformers<3.0.0,>=2.9.0
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 16.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 23.0MB/s 
[?25hCollecting sacremoses
[?25l  

In [9]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)
scorer = AutoLMScorer.from_pretrained("gpt2-large", device=device)

sent0 = "He put the elephant in the fridge"
sent1 = "SKrskr riječii smisla neimadeju"
sent2 = "He put the turkey in the fridge"

print(sent0, scorer.sentence_score(sent0))
print(sent1, scorer.sentence_score(sent1))
print(sent2, scorer.sentence_score(sent2))

cpu


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=764.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3247202234.0, style=ProgressStyle(descr…


He put the elephant in the fridge 1.297326704875864e-18
SKrskr riječii smisla neimadeju 0.0
He put the turkey in the fridge 8.462602778441146e-18


In [10]:
def score(X):
    return np.array([ [scorer.sentence_score(x[0]), scorer.sentence_score(x[1]) ] for x in X])

def predict(X):
    scores = score(X)
    # the label is the "incorrect" sentence, so minimum probability
    return np.argmin(scores, axis=1)


In [11]:
from sklearn.metrics import accuracy_score

y_test_pred = predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
print(f"acc = {accuracy}")

acc = 0.788


# My ngram language model

In [12]:
# I had problems with the installed version
!pip install "nltk==3.4.4"
from nltk import download
download('brown')

from nltk.corpus import brown
from collections import defaultdict
from nltk.util import ngrams as util_ngrams



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [13]:
class MyCounter:

    def __init__(self, ngrams):
        # dictionary where the key is ngram length and value is a dictionary
        # where the key is the ngram and the value is occurrence count
        dictionary = defaultdict(lambda: defaultdict(lambda: 0))
        for ngram in ngrams:
            dictionary[len(ngram)][ngram] += 1
        
        self.dictionary = dictionary

    def count(self, ngram: tuple):
        """Count how many times ngram appears."""
        return self.dictionary[len(ngram)][ngram]

    def count_ngrams(self, n: int):
        """Count different ngrams of length n."""
        return len(self.dictionary[n])


class NgramLanguageModel:

    def __init__(self, n, sentences):
        m = n - 1
        mgram_generators = [util_ngrams(sentence, m) for sentence in sentences]
        ngram_generators = [util_ngrams(sentence, n) for sentence in sentences]
        mgrams = [mgram for mgram_gen in mgram_generators for mgram in mgram_gen]
        ngrams = [ngram for ngram_gen in ngram_generators for ngram in ngram_gen]
        
        self.counter = MyCounter(mgrams + ngrams)
        self.n = n

    def ngram_score(self, ngram):
        ngram_count = self.counter.count(ngram)
        mgram = tuple(ngram[:-1])
        mgram_count = self.counter.count(mgram)
        # add one smoothing
        return (ngram_count + 1) / (mgram_count + self.counter.count_ngrams(self.n))

    def log_sentence_score(self, sentence):
        probs = [self.ngram_score(ngram) for ngram in util_ngrams(sentence, self.n)]
        sentence_prob = np.sum(np.log(probs))
        return sentence_prob

    def sentence_score(self, sentence):
        return np.exp(log_sentence_score(sentence))


In [14]:
lm = NgramLanguageModel(3, brown.sents())
# sanity check
print("shit on desk:", lm.ngram_score(('shit', 'on', 'desk')))
print("at that time:", lm.ngram_score(('at', 'that', 'time')))

def log_score(X):
    log_probs = [[lm.log_sentence_score(x[0]), lm.log_sentence_score(x[1])] for x in X]
    return log_probs

def predict(X):
    log_probs = log_score(X)
    # the correct label is the "wrong" sentence, so min prob
    return np.argmin(log_probs, axis=1)

sent0 = "I put an elephant in the fridge"
sent1 = "I put a turkey in the fridge"
sent2 = "I shit on the desk"
print(sent0, lm.log_sentence_score(sent0))
print(sent1, lm.log_sentence_score(sent1))
print(sent2, lm.log_sentence_score(sent2))

shit on desk: 1.197151736588309e-06
at that time: 2.513889238040172e-05
I put an elephant in the fridge -395.43191664017684
I put a turkey in the fridge -354.52578555293314
I shit on the desk -218.1699055374355


In [15]:
# calculate accuracy on test
y_test_predicted = predict(X_test)
accuracy = accuracy_score(y_test, y_test_predicted)
print(accuracy)

0.53
