## Retrieving Data from Previous Analysis

In [2]:
import pickle
import numpy as np

In [2]:
# Empty placeholders, so all three names exist before any file is read.
conversion_map, freq_map, one_to_n_map = {}, {}, {}

# Each artefact is stored as its own pickle under utils/.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
with open('utils/conversion_map.pkl', 'rb') as conversion_file:
    conversion_map = pickle.load(conversion_file)

with open('utils/freq_map.pkl', 'rb') as freq_file:
    freq_map = pickle.load(freq_file)

with open('utils/one_to_n_map.pkl', 'rb') as one_to_n_file:
    one_to_n_map = pickle.load(one_to_n_file)

In [3]:
# Display the first ten entries of freq_map.
freq_map[:10]

[('u', 271),
 ('im', 148),
 ('dont', 72),
 ('nigga', 49),
 ('niggas', 45),
 ('n', 41),
 ('pls', 35),
 ('ur', 35),
 ('lil', 32),
 ('thats', 30)]

## Testing GloVe Embeddings (Standard and Twitter, 50D)

In [7]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Paths for the standard GloVe vectors (glove.6B, 50 dimensions): the original
# text file and the word2vec-format copy that sits next to it.
glove_filename_std = '../glove_vectors/glove.6B.50d.txt'
word2vec_filename_std = f'{glove_filename_std}.word2vec'

In [16]:
# Write a word2vec-format copy of the standard GloVe file to word2vec_filename_std.
glove2word2vec(glove_filename_std, word2vec_filename_std)

  glove2word2vec(glove_filename_std, word2vec_filename_std)


(400000, 50)

In [8]:
# Paths for the Twitter GloVe vectors (glove.twitter.27B, 50 dimensions): the
# original text file and the word2vec-format copy that sits next to it.
glove_filename_twt = '../glove_vectors/glove.twitter.27B.50d.txt'
word2vec_filename_twt = f'{glove_filename_twt}.word2vec'

In [17]:
# Write a word2vec-format copy of the Twitter GloVe file to word2vec_filename_twt.
glove2word2vec(glove_filename_twt, word2vec_filename_twt)

  glove2word2vec(glove_filename_twt, word2vec_filename_twt)


(1193514, 50)

In [9]:
from gensim.models import KeyedVectors

In [10]:
# Load the standard embeddings and show the vector for the normalised form "you".
vectorizer_std = KeyedVectors.load_word2vec_format(word2vec_filename_std, binary=False)
you_vector_std = vectorizer_std.get_vector('you')
print('You: ', you_vector_std)

# Load the Twitter embeddings and show the vector for the raw form "u".
vectorizer_twt = KeyedVectors.load_word2vec_format(word2vec_filename_twt, binary=False)
u_vector_twt = vectorizer_twt.get_vector('u')
print('u: ', u_vector_twt)

You:  [-1.0919e-03  3.3324e-01  3.5743e-01 -5.4041e-01  8.2032e-01 -4.9391e-01
 -3.2588e-01  1.9972e-03 -2.3829e-01  3.5554e-01 -6.0655e-01  9.8932e-01
 -2.1786e-01  1.1236e-01  1.1494e+00  7.3284e-01  5.1182e-01  2.9287e-01
  2.8388e-01 -1.3590e+00 -3.7951e-01  5.0943e-01  7.0710e-01  6.2941e-01
  1.0534e+00 -2.1756e+00 -1.3204e+00  4.0001e-01  1.5741e+00 -1.6600e+00
  3.7721e+00  8.6949e-01 -8.0439e-01  1.8390e-01 -3.4332e-01  1.0714e-02
  2.3969e-01  6.6748e-02  7.0117e-01 -7.3702e-01  2.0877e-01  1.1564e-01
 -1.5190e-01  8.5908e-01  2.2620e-01  1.6519e-01  3.6309e-01 -4.5697e-01
 -4.8969e-02  1.1316e+00]
u:  [ 0.083004  1.0053    0.26507  -0.098562 -0.18409  -0.033368 -0.56497
  0.43791   0.20716   1.288    -0.55683  -0.19672  -4.234    -0.063966
 -0.87323  -0.080305 -0.63516  -0.21967  -0.12525   1.0197    0.20205
  0.19305   0.34885   1.0354    0.16417  -0.60181  -0.097535 -0.2351
  0.1023    0.25122  -0.43261  -0.54675  -0.053332  0.56012   0.91886
  0.19207   0.14778  -0.37423 

In [11]:
# Check which type get_vector returns (the recorded output is numpy.ndarray).
type(vectorizer_twt.get_vector('u'))

numpy.ndarray

In [12]:
def similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    This is the dot product of `vec1` and `vec2` divided by the product of
    their Euclidean norms.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

In [13]:
def comp_rs_ns(raw, norm):
    """Print the cosine similarity of `raw` and `norm` in the standard GloVe space.

    Args:
        raw: the un-normalised token.
        norm: its normalised form.

    Returns:
        None. If either word has no vector in `vectorizer_std`, a notice is
        printed and no similarity is reported.
    """
    # Only a missing vocabulary entry is expected here. Catching KeyError rather
    # than everything lets real failures (and Ctrl-C) surface instead of being
    # reported as "vector not available".
    try:
        vec_norm_std = vectorizer_std.get_vector(norm)
    except KeyError:
        print(f"Std vector not available for {norm}")
        return
    try:
        vec_raw_std = vectorizer_std.get_vector(raw)
    except KeyError:
        print(f"Std vector not available for {raw}")
        return
    print(f"Similarity b/w norm std and raw std: {similarity(vec_norm_std, vec_raw_std)}")

In [14]:
def comp_rt_nt(raw, norm):
    """Print the cosine similarity of `raw` and `norm` in the Twitter GloVe space.

    Args:
        raw: the un-normalised token.
        norm: its normalised form.

    Returns:
        None. If either word has no vector in `vectorizer_twt`, a notice is
        printed and no similarity is reported.
    """
    # Only a missing vocabulary entry is expected here. Catching KeyError rather
    # than everything lets real failures (and Ctrl-C) surface instead of being
    # reported as "vector not available".
    try:
        vec_norm_twt = vectorizer_twt.get_vector(norm)
    except KeyError:
        print(f"twt vector not available for {norm}")
        return
    try:
        vec_raw_twt = vectorizer_twt.get_vector(raw)
    except KeyError:
        print(f"Twt vector not available for {raw}")
        return
    print(f"Similarity b/w norm twt and raw twt: {similarity(vec_norm_twt, vec_raw_twt)}")

In [15]:
# For the first five freq_map entries, compare how each embedding space places the
# raw token against its first listed normalisation.
# NOTE(review): most_similar is not guarded like comp_rs_ns / comp_rt_nt are; it
# presumably raises for a raw token missing from either vocabulary — confirm.
for raw, freq in freq_map[:5]:
    candidates = list(conversion_map[raw])
    norm = candidates[0]

    print(f"Raw: {raw}, Norm: {norm}")
    neighbours_std = vectorizer_std.most_similar(raw)
    neighbours_twt = vectorizer_twt.most_similar(raw)
    print(f"Most Similar by std: {neighbours_std},\ntwt: {neighbours_twt}")
    comp_rs_ns(raw, norm)
    comp_rt_nt(raw, norm)
    print()

Raw: u, Norm: you
Most Similar by std: [('o', 0.7432858347892761), ('n', 0.7190415263175964), ('k', 0.7095504999160767), ('z', 0.7089523077011108), ('si', 0.7075910568237305), ('}', 0.6854987740516663), ('h', 0.6824235916137695), ('ne', 0.677496612071991), (']', 0.6725263595581055), ('ti', 0.6685909628868103)],
twt: [('b', 0.9368361234664917), ('r', 0.9109147787094116), ('m', 0.8701779246330261), ('l', 0.867573082447052), ('f', 0.866772472858429), ('>', 0.8398881554603577), ('i', 0.8337498307228088), ('s', 0.8293094038963318), ('<', 0.8248172998428345), ('w', 0.8073067665100098)]
Similarity b/w norm std and raw std: 0.4276062846183777
Similarity b/w norm twt and raw twt: 0.7801663279533386

Raw: im, Norm: i'm
Most Similar by std: [('tirtzu', 0.7494168877601624), ('breisgau', 0.6679399609565735), ('ried', 0.6625338792800903), ('und', 0.6576429009437561), ('tz', 0.6549264788627625), ('der', 0.6433403491973877), ('ist', 0.6413055658340454), ('winkl', 0.6337300539016724), ('vmkabat', 0.631

In [15]:
def _safe_div(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate(raw, gold, pred, file_path=None, ignCaps=False, verbose=False):
    """Score predicted normalisations against gold ones and print the metrics.

    Args:
        raw: list of sentences, each a list of raw tokens.
        gold: list of sentences with the gold normalisation of every token.
        pred: list of sentences with the predicted normalisation of every token.
        file_path: name of a log file inside 'v1_verbose/' that receives one line
            per false positive and per false negative where the prediction
            changed the raw token. With None (the default) nothing is written.
        ignCaps: lower-case raw, gold and predicted words before comparing.
        verbose: print (raw, gold, pred) for every token that is not counted
            as a false negative.

    Returns:
        None. The metrics are printed. If `gold` and `pred` disagree on the
        number of sentences, or a sentence differs in length, an error is
        printed and evaluation stops.
    """
    import os  # local import: only this function needs it

    cor = 0
    changed = 0
    total = 0
    tp = 0
    fp = 0
    fn = 0
    new_raws = 0
    folder_path = 'v1_verbose/'
    log_lines = []  # buffered so the log file is opened once, not once per token

    def flush_log():
        # Nothing is created when logging is disabled or there is nothing to report.
        if file_path is None or not log_lines:
            return
        os.makedirs(folder_path, exist_ok=True)
        with open(folder_path + file_path, 'a', encoding='utf-8') as f:
            f.writelines(log_lines)

    if len(gold) != len(pred):
        print('Error: gold normalization contains a different numer of sentences(' + str(len(gold)) + ') compared to system output(' + str(len(pred)) + ')')
        return

    for sentRaw, sentGold, sentPred in zip(raw, gold, pred):
        if len(sentGold) != len(sentPred):
            print('Error: a sentence has a different length in you output, check the order of the sentences')
            flush_log()  # keep what was collected before the mismatch
            return
        for wordRaw, wordGold, wordPred in zip(sentRaw, sentGold, sentPred):
            if ignCaps:
                wordRaw = wordRaw.lower()
                wordGold = wordGold.lower()
                wordPred = wordPred.lower()
            if wordRaw != wordGold:
                changed += 1
            if wordGold == wordPred:
                cor += 1
            if wordRaw != wordGold and wordPred == wordGold:
                tp += 1
            if wordRaw == wordGold and wordPred != wordGold:
                fp += 1
                log_lines.append(f"FP| Raw: {' '.join(sentRaw)}: {wordRaw} -> {wordPred}\n")
            if wordRaw != wordGold and wordPred != wordGold:
                fn += 1
                if wordRaw == wordPred:
                    # The token needed normalising but was left untouched.
                    new_raws += 1
                else:
                    log_lines.append(f"FN| Raw: {' '.join(sentRaw)}: {wordRaw} -> {wordPred}, Gold: {wordGold}\n")
            elif verbose:
                print(wordRaw, wordGold, wordPred)
            total += 1

    flush_log()

    # Each ratio falls back to 0.0 when its denominator is zero (empty input, no
    # token needing normalisation, or no true positives), so the summary below is
    # always printed instead of raising ZeroDivisionError.
    accuracy = _safe_div(cor, total)
    lai = _safe_div(total - changed, total)
    err = _safe_div(accuracy - lai, 1 - lai)
    recall = _safe_div(tp, tp + fn)
    precision = _safe_div(tp, tp + fp)
    f1 = _safe_div(2 * precision * recall, precision + recall)

    print('Baseline acc.(LAI): {:.2f}'.format(lai * 100))
    print('Accuracy:           {:.2f}'.format(accuracy * 100))
    print('ERR:                {:.2f}'.format(err * 100))
    print('Precision:          {:.2f}'.format(precision * 100))
    print('Recall:             {:.2f}'.format(recall * 100))
    print('F1:                 {:.2f}'.format(f1 * 100))
    print(f"Total: {total}, TP: {tp}, FP: {fp}, FN: {fn}, New words: {new_raws}")

## Trying something with Transformers
1. Find a candidate in the mapping; if one is found, great.
2. Otherwise, try a masked language model to replace the word.

I tried this manually, and it looks like it will be a pretty bad approach.

**But we can try something else,**
1. We can use a spell checker and run the raw word through it.
2. Then we can use the masked language model of a transformer and *compare its best results with the result from the spell checker*.

In [3]:
def loadNormData(path):
    """Read a tab-separated normalisation file into parallel raw/gold sentence lists.

    Each non-empty line holds a raw token, optionally followed by a tab and its
    gold normalisation (a missing gold column is read as ''). An empty line
    ends the current sentence.

    Args:
        path: path of the file to read (decoded as UTF-8).

    Returns:
        (rawData, goldData): two lists of sentences, each sentence a list of
        strings, aligned token by token.

    Raises:
        ValueError: if a line contains more than two tab-separated fields.
    """
    rawData = []
    goldData = []
    curSent = []

    # The context manager closes the file; the explicit encoding keeps non-ASCII
    # text readable whatever the platform default is.
    with open(path, encoding='utf-8') as normFile:
        for line in normFile:
            tok = line.strip().split('\t')

            if tok == [''] or tok == []:
                rawData.append([x[0] for x in curSent])
                goldData.append([x[1] for x in curSent])
                curSent = []

            else:
                if len(tok) > 2:
                    # Raised directly: no `err` helper is defined in the visible cells.
                    raise ValueError('erroneous input, line:\n' + line + '\n in file ' + path + ' contains more then two elements')
                if len(tok) == 1:
                    tok.append('')
                curSent.append(tok)

    # in case file does not end with newline
    if curSent != []:
        rawData.append([x[0] for x in curSent])
        goldData.append([x[1] for x in curSent])
    return rawData, goldData

In [4]:
# English MultiLexNorm splits: parallel raw / gold token lists per sentence.
en_data_dir = 'Data/multilexnorm/data/en/'
train_raw, train_gold = loadNormData(en_data_dir + 'train.norm')
dev_raw, dev_gold = loadNormData(en_data_dir + 'dev.norm')

In [5]:
# counts[raw_word][gold_word] = number of times raw_word carries that gold
# normalisation in the training data.
counts = {}
for sentRaw, sentGold in zip(train_raw, train_gold):
    for wordRaw, wordGold in zip(sentRaw, sentGold):
        gold_counts = counts.setdefault(wordRaw, {})
        gold_counts[wordGold] = gold_counts.get(wordGold, 0) + 1

In [6]:
from transformers import pipeline, AutoTokenizer



### Using the Cardiff NLP `Twitter-roberta-base` Model

In [4]:
# Fill-mask pipeline for the Cardiff NLP Twitter RoBERTa checkpoint.
# NOTE(review): device=0 presumably pins the pipeline to the first GPU — confirm this
# notebook is only run on a CUDA machine. MODEL is reassigned by the BERT cell below.
MODEL = "cardiffnlp/twitter-roberta-base"
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL, device=0)
# Separate tokenizer object loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading: 100%|██████████| 565/565 [00:00<00:00, 548kB/s]
Downloading: 100%|██████████| 501M/501M [02:14<00:00, 3.73MB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.24MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 723kB/s]


### Using the `BERT BASE Uncased` Model

In [7]:
# Fill-mask pipeline for the uncased BERT base checkpoint. The commented-out
# transformer step in normalize_v1 refers to this `unmasker`.
# NOTE(review): device=0 presumably requires a GPU — confirm.
MODEL = "bert-base-uncased"
unmasker = pipeline("fill-mask", model=MODEL, device=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def preprocess(text):
    """Join a token list into one space-separated string, masking mentions and links.

    A token that starts with '@' and is longer than one character becomes
    '@user'. A token that starts with 'http' becomes 'http'. Every other token
    is kept unchanged.
    """
    def mask(tok):
        if tok.startswith('@') and len(tok) > 1:
            return '@user'
        if tok.startswith('http'):
            return 'http'
        return tok

    return " ".join(mask(tok) for tok in text)

In [14]:
def get_best_normalization(token, c_cmap, c_trs):
    scores = {}

    scores[token] = 1
    if len(c_cmap) != 0:
        for i in range(len(c_cmap)):  
            scores[c_cmap[i]] = 3*(len(c_cmap) - i)

    mul_factor = 2
    for token in c_trs:
        if token in scores.keys():
            scores[token] *= mul_factor
        
    sorted_options = [norm for norm, f in sorted(scores.items(), key=lambda item: -item[1])]
    return sorted_options[0]

In [24]:
def normalize_v1(t):
    '''Normalise a token list with the static `counts` map (transformer step disabled).'''
    normalized = list(t)

    for position, token in enumerate(t):
        # Static-map candidates for this token, most frequent gold form first.
        gold_counts = counts.get(token, {})
        c_cmap = sorted(gold_counts, key=lambda cand: -gold_counts[cand])

        # Transformer candidates are currently switched off. The disabled step was:
        # masked_text = t[:position] + ['[MASK]'] + t[(position + 1):]
        # candidates = unmasker(preprocess(masked_text))
        # c_trs = [c['token_str'].strip() for c in candidates]
        c_trs = []

        # Combine the raw token with both candidate lists and keep the winner.
        normalized[position] = get_best_normalization(token, c_cmap, c_trs)

    return normalized

In [11]:
# Sanity check on the first dev sentence: print the raw tokens, then display the
# normalised ones.
print(f"raw: {dev_raw[0]}")
normalize_v1(dev_raw[0])

raw: ['@cdutra5', 'bruh', 'get', 'out', 'yo', 'feelings', 'lol']


['@cdutra5', 'brother', 'get', 'out', 'your', 'feelings', 'lol']

In [27]:
# Normalise every dev sentence; pred stays aligned with dev_raw / dev_gold.
pred = [normalize_v1(sentence) for sentence in dev_raw]

### MFR Score (Baseline 2)

In [28]:
# Score the dev predictions; per-token FP/FN details go to the "mfr" log under v1_verbose/.
evaluate(dev_raw, dev_gold, pred, "mfr")

Baseline acc.(LAI): 93.10
Accuracy:           97.37
ERR:                61.93
Precision:          91.88
Recall:             67.93
F1:                 78.11
Total: 9169, TP: 430, FP: 38, FN: 203, New words: 190


MFR with the **new counts map**, which has only *conversions for words that should be normalized*. The mistake here is that it creates a lot of false positives by overfitting on the static map data. Hence it's not a good approach.

In [90]:
# NOTE(review): the output recorded below ("raw: ..., pred: ...") is not a format that the
# evaluate() defined in this notebook prints — presumably this cell was last run against an
# earlier version of the function; re-run before relying on it.
evaluate(dev_raw, dev_gold, pred)

raw: rt, pred: retweet
raw: i, pred: in
raw: were, pred: wear
raw: rt, pred: retweet
raw: k, pred: 
raw: a, pred: ass
raw: rt, pred: retweet
raw: rt, pred: retweet
raw: 4, pred: for
raw: da, pred: the
raw: no, pred: know
raw: every, pred: ever
raw: rt, pred: retweet
raw: be, pred: but
raw: i, pred: in
raw: m, pred: am
raw: rt, pred: retweet
raw: your, pred: you're
raw: i, pred: in
raw: a, pred: ass
raw: i, pred: in
raw: your, pred: you're
raw: rt, pred: retweet
raw: i, pred: in
raw: i, pred: in
raw: rt, pred: retweet
raw: a, pred: ass
raw: every, pred: ever
raw: o, pred: off
raw: be, pred: but
raw: its, pred: it's
raw: i, pred: in
raw: ohhh, pred: oh
raw: rt, pred: retweet
raw: rt, pred: retweet
raw: be, pred: but
raw: a, pred: ass
raw: i, pred: in
raw: i, pred: in
raw: order, pred: 
raw: a, pred: ass
raw: rt, pred: retweet
raw: 4, pred: for
raw: rt, pred: retweet
raw: i, pred: in
raw: a, pred: ass
raw: a, pred: ass
raw: i, pred: in
raw: i, pred: in
raw: i, pred: in
raw: rt, pred: retw

Making some hyperparameter changes so as to improve the score with the transformer

Results with using the `BERT-BASE-UNCASED` Model

In [17]:
# Score the dev predictions; per-token FP/FN details go to the "trs_mf2" log under v1_verbose/.
evaluate(dev_raw, dev_gold, pred, "trs_mf2") #mul_factor = 2

Baseline acc.(LAI): 93.10
Accuracy:           97.40
ERR:                62.40
Precision:          91.58
Recall:             68.72
F1:                 78.52
Total: 9169, TP: 435, FP: 40, FN: 198, New words: 187


In [21]:
# NOTE(review): pred_new is not assigned in any visible cell — presumably it is the result
# of running the normaliser a second time over `pred`; add the cell that builds it so the
# notebook survives Restart & Run All.
evaluate(dev_raw, dev_gold, pred_new, "trs_x2_mf2") # running the normalisation on the semi-normalised batch

Baseline acc.(LAI): 93.10
Accuracy:           97.38
ERR:                62.09
Precision:          91.54
Recall:             68.40
F1:                 78.30
Total: 9169, TP: 433, FP: 40, FN: 200, New words: 186


In [51]:
# NOTE(review): the recorded output lacks the "New words" field that evaluate() prints, so
# this result presumably predates the current evaluate(). The mul_factor = 3 setting is not
# visible in any code cell.
evaluate(dev_raw, dev_gold, pred) #mul_factor = 3

Baseline acc.(LAI): 93.10
Accuracy:           97.21
ERR:                59.56
Precision:          91.07
Recall:             66.03
F1:                 76.56
Total: 9169, TP: 418, FP: 41, FN: 215


Increasing the `mul_factor` we can see that the 