In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.autograd import Variable
import pickle
import re
from collections import defaultdict
from tqdm import tqdm
from scipy.spatial.distance import cosine

from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import math
import numpy as np
import scipy

### Preparing Data

In [12]:
# text_data = []

In [13]:
# def file_to_data(path):
#     try:
#         with open(path) as f:
#             data = [[x.rstrip().split('\t')[1], x.rstrip().split('\t')[2], x.rstrip().split('\t')[0]]  for x in f.readlines()]
#     except FileNotFoundError:
#         print("File does not exist")
#         return
    
#     formatted_data = []
#     for row in data:
#         formatted_data.append([row[0], row[1], row[2]])

#     return formatted_data

In [123]:
# year = '2016'
# data_file = 'postediting.test.tsv'
# path = f'data/sts/semeval-sts/{year}/{data_file}'
# data = file_to_data(path)

In [127]:
# with open('data/Multi-30k/train.en') as f:
#     data = [x.strip() for x in f.readlines()]
# data[:5]

['Two young, White males are outside near many bushes.',
 'Several men in hard hats are operating a giant pulley system.',
 'A little girl climbing into a wooden playhouse.',
 'A man in a blue shirt is standing on a ladder cleaning a window.',
 'Two men are at the stove preparing food.']

In [135]:
# for dat in data[15000:20000]:
#     text_data.append(dat)

In [136]:
# text_data = list(set(text_data))
# len(text_data)

60265

In [4]:
# Load the pickled corpus into `text_data` (presumably the de-duplicated
# sentence list assembled by the commented-out cells above - verify).
# pickle executes arbitrary code on load: only open trusted files.
with open('data/base_text_data.pkl', 'rb') as corpus_file:
    text_data = pickle.load(corpus_file)

### Word2Vec Training

In [51]:
class Tokenizer_w2v:
    """Lower-casing, letters-only whitespace tokenizer.

    Listed punctuation is deleted outright (so ``it's`` becomes ``its`` and
    ``one,two`` becomes ``onetwo``); every remaining character outside
    ``a-z`` is turned into a space before splitting on whitespace.
    """

    def __init__(self):
        # Regex patterns for punctuation that is removed rather than
        # replaced by a space.
        self.punctuations = [r'\.', r'\.{2,}',
                             r'\!+', r'\:+', r'\;+', r'\"+', r"\'+", r'\?+', r'\,+', r'\(|\)|\[|\]|\{|\}|\<|\>']

    def clean(self, line):
        """Return ``line`` without punctuation, lower-cased, non-letters as spaces."""
        stripped = line
        for punct_re in self.punctuations:
            stripped = re.sub(punct_re, '', stripped)
        lowered = stripped.lower()
        return re.sub(r'[^a-z]', ' ', lowered)

    def tokenize(self, line):
        """Clean ``line`` and return its whitespace-separated tokens."""
        return self.clean(line).split()

In [52]:
class Review:
    """A raw text plus its ``Tokenizer_w2v`` tokens; acts as a token sequence.

    Iteration, ``len()`` and indexing all operate on the token list, while
    ``str()`` gives back the original text.
    """

    # A single tokenizer instance shared by every Review.
    tokenizer = Tokenizer_w2v()

    def __init__(self, text):
        self.text = text
        self.tokens = Review.tokenizer.tokenize(text)

    def __str__(self):
        return self.text

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        return self.tokens[idx]

    def __iter__(self):
        return iter(self.tokens)


In [None]:
# Wrap every raw text in a Review so it is tokenised once, up front.
training_data = [Review(raw_text) for raw_text in text_data]

In [53]:
class CBOW(nn.Module):
    """Continuous bag-of-words network.

    The context-word embeddings are averaged along dim 1, projected to
    ``vocab_size`` scores by a linear layer and returned as
    log-probabilities (log-softmax over dim 1).
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Map a tensor of context indices to log-probabilities over the vocabulary."""
        context_mean = self.embeddings(inputs).mean(dim=1)
        logits = self.linear(context_mean)
        return logits.log_softmax(dim=1)

In [54]:
class Word2Vec:
    """Trains CBOW word embeddings over a collection of tokenised reviews.

    Construction builds the vocabulary, the (context, focus) training pairs,
    the per-class loss weights, the ``CBOW`` model and its Adam optimizer;
    call ``train`` to fit the model.

    Tokens whose corpus frequency is not strictly above ``oov_threshold``
    share the single ``<OOV>`` entry at index 0.

    NOTE: the notebook later unpickles what is presumably an instance of this
    class, so methods only rely on attributes that ``__init__`` sets.
    """

    def __init__(self, reviews, context_size=2, embedding_size=50, oov_threshold=2, neg_sample_size=5, lr=0.001):
        """
        Args:
            reviews: collection of token sequences; iterated twice and each
                item must support ``len()`` and integer indexing.
            context_size: number of context tokens taken on each side of the
                focus token.
            embedding_size: dimensionality of the learned embeddings.
            oov_threshold: tokens with frequency <= this value map to OOV.
            neg_sample_size: multinomial draws per vocabulary entry used when
                building the loss weights (see ``negative_sampling``).
            lr: learning rate passed to Adam.
        """
        self.reviews = reviews
        self.oov_threshold = oov_threshold
        self.oov_token = '<OOV>'
        self.context_size = context_size
        self.embedding_size = embedding_size

        self.vocabulary = {self.oov_token}
        self.vocab_idx = {self.oov_token: 0}
        self.vocab_ridx = {0: self.oov_token}

        self.freq = defaultdict(int)
        # freq_dist[i] is the corpus count of vocabulary index i; slot 0
        # accumulates the counts of every out-of-vocabulary token.
        self.freq_dist = [0]
        self.total_word_count = 0

        self.build_vocabulary()

        self.BATCH_SIZE = 64
        self.neg_sample_size = neg_sample_size

        self.model = CBOW(self.N, self.embedding_size)
        self.dataset = self.build_dataset()
        self.weights = self.negative_sampling()
        self.optimizer = optim.Adam(self.model.parameters(), lr)

    def _token_index(self, token):
        """Return the vocabulary index of ``token``, or the OOV index if unknown."""
        return self.vocab_idx.get(token, self.vocab_idx[self.oov_token])

    def build_vocabulary(self):
        """Count token frequencies and index every token above ``oov_threshold``."""
        print("Building Vocabulary")
        for review in tqdm(self.reviews):
            for token in review:
                self.freq[token] += 1

        index = 1
        for token, f in self.freq.items():
            if f > self.oov_threshold:
                self.vocabulary.add(token)
                self.vocab_idx[token] = index
                self.vocab_ridx[index] = token
                self.freq_dist.append(f)
                index += 1
            else:
                # Rare tokens are folded into the OOV bucket.
                self.freq_dist[0] += f

        self.total_word_count = sum(self.freq.values())
        self.N = len(self.vocabulary)
        print(f"Total Vocabulary Size: {self.N}")

    def build_dataset(self):
        """Return a list of ``(context_indices, focus_index)`` training pairs.

        Only positions with a full window of ``context_size`` tokens on both
        sides are used, so reviews shorter than ``2 * context_size + 1``
        tokens contribute nothing.
        """
        print("Building Dataset")
        dataset = []
        for review in tqdm(self.reviews):
            for i in range(self.context_size, len(review) - self.context_size):
                focus_index = self._token_index(review[i])
                context_indices = [
                    self._token_index(review[j])
                    for j in range(i - self.context_size, i + self.context_size + 1)
                    if j != i
                ]
                dataset.append((context_indices, focus_index))

        return dataset

    def negative_sampling(self):
        """Build the per-class weights handed to ``nn.NLLLoss``.

        Every class starts at weight 1; ``len(freq_dist) * neg_sample_size``
        indices are then drawn (with replacement) from the frequency
        distribution raised to the power 0.75, and each draw adds 1 to the
        weight of the drawn class.

        All draws are taken in one ``torch.multinomial`` call instead of one
        call per draw; the samples are still i.i.d. from the same
        distribution, only the cost changes.
        """
        print("Computing Weights")
        vocab_len = len(self.freq_dist)
        # multinomial treats its input as unnormalised weights, so the L2
        # normalisation here only rescales them.
        sampling_dist = F.normalize(torch.Tensor(self.freq_dist).pow(0.75), dim=0)
        weights = torch.ones(vocab_len)

        num_draws = vocab_len * self.neg_sample_size
        if num_draws > 0:
            draws = torch.multinomial(sampling_dist, num_draws, replacement=True)
            weights += torch.bincount(draws, minlength=vocab_len).to(weights.dtype)

        return weights

    def train(self, num_epochs):
        """Run ``num_epochs`` passes of mini-batch training.

        Prints the mean batch loss of each epoch.

        Returns:
            list: the summed batch loss of every epoch, in order. (Earlier
            versions computed this list but discarded it.)
        """
        losses = []
        loss_fn = nn.NLLLoss(weight=self.weights)

        for epoch in range(num_epochs):
            print(f"Epoch {epoch}")
            net_loss = 0.0
            num_batches = 0
            for start in tqdm(range(0, len(self.dataset), self.BATCH_SIZE)):
                batch = self.dataset[start: start + self.BATCH_SIZE]

                context = torch.tensor([pair[0] for pair in batch], dtype=torch.long)
                focus = torch.tensor([pair[1] for pair in batch], dtype=torch.long)

                self.optimizer.zero_grad()
                log_probs = self.model(context)
                loss = loss_fn(log_probs, focus)
                loss.backward()
                self.optimizer.step()

                net_loss += loss.item()
                num_batches += 1

            # Guard against an empty dataset, where no batch loss exists.
            mean_loss = net_loss / num_batches if num_batches else 0.0
            print(f"Loss: {mean_loss}")
            losses.append(net_loss)

        return losses

    def get_embedding(self, word_idx):
        """Return the embedding of vocabulary index ``word_idx`` as a 1-D tensor detached from autograd."""
        index = torch.tensor([word_idx], dtype=torch.long)
        return self.model.embeddings(index).detach()[0]

    def get_closest_vector(self, _word, k):
        """Return up to ``k`` vocabulary words closest to ``_word`` by cosine distance.

        ``_word`` is lower-cased first; unknown words are looked up as OOV.
        The OOV entry (index 0) and the query word itself are never returned.
        """
        focus_index = self._token_index(_word.lower())
        focus_embedding = self.get_embedding(focus_index)

        distances = []
        for i in range(1, self.N):
            if i == focus_index:
                continue

            dist = cosine(focus_embedding, self.get_embedding(i))
            distances.append({'Word': self.vocab_ridx[i], 'Distance': dist})

        distances = sorted(distances, key=lambda x: x['Distance'])

        return [x['Word'] for x in distances[:k]]

In [55]:
# Restore a previously saved model object. Unpickling presumably needs the
# Word2Vec / CBOW / Review classes above to be defined first - verify.
# pickle executes arbitrary code on load: only open trusted files.
with open('word2vec_models/word2vec_model_35.pkl', 'rb') as model_file:
    word2vec_model_35 = pickle.load(model_file)

In [56]:
class Tokenizer_TFIDF:
    """Text cleaner / tokenizer used to prepare sentences for TF-IDF.

    ``clean_line`` applies the regex substitutions, deletes punctuation and
    lower-cases; ``clean`` additionally drops English stop words and stems
    the remaining tokens with NLTK's Snowball stemmer.
    """

    def __init__(self):
        # Applied in insertion order: URL masking, collapsing of repeated
        # non-word characters, and removal of intra-word hyphens.
        self.regex_subs = {
            r'(https?:\/\/)\S+': "0URL0",
            r'(?<!http://)www\.\S+': "0URL0",
            r'(\W)(?=\1)': '',
            r'(?<=[a-zA-Z])(\-)(?=[a-zA-Z])': ''
        }

        # Punctuation patterns that are deleted outright.
        self.punctuations = [r'\.', r'\.{2,}',
                             r'\!+', r'\:+', r'\;+', r'\"+', r"\'+", r'\?+', r'\,+', r'\(|\)|\[|\]|\{|\}|\<|\>']

        self.delimiter = '<SPLIT>'
        self.stemmer = SnowballStemmer(language='english')
        self.stop_words = set(stopwords.words('english'))

    def clean_line(self, line):
        """Apply the regex substitutions, strip punctuation and lower-case ``line``."""
        for pattern, rep in self.regex_subs.items():
            line = re.sub(pattern, rep, line)
        for pattern in self.punctuations:
            line = re.sub(pattern, '', line)
        return line.lower()

    def tokenize_line(self, line):
        """Split ``line`` on whitespace runs (and on the literal delimiter), dropping empty tokens."""
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        line = re.sub(r'\s+', self.delimiter, line)

        token_list = [x.strip()
                      for x in line.split(self.delimiter) if x.strip() != '']

        return token_list

    def clean_and_tokenize(self, lines):
        """Clean and tokenise one string, or each non-blank string of a list.

        Returns a token list for a single string and a list of token lists
        for a list input (blank entries are skipped).
        """
        if isinstance(lines, list):
            cleaned_tokens = []
            for line in lines:
                if not len(line.strip()):
                    continue
                line = self.clean_line(line)
                tokens = self.tokenize_line(line)
                cleaned_tokens.append(tokens)
            return cleaned_tokens
        else:
            line = self.clean_line(lines)
            tokens = self.tokenize_line(line)
            return tokens

    def _clean(self, line):
        """Clean ``line``, drop stop words, stem the rest and re-join with single spaces."""
        line = self.clean_line(line)

        cleaned = []
        for token in line.split():
            if token not in self.stop_words:
                cleaned.append(self.stemmer.stem(token))

        return " ".join(cleaned)

    def clean(self, lines):
        """Apply ``_clean`` to one string, or to each non-blank string of a list."""
        if isinstance(lines, list):
            cleaned_lines = []
            for line in lines:
                if not len(line.strip()):
                    continue
                line = self._clean(line)
                cleaned_lines.append(line)
            return cleaned_lines
        else:
            line = self._clean(lines)
            return line

In [57]:
class TfidfVectorizer():
    """TF-IDF vectoriser for datasets of sentence pairs.

    Each dataset item is indexable, with the left text at ``[0]`` and the
    right text at ``[1]``; texts are split on whitespace. ``fit_transform``
    learns the vocabulary and IDF weights and returns one dense float32
    matrix per side; ``transform`` reuses the fitted state and ignores
    tokens that are not in the vocabulary.

    Every fitting step now rebuilds its own state from scratch, so calling
    ``fit_transform`` again no longer mixes in term counts or vocabulary
    from the previous fit.
    """

    def __init__(self):
        self.tf_l = []
        self.tf_r = []

        self.idf = defaultdict(int)

        self.vocab = {}
        self.data_l = None
        self.data_r = None
        self.vocab_len = 0
        self.num_docs = 0

    def _count_terms(self, tokens):
        """Return ``{vocab index: count}`` for the in-vocabulary tokens of one text."""
        counts = defaultdict(int)
        for token in tokens:
            if token in self.vocab:
                counts[self.vocab[token]] += 1
        return counts

    def _build_matrix(self, tf_rows):
        """Turn per-document term counts into a dense ``(docs, vocab_len)`` TF-IDF matrix."""
        matrix = np.zeros((len(tf_rows), self.vocab_len), dtype='float32')
        for row, counts in enumerate(tf_rows):
            for idx, count in counts.items():
                matrix[row][idx] = count * self.idf[idx]
        return matrix

    def create_vocab(self, data):
        """Tokenise ``data`` and assign an index to every distinct token.

        Indices are handed out in first-seen order, scanning all left texts
        before all right texts.
        """
        print("Creating Vocabulary...")
        self.data_l = []
        self.data_r = []
        for items in data:
            self.data_l.append(items[0].split())
            self.data_r.append(items[1].split())
        self.num_docs = len(self.data_l)

        # Start from an empty vocabulary so a refit is independent of earlier fits.
        self.vocab = {}
        self.vocab_len = 0
        for side in (self.data_l, self.data_r):
            for text in side:
                for token in text:
                    if token not in self.vocab:
                        self.vocab[token] = self.vocab_len
                        self.vocab_len += 1

    def compute_tf(self):
        """Compute raw term counts for every left and right text."""
        print("Computing TF Scores...")
        # Rebuilt rather than appended to, so row i always belongs to text i.
        self.tf_l = [self._count_terms(text) for text in self.data_l]
        self.tf_r = [self._count_terms(text) for text in self.data_r]

    def compute_idf(self):
        """Compute ``log((1 + num_docs) / (1 + df)) + 1`` for every vocabulary token.

        ``df`` counts the left AND right texts containing the token, whereas
        ``num_docs`` is the number of pairs, so ``df`` can exceed ``num_docs``.
        NOTE(review): this looks unintentional, but the formula is kept as-is
        so that weights stay comparable with already-fitted vectorizers.
        """
        print("Computing IDF Scores...")
        # One pass over the corpus instead of one corpus scan per token.
        doc_freq = defaultdict(int)
        for side in (self.data_l, self.data_r):
            for text in side:
                for token in set(text):
                    doc_freq[token] += 1

        self.idf = defaultdict(int)
        for token, idx in self.vocab.items():
            df = doc_freq.get(token, 0)
            self.idf[idx] = math.log((1 + self.num_docs)/(1 + df)) + 1

    def fit_transform(self, data):
        """Fit on ``data`` and return ``(X_l, X_r)`` TF-IDF matrices for it."""
        self.create_vocab(data)
        self.compute_tf()
        self.compute_idf()
        print("Creating TF-IDF Vectors...")
        return self._build_matrix(self.tf_l), self._build_matrix(self.tf_r)

    def transform(self, data):
        """Return ``(X_l, X_r)`` for ``data`` using the fitted vocabulary and IDF.

        Tokens absent from the fitted vocabulary are ignored; the fitted
        state is not modified.
        """
        tf_l = []
        tf_r = []
        for items in data:
            tf_l.append(self._count_terms(items[0].split()))
            tf_r.append(self._count_terms(items[1].split()))

        return self._build_matrix(tf_l), self._build_matrix(tf_r)

In [58]:
# Restore the saved TF-IDF vectorizer, then create one tokenizer of each kind.
# pickle executes arbitrary code on load: only open trusted files.
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
tokenizer_tfidf, tokenizer_w2v = Tokenizer_TFIDF(), Tokenizer_w2v()

In [59]:
# Load the held-out evaluation set (indexed with 'x' and 'y' in the next cell).
# pickle executes arbitrary code on load: only open trusted files.
with open('data/test_data.pkl', 'rb') as test_file:
    test_data = pickle.load(test_file)

In [60]:
# Inputs and targets of the evaluation set.
x_test, y_test = test_data['x'], test_data['y']

In [116]:
def get_weighted_vector(word2vec_model, tfidf_model, tfidf_tokenizer, w2v_tokenizer, sent):
    """Return the TF-IDF-weighted sum of the word embeddings of ``sent``.

    Args:
        word2vec_model: object exposing ``embedding_size``, ``vocab_idx``
            (token -> index) and ``get_embedding(index)``.
        tfidf_model: object exposing ``vocab`` (token -> column) and
            ``transform(pairs)`` returning ``(X_left, X_right)``.
        tfidf_tokenizer: object whose ``clean(text)`` returns the text
            normalised for TF-IDF ('' when nothing is left).
        w2v_tokenizer: object whose ``tokenize(text)`` returns the tokens
            to look up in ``word2vec_model``.
        sent: the raw sentence.

    Returns:
        numpy.ndarray of shape ``(embedding_size,)``. It is all zeros when no
        token is known to both models.

    A token contributes only if its TF-IDF-normalised form is non-empty and
    in ``tfidf_model.vocab`` AND the token itself is in
    ``word2vec_model.vocab_idx``.
    """
    vec = np.zeros(word2vec_model.embedding_size)
    tfidf_sent = tfidf_tokenizer.clean(sent)
    # Row 0 of the left-hand matrix; explicit indexing (rather than squeeze)
    # keeps it 1-D even for a one-word vocabulary.
    tf_idf_vector = tfidf_model.transform([[tfidf_sent, '']])[0][0]

    for token in w2v_tokenizer.tokenize(sent):
        tfidf_token = tfidf_tokenizer.clean(token)
        if tfidf_token == '':
            continue

        tfidf_token_idx = tfidf_model.vocab.get(tfidf_token)
        # Look the token up in the model that was passed in, not in a
        # notebook-level global.
        w2v_token_idx = word2vec_model.vocab_idx.get(token)
        if tfidf_token_idx is None or w2v_token_idx is None:
            continue

        w2v_vec = word2vec_model.get_embedding(w2v_token_idx).detach().numpy()
        vec += w2v_vec * tf_idf_vector[tfidf_token_idx]

    return vec

In [101]:
def cosine_similarity(vec_1, vec_2):
    """Return the cosine similarity of two NumPy vectors.

    When either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero and yielding NaN.
    """
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    if denom == 0:
        return 0.0
    return vec_1 @ vec_2.T / denom

In [117]:
# Score every test pair: cosine similarity of the two weighted sentence
# vectors, multiplied by 5.
shared_args = (word2vec_model_35, vectorizer, tokenizer_tfidf, tokenizer_w2v)
preds = []
for pair in x_test:
    left_vec = get_weighted_vector(*shared_args, pair[0])
    right_vec = get_weighted_vector(*shared_args, pair[1])
    preds.append(5 * cosine_similarity(left_vec, right_vec))

Simple Word2Vec

In [115]:
# Pearson correlation between the predictions and the targets.
# NOTE(review): this cell's execution count (115) is lower than those of the
# get_weighted_vector (116) and preds (117) cells above, so the recorded
# output was presumably produced by an earlier variant of that code - confirm
# before quoting the number.
pearson_score, _ = scipy.stats.pearsonr(preds, y_test)
pearson_score

0.5782066792384322

Word2Vec with weighted TF-IDF

In [106]:
# Pearson correlation between the predictions and the targets.
# NOTE(review): this cell's execution count (106) is lower than those of the
# get_weighted_vector (116) and preds (117) cells above, so the recorded
# output was presumably produced by an earlier variant of that code - confirm
# before quoting the number.
pearson_score, _ = scipy.stats.pearsonr(preds, y_test)
pearson_score

0.6248150712858092

Word2Vec With TF-IDF Selection

In [118]:
# Pearson correlation between the predictions and the targets.
# This cell's execution count (118) directly follows the preds cell (117).
# NOTE(review): `scipy.stats` is reached through a bare `import scipy`; this
# presumably relies on the submodule already being loaded - confirm.
pearson_score, _ = scipy.stats.pearsonr(preds, y_test)
pearson_score

0.6327463478758604