In [None]:
import torch
from urllib.request import urlretrieve
print(torch.cuda.is_available())

# Download pre-trained InferSent model
url = 'https://dl.fbaipublicfiles.com/infersent/infersent1.pkl'
urlretrieve(url, 'infersent1.pkl')

# Download GloVe embeddings (this will take some time)
!wget http://nlp.stanford.edu/data/glove.6B.zip

# Unzip GloVe embeddings
!unzip glove.6B.zip -d glove


True
--2024-11-11 20:37:55--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-11 20:37:55--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-11 20:37:55--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


In [None]:
import numpy as np
import time
import torch
import torch.nn as nn

class InferSent(nn.Module):
    """Bidirectional-LSTM sentence encoder with mean or max pooling over time.

    The encoder maps a batch of word-vector sequences to one fixed-size vector
    per sentence (size ``2 * enc_lstm_dim``). Word vectors are looked up in
    ``self.word_vec``, which is filled from a text file of embeddings (one
    ``word v1 v2 ...`` entry per line) through the ``build_vocab*`` methods.

    Config keys read by ``__init__``: ``bsize``, ``word_emb_dim``,
    ``enc_lstm_dim``, ``pool_type`` (``"mean"`` or ``"max"``), ``dpout_model``
    and, optionally, ``version`` (1 or 2, default 1).
    """

    def __init__(self, config):
        super().__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        # Attribute name `enc_lstm` is part of the checkpoint's state-dict keys.
        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, dropout=self.dpout_model)

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

    def is_cuda(self):
        """Return True when the LSTM parameters live on a CUDA device."""
        return self.enc_lstm.bias_hh_l0.data.is_cuda

    def forward(self, sent_tuple):
        """Encode a padded batch.

        Args:
            sent_tuple: ``(sent, sent_len)`` where ``sent`` is a float tensor
                laid out as seqlen x batch x word_emb_dim and ``sent_len`` is
                an array of the true (unpadded) length of each batch column.

        Returns:
            Tensor of shape batch x (2 * enc_lstm_dim).

        Raises:
            ValueError: if ``self.pool_type`` is neither "mean" nor "max".
        """
        sent, sent_len = sent_tuple
        sent_len = np.asarray(sent_len)
        # Follow the input's device instead of assuming CUDA.
        device = sent.device

        # Sort by decreasing length (required by pack_padded_sequence) and
        # remember how to undo the permutation.
        idx_sort = np.argsort(-sent_len)
        sent_len_sorted = sent_len[idx_sort]
        idx_unsort = np.argsort(idx_sort)

        sent = sent.index_select(
            1, torch.as_tensor(idx_sort, dtype=torch.long, device=device))

        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(
            sent, torch.as_tensor(sent_len_sorted, dtype=torch.int64))
        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]

        # Un-sort by length
        sent_output = sent_output.index_select(
            1, torch.as_tensor(idx_unsort, dtype=torch.long, device=device))

        # Pooling
        if self.pool_type == "mean":
            # Padded steps are zeros after pad_packed_sequence, so the sum over
            # time divided by the true length is the mean over real tokens.
            # No squeeze here: the time reduction already yields batch x dim,
            # and squeezing would break a batch holding a single sentence.
            lengths = torch.as_tensor(np.ascontiguousarray(sent_len),
                                      dtype=sent_output.dtype,
                                      device=sent_output.device).unsqueeze(1)
            emb = torch.sum(sent_output, 0) / lengths
        elif self.pool_type == "max":
            if not self.max_pad:
                # Keep zero (padding) entries from winning the max.
                sent_output[sent_output == 0] = -1e9
            emb = torch.max(sent_output, 0)[0]
        else:
            raise ValueError('Unknown pool_type: %r' % (self.pool_type,))

        return emb

    def set_w2v_path(self, w2v_path):
        """Remember the path of the word-vector text file."""
        self.w2v_path = w2v_path

    def get_word_dict(self, sentences, tokenize=True):
        """Return a dict whose keys are every word in `sentences` plus bos/eos.

        Values are empty strings; only the keys are used by callers here.
        """
        # create vocab of words
        word_dict = {}
        for s in sentences:
            for word in (self.tokenize(s) if tokenize else s.split()):
                word_dict.setdefault(word, '')
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        """Load vectors for the words in `word_dict` from the w2v file."""
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with w2v vectors
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word_dict:
                    word_vec[word] = np.fromstring(vec, sep=' ')
        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
        return word_vec

    def get_w2v_k(self, K):
        """Load the first K vectors of the w2v file, plus bos/eos if present.

        After the first K lines the file is scanned only for the bos/eos
        tokens, and reading stops as soon as both have been found.
        """
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with k first w2v vectors
        markers = (self.bos, self.eos)
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for n_read, line in enumerate(f):
                word, vec = line.split(' ', 1)
                # Strict '<' so exactly K leading entries are kept (not K + 1).
                if n_read < K or word in markers:
                    word_vec[word] = np.fromstring(vec, sep=' ')
                if n_read + 1 >= K and all(w in word_vec for w in markers):
                    break
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        """Build `self.word_vec` from the words occurring in `sentences`."""
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))

    # build w2v vocab with k most frequent words
    def build_vocab_k_words(self, K):
        """Build `self.word_vec` from the first K entries of the w2v file."""
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        # Report what was actually loaded (file may be shorter than K, and
        # bos/eos may have been added on top).
        print('Vocab size : %s' % (len(self.word_vec)))

    def update_vocab(self, sentences, tokenize=True):
        """Add vectors for words in `sentences` that are not in the vocab yet."""
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # keep only new words
        word_dict = {w: '' for w in word_dict if w not in self.word_vec}

        # update vocabulary
        if word_dict:
            new_word_vec = self.get_w2v(word_dict)
            self.word_vec.update(new_word_vec)
        else:
            new_word_vec = {}
        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))

    def get_batch(self, batch):
        """Stack word vectors into a zero-padded seqlen x batch x dim tensor.

        `batch` must be ordered by decreasing length: the first sentence
        fixes the padded sequence length.
        """
        # sent in batch in decreasing order of lengths
        # batch: (bsize, max_len, word_dim)
        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))

        for i in range(len(batch)):
            for j in range(len(batch[i])):
                embed[j, i, :] = self.word_vec[batch[i][j]]

        return torch.FloatTensor(embed)

    def tokenize(self, s):
        """Tokenize `s` with NLTK's word_tokenize (Moses-like tweak for v2)."""
        from nltk.tokenize import word_tokenize
        if self.moses_tok:
            s = ' '.join(word_tokenize(s))
            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
            return s.split()
        else:
            return word_tokenize(s)

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        """Tokenize, drop out-of-vocabulary words and sort by length.

        Returns:
            ``(sentences, lengths, idx_sort)``: a 1-D object array of token
            lists sorted by decreasing length, the matching lengths, and the
            permutation that was applied (so callers can undo it).
        """
        sentences = [[self.bos] + (self.tokenize(s) if tokenize else s.split())
                     + [self.eos] for s in sentences]
        n_w = np.sum([len(x) for x in sentences])

        # filters words without w2v vectors
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. '
                              'Replacing by "%s"..' % (sentences[i], i, self.eos))
                s_f = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences])
        n_wk = np.sum(lengths)
        if verbose:
            print('Nb words kept : %s/%s (%.1f%s)' % (
                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))

        # sort by decreasing length
        idx_sort = np.argsort(-lengths)
        lengths = lengths[idx_sort]
        # Fill an object array element by element: np.array() on token lists
        # of different lengths raises ValueError on recent NumPy releases.
        sorted_sentences = np.empty(len(sentences), dtype=object)
        for rank, src in enumerate(idx_sort):
            sorted_sentences[rank] = sentences[src]

        return sorted_sentences, lengths, idx_sort

    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        """Encode raw sentences into a (n_sentences, 2 * enc_lstm_dim) array.

        Rows of the result are in the same order as `sentences`. An empty
        input yields an array with zero rows.
        """
        if len(sentences) == 0:
            return np.zeros((0, 2 * self.enc_lstm_dim), dtype=np.float32)

        tic = time.time()
        sentences, lengths, idx_sort = self.prepare_samples(
                        sentences, bsize, tokenize, verbose)

        embeddings = []
        for stidx in range(0, len(sentences), bsize):
            batch = self.get_batch(sentences[stidx:stidx + bsize])
            if self.is_cuda():
                batch = batch.cuda()
            with torch.no_grad():
                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
            embeddings.append(batch)
        embeddings = np.vstack(embeddings)

        # unsort
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if self.is_cuda() else 'cpu', bsize))
        return embeddings

--2024-11-11 18:49:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-11 18:49:00--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-11 18:49:01--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

  checkpoint = torch.load('/content/InferSent/infersent1.pkl', map_location=device)


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
def load_infersent_model(model_path='/content/InferSent/infersent1.pkl',
                         glove_path='/content/InferSent/glove/glove.6B.300d.txt',
                         vocab_size=100000,
                         pool_type='mean'):
    """Build an InferSent encoder, load its weights and its GloVe vocabulary.

    Args:
        model_path: Checkpoint (state dict) to load into the encoder. Pass
            ``None`` to skip loading and keep randomly initialised weights.
        glove_path: Text file of GloVe vectors used for the vocabulary.
        vocab_size: Number of leading GloVe entries to load (the ``K`` passed
            to ``build_vocab_k_words``).
        pool_type: Pooling mode handed to ``InferSent``.
            NOTE(review): a later cell in this notebook builds the model with
            'max' -- confirm which pooling the checkpoint expects.

    Returns:
        The configured ``InferSent`` instance.
    """
    params_model = {
        'word_emb_dim': 300,
        'bsize': 64,
        'enc_lstm_dim': 2048,
        'pool_type': pool_type,
        'dpout_model': 0.0,
        'version': 1,
    }
    infer_sent = InferSent(params_model)

    # Without this step the encoder keeps its random initial weights and the
    # resulting sentence embeddings are meaningless.
    # map_location='cpu' lets the checkpoint load on machines without a GPU.
    # torch.load unpickles the file: only load checkpoints from a trusted source.
    if model_path is not None:
        infer_sent.load_state_dict(torch.load(model_path, map_location='cpu'))

    # Load GloVe embeddings
    infer_sent.set_w2v_path(glove_path)
    infer_sent.build_vocab_k_words(K=vocab_size)  # You can change vocab_size to fit your needs
    return infer_sent

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_similarity(sentence1, sentence2, model=None):
    """
    Given two sentences, calculate the similarity score between 0-5 using InferSent.

    Args:
        sentence1: First sentence (raw string).
        sentence2: Second sentence (raw string).
        model: Object exposing ``encode(sentences, tokenize=...)``. When
            omitted, the notebook-level ``infer_sent`` is used, so that
            variable must already be defined.

    Returns:
        Cosine similarity of the two sentence embeddings, rescaled from
        [-1, 1] to [0, 5]. There is no special handling for an embedding
        with zero norm.
    """
    encoder = infer_sent if model is None else model

    # Sentences are passed as raw strings; with tokenize=False the encoder
    # only splits them on whitespace.
    embeddings = encoder.encode([sentence1, sentence2], tokenize=False)
    vec_a, vec_b = embeddings[0], embeddings[1]

    # Compute the cosine similarity between the two sentence embeddings
    cos_sim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    # Floating-point rounding can land slightly outside [-1, 1]; clip so the
    # final score stays within the documented 0-5 range.
    cos_sim = np.clip(cos_sim, -1.0, 1.0)

    # Cosine similarity is between -1 and 1, so we scale it to be between 0 and 5
    similarity = (cos_sim + 1) * 2.5

    return similarity
# sentence1 = "I love machine learning."
# sentence2 = "I love machine learning"
# sentence1 = "The cat sat on the cozy sofa by the window."
# sentence2 = "A cat was relaxing on a comfortable couch near the window."
# sentence1 = "The quick brown fox jumped over the lazy dog."
# sentence2 = "A last emrald of the pearl city green bird barks over a active cat."
# sentence1 = "The sun sets over the ocean."
# sentence2 = "The car engine makes a loud noise."
# sentence1 = "I hate machine learning."
# sentence2 = "Cricket is specifically bad."

# Calculate and print similarity score for a user-supplied sentence pair.
# NOTE: `infer_sent` must already exist in the kernel; calculate_similarity
# reads it as a global.
from scipy.stats import pearsonr, spearmanr

sentence1 = input("Enter Sentence1: ") #"I hate machine learning."
sentence2 = input("Enter Sentence2: ") # "Cricket is specifically bad."
similarity = calculate_similarity(sentence1, sentence2)

# Compute the correlations for THIS sentence pair. They were previously read
# from variables that only a later cell defines, which fails on a fresh kernel
# and otherwise reports values belonging to a different pair.
pair_embeddings = infer_sent.encode([sentence1, sentence2], tokenize=False)
pearson_corr, _ = pearsonr(pair_embeddings[0], pair_embeddings[1])
spearman_corr, _ = spearmanr(pair_embeddings[0], pair_embeddings[1])

print(f"Cosine Similarity (0-5 scale): {similarity}")
print("--"*30)
print(f"Pearson Correlation Coefficient: {pearson_corr}")
print("--"*30)
print(f"Spearman Rank Correlation: {spearman_corr}")

# This cell and the one above work fine for similar sentences

In [None]:
import torch
import numpy as np
from nltk.tokenize import word_tokenize
from models import InferSent
import nltk
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity

# Ensure the NLTK tokenizer data is downloaded.
# Recent NLTK releases look up 'punkt_tab' instead of 'punkt'; requesting both
# keeps word_tokenize usable across versions.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Initialize the InferSent model
params_model = {
    'word_emb_dim': 300,
    'bsize': 64,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': 1,
}
infer_sent = InferSent(params_model)

# Load the pre-trained weights.
# map_location='cpu' lets the checkpoint load on a runtime without a GPU; the
# model itself stays on the CPU either way because it is never moved.
# torch.load unpickles the file: only load checkpoints from a trusted source.
model_path = '/content/InferSent/infersent1.pkl'
infer_sent.load_state_dict(torch.load(model_path, map_location='cpu'))

# Set the GloVe word vectors path
glove_path = '/content/InferSent/glove/glove.6B.300d.txt'
infer_sent.set_w2v_path(glove_path)

# Alternative: build vocabulary with the most frequent words (adjust K based on your dataset)
# infer_sent.build_vocab_k_words(K=100000)

sentence1 = input("Enter Sentence1: ") #"I hate machine learning."
sentence2 = input("Enter Sentence2: ") # "Cricket is specifically bad."
sentences = [sentence1, sentence2]
# The vocabulary is restricted to the words of the two input sentences.
infer_sent.build_vocab(sentences, tokenize=False)  # Set tokenize=True if you need tokenization


# Encode sentences to get embeddings
embeddings = infer_sent.encode(sentences, tokenize=False, bsize=64)

# Compute cosine similarity between the embeddings
similarity = cosine_similarity([embeddings[0]], [embeddings[1]])

similarity_score_0_to_5 = (similarity[0][0] + 1) * 2.5  # Scale to 0-5 range

# Compute Pearson correlation across the embedding dimensions
pearson_corr, _ = pearsonr(embeddings[0], embeddings[1])

# Compute Spearman rank correlation
spearman_corr, _ = spearmanr(embeddings[0], embeddings[1])

# Print results
print(f"Cosine Similarity (0-5 scale): {similarity_score_0_to_5}")
print("--"*30)
print(f"Pearson Correlation Coefficient: {pearson_corr}")
print("--"*30)
print(f"Spearman Rank Correlation: {spearman_corr}")




In [None]:
def calculate_similarity(sentence1, sentence2, model=None):
    """Return a 0-5 similarity score for two sentences.

    The sentences are encoded with tokenization enabled, the cosine
    similarity of the two embeddings is computed, and the result is rescaled
    from [-1, 1] to [0, 5] so that it matches the "(0-5)" label it is
    printed with.

    Args:
        sentence1: First sentence (raw string, not tokenized).
        sentence2: Second sentence (raw string, not tokenized).
        model: Object exposing ``encode(sentences, tokenize=...)``. When
            omitted, the notebook-level ``infer_sent`` is used, so that
            variable must already be defined.

    Returns:
        float in [0, 5]. A zero-norm embedding is treated as having cosine
        similarity 0 (score 2.5).
    """
    encoder = infer_sent if model is None else model

    # Ensure sentences are passed as a list of strings (not tokenized)
    sentences = [sentence1, sentence2]

    # Encode the sentences using the InferSent model
    print("Encoding sentences...")
    embeddings = encoder.encode(sentences, tokenize=True)

    # Compute cosine similarity between the two embeddings
    print("Computing similarity...")
    vec_a = np.asarray(embeddings[0], dtype=float)
    vec_b = np.asarray(embeddings[1], dtype=float)
    denom = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    cos_sim = float(np.dot(vec_a, vec_b)) / denom if denom > 0.0 else 0.0
    # Guard against rounding slightly outside [-1, 1].
    cos_sim = min(1.0, max(-1.0, cos_sim))

    # Rescale [-1, 1] -> [0, 5]
    return (cos_sim + 1.0) * 2.5

# Fixed demo pair used to exercise calculate_similarity
sentence1, sentence2 = "I love machine learning.", "Cricket is fascinating."

# Score the pair, then report it
similarity = calculate_similarity(sentence1, sentence2)
print("Similarity Score (0-5): {}".format(similarity))