In [14]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors
import re
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
from xgboost import XGBClassifier

In [2]:
# Load pre-trained Word2Vec model
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

# Normalize the vectors
model.init_sims(replace=True)

  model.init_sims(replace=True)


In [3]:
def word_tokenizee(text):
    # Define a regular expression pattern to match words
    pattern = r'\b\w+\b'
    
    # Use the findall method to extract words from the text
    words = re.findall(pattern, text.lower())
    
    return words

In [4]:
def remove_stop_words(phrase):
    custom_stop_words = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])
    word_tokens = phrase.split()
    filtered_phrase = [word for word in word_tokens if word.lower() not in custom_stop_words]
    filtered_phrase = ' '.join(filtered_phrase)
    return filtered_phrase

In [5]:
def get_tfidf_dictionary(train_phrases):
    positive_phrases = [pair[0] for pair in train_phrases if pair[2] == 1]
    negative_phrases = [pair[1] for pair in train_phrases if pair[2] == 0]
    all_phrases = positive_phrases + negative_phrases
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(all_phrases)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    print("creation started")
    tfidf_dict = {}
    for j, word in enumerate(feature_names):
        word_tfidf_scores = {}
        for i, phrase in enumerate(all_phrases):
            score = tfidf_matrix[i, j]
            if score > 0:
                word_tfidf_scores[phrase] = score
        tfidf_dict[word] = word_tfidf_scores
    return tfidf_dict

In [11]:

def generate_phrase_embedding(phrase, word_embeddings):
    tokens = word_tokenizee(phrase.lower())  # Tokenize phrase into words
    phrase_embedding = np.zeros(word_embeddings.vector_size)  # Initialize phrase embedding
    # total_weight = 0  # Initialize total weight of valid tokens
    # for token in tokens:
    #     if token in word_embeddings and token in tfidf_dict:
    #         tfidf_score = max(tfidf_dict[token].values())  # Use the maximum TF-IDF score for the token
    #         phrase_embedding += word_embeddings[token] * tfidf_score  # Add weighted word embedding to phrase embedding
    #         total_weight += tfidf_score  # Increment total weight
    # if total_weight > 0:
    #     phrase_embedding /= total_weight  # Average pooling weighted embeddings
    # return phrase_embedding
    valid_tokens_count = 0  # Initialize count of valid tokens
    for token in tokens:
        if token in word_embeddings:
            phrase_embedding += word_embeddings[token]  # Add word embedding to phrase embedding
            valid_tokens_count += 1  # Increment count of valid tokens
    if valid_tokens_count > 0:
        phrase_embedding /= valid_tokens_count  # Average pooling embeddings
    return phrase_embedding

In [7]:
def cosine_similarity(embedding1, embedding2):
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity

In [8]:
dataset = load_dataset("PiC/phrase_similarity",split="train")
train_phrases=[]
for i in range(0,7004):
    phrase1=dataset[i]["phrase1"]
    phrase2=dataset[i]["phrase2"]
    phrase1=remove_stop_words(phrase1)
    phrase2=remove_stop_words(phrase2)
    label=dataset[i]["label"]
    tu=(phrase1,phrase2,label)
    train_phrases.append(tu)
print("Dataset Imported")
#dict1=get_tfidf_dictionary(train_phrases)

Dataset Imported


In [16]:
# Generate phrase embeddings for training data
dataset1 = load_dataset("PiC/phrase_similarity",split="test")
test_phrases=[]
for i in range(0,2000):
    phrase1=dataset1[i]["phrase1"]
    phrase2=dataset1[i]["phrase2"]
    phrase1=remove_stop_words(phrase1)
    phrase2=remove_stop_words(phrase2)
    label=dataset1[i]["label"]
    tu=(phrase1,phrase2,label)
    test_phrases.append(tu)
print("Test Dataset Loaded")
train_similarity = []
train_labels = []
for phrase1, phrase2, label in train_phrases:
    embedding1 = generate_phrase_embedding(phrase1, model)
    embedding2 = generate_phrase_embedding(phrase2, model)
    if np.count_nonzero(embedding1) == 0 or np.count_nonzero(embedding2) == 0:
        similarity = 0  # Handle zero division
    else:
        similarity = cosine_similarity(embedding1, embedding2)  # Calculate similarity
    train_similarity.append(similarity)
    train_labels.append(label)

# Convert lists to arrays
train_similarity = np.array(train_similarity).reshape(-1, 1)
train_labels = np.array(train_labels)

# Train a logistic regression classifier directly on the similarity
classifier = LogisticRegression()
classifier.fit(train_similarity.reshape(-1, 1), train_labels)


# Example test phrases
test_similarity = []
true_labels = []
for phrase1, phrase2, true_label in test_phrases:
    embedding1 = generate_phrase_embedding(phrase1, model)
    embedding2 = generate_phrase_embedding(phrase2, model)
    similarity = cosine_similarity(embedding1, embedding2)  # Calculate similarity
    test_similarity.append(similarity)
    true_labels.append(true_label)

# Convert lists to arrays
test_similarity = np.array(test_similarity)

# Predict similarity using the trained logistic regression model
predicted_labels = classifier.predict(test_similarity.reshape(-1, 1))
# Evaluate performance
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

Test Dataset Loaded
Accuracy: 0.508


  dist = 1.0 - uv / np.sqrt(uu * vv)
