## IHLT Project
# Semantic Textual Similarity

### IMPORTS

In [None]:
import os
import math
import nltk
from string import punctuation
from collections import Counter
from scipy.stats import pearsonr
from nltk import word_tokenize, pos_tag
from nltk.metrics import jaccard_distance
from nltk.corpus import stopwords, wordnet
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet_ic')
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
nltk.download('averaged_perceptron_tagger')

# Mount Google Drive (only needed when running on Google Colab)
#from google.colab import drive
#drive.mount('/content/drive')

[nltk_data] Downloading package wordnet to /Users/yaiza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/yaiza/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yaiza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     /Users/yaiza/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yaiza/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### DATA LOAD AND PREPROCESS

In [None]:
def read_sentences(folder_path):
    """
    Load the sentence pairs stored in the 'STS.input.*' files of a folder.

    Files are visited in sorted name order. Each line is stripped and split on
    tab characters; the first field is taken as the first sentence of the pair
    and the second field as the second sentence.

    Parameters:
    - folder_path (str): Path to folder containing files with pairs of sentences.

    Returns:
    - s1 (list of str): List of first sentences.
    - s2 (list of str): List of second sentences.
    """
    first_sentences, second_sentences = [], []

    for name in sorted(os.listdir(folder_path)):
        # Only the input files hold sentence pairs; skip everything else
        if not name.startswith('STS.input.'):
            continue

        with open(os.path.join(folder_path, name), encoding="utf8") as handle:
            for raw_line in handle:
                fields = raw_line.strip().split('\t')
                first_sentences.append(fields[0])
                second_sentences.append(fields[1])

    return first_sentences, second_sentences


In [None]:
def read_gs(folder_path):
    """
    Load the gold standard scores stored in the 'STS.gs.*' files of a folder.

    Files are visited in sorted name order; files whose name starts with
    'STS.gs.ALL' are skipped. Every line is parsed as one float score.

    Parameters:
    - folder_path (str): Path to folder containing files with gold standard scores.

    Returns:
    - gs (list of float): List of gold standard scores.
    """
    gold_scores = []

    for name in sorted(os.listdir(folder_path)):
        is_gold_file = name.startswith('STS.gs.')
        if not is_gold_file or name.startswith('STS.gs.ALL'):
            continue

        with open(os.path.join(folder_path, name), encoding="utf8") as handle:
            gold_scores.extend(float(raw_line.strip()) for raw_line in handle)

    return gold_scores


In [None]:
def tokenize_words(sentences):
    '''
    Tokenize words in sentences.

    Every sentence is tokenized with word_tokenize and its tokens are lowercased.
    Tokens found in string.punctuation are dropped from both outputs; English
    stopwords are additionally dropped from the first output only.

    Parameters:
    - sentences (list of str): List of sentences.

    Returns:
    - clean_sentences (list of list of str): List of tokenized sentences without stopwords.
    - clean_sentences_with_sw (list of list of str): List of tokenized sentences with stopwords.
    '''
    # Build the stopword collection once instead of evaluating
    # stopwords.words('english') for every token; a set also makes the
    # membership test constant-time.
    english_stopwords = set(stopwords.words('english'))

    clean_sentences_with_sw = []
    clean_sentences = []

    for sentence in sentences:
        clean_sentence_with_sw = []
        clean_sentence = []
        for word in word_tokenize(sentence):
            word = word.lower()
            # 'punctuation' is a string, so this is a substring test; kept
            # as-is so the produced tokens do not change.
            if word in punctuation:
                continue
            clean_sentence_with_sw.append(word)
            if word not in english_stopwords:
                clean_sentence.append(word)

        clean_sentences_with_sw.append(clean_sentence_with_sw)
        clean_sentences.append(clean_sentence)

    return clean_sentences, clean_sentences_with_sw

In [None]:
def lemmatize_tokens(pos_tags):
    """
    Given a list with pairs of (word, pos_tag), a list with the lemmas of the words is returned.

    The first letter of the tag selects the WordNet part of speech used for
    lemmatization (N -> noun, V -> verb, R -> adverb, J -> adjective). Words
    whose tag starts with any other letter are returned unchanged.

    Parameters:
    - pos_tags (list): List with tuples of (word, pos_tag).

    Returns:
    - lemmas (list): List with the lemmas of the words.
    """
    lemmas = []
    wnl = nltk.stem.WordNetLemmatizer()

    d = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}

    for p in pos_tags:
        word, tag = p[0], p[1]
        # Tags are multi-letter (e.g. 'NN', 'VBD'), so the lookup must use only
        # the first letter; looking up the whole tag never matched and left
        # every word unlemmatized. Slicing keeps an empty tag safe.
        wordnet_pos = d.get(tag[:1])
        if wordnet_pos is not None:
            lemmas.append(wnl.lemmatize(word, pos=wordnet_pos))
        else:
            lemmas.append(word)
    return lemmas

In [None]:
def preprocess_sentences(sentences):
    """
    Run the preprocessing pipeline over a list of sentences: tokenization,
    part-of-speech tagging and lemmatization.

    Tagging and lemmatization are applied to the tokens without stopwords.

    Parameters:
    - sentences (list of str): List of sentences.

    Returns:
    - word_tokens (list of lists): Tokenized sentences without stopwords.
    - word_tokens_sw (list of lists): Tokenized sentences with stopwords.
    - pos_tags (list of lists): (word, pos_tag) tuples for each sentence.
    - lemmas (list of lists): Lemmas of the words in each sentence.
    """
    tokens_without_sw, tokens_with_sw = tokenize_words(sentences)

    tagged_sentences = []
    lemmatized_sentences = []
    for tokens in tokens_without_sw:
        tagged = pos_tag(tokens)
        tagged_sentences.append(tagged)
        lemmatized_sentences.append(lemmatize_tokens(tagged))

    return tokens_without_sw, tokens_with_sw, tagged_sentences, lemmatized_sentences

### DEFINITION OF FEATURE VARIABLES

In [None]:
def length_difference(words1, words2):
    """
    Compute the relative difference in length between two lists of words.

    The absolute difference of the two lengths is divided by the length of the
    longer list. When both lists are empty, 0 is returned.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - (float): Relative difference in length between the two lists.
    """
    size1, size2 = len(words1), len(words2)
    longest = size1 if size1 >= size2 else size2

    if not longest:
        return 0
    return abs(size1 - size2) / longest

#### Lexical dimensions

In [None]:
def jaccard_similarity(words1, words2):
    """
    Given two lists of words, the Jaccard similarity between the two lists is computed.

    The similarity is 1 minus the Jaccard distance between the sets of words.
    When both lists are empty the union is empty and the distance is undefined
    (it would divide by zero), so 0 is returned, as the other similarity
    features in this file do for empty input.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - jaccard_similarity (float): Jaccard similarity between the two lists of words.
    """
    set1, set2 = set(words1), set(words2)

    # Empty union: avoid the ZeroDivisionError raised by the distance
    if not set1 and not set2:
        return 0.0

    return 1 - jaccard_distance(set1, set2)

In [None]:
def dice_similarity(words1, words2):
    """
    Given two lists of words, the Dice similarity between the two lists is computed.

    The similarity is twice the number of shared distinct words divided by the
    sum of the numbers of distinct words in each list. When both lists are
    empty, 0 is returned instead of dividing by zero.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - dice_similarity (float): Dice similarity between the two lists of words.
    """
    set1 = set(words1)
    set2 = set(words2)

    total_size = len(set1) + len(set2)
    # Both sentences empty (e.g. only stopwords/punctuation): nothing to compare
    if total_size == 0:
        return 0.0

    return (2 * len(set1 & set2)) / total_size

In [None]:
def overlap_similarity(words1, words2):
    """
    Given two lists of words, the overlap similarity between the two lists is computed.

    The similarity is the number of shared distinct words divided by the number
    of distinct words of the smaller list. When either list is empty, 0 is
    returned instead of dividing by zero.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - overlap_similarity (float): Overlap similarity between the two lists of words.
    """
    set1 = set(words1)
    set2 = set(words2)

    smallest_size = min(len(set1), len(set2))
    # An empty sentence shares nothing with the other one
    if smallest_size == 0:
        return 0.0

    return len(set1 & set2) / smallest_size

In [None]:
def bow_cosine_similarity(words1, words2, scoring_method):
    """
    Given two lists of words, the cosine similarity between the two lists is computed using the bag-of-words approach.
    The scoring method for the bag-of-words approach can be 'binary', 'count', or 'frequency'.

    - 'binary': 1 if the word occurs in the list, 0 otherwise.
    - 'count': number of occurrences of the word in the list.
    - 'frequency': number of occurrences divided by the length of the list.

    When either list is empty the similarity is 0 for every scoring method.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.
    - scoring_method (str): Scoring method for the bag-of-words approach. Can be 'binary', 'count', or 'frequency'.

    Returns:
    - cosine_similarity (float): Cosine similarity between the two lists of words.

    Raises:
    - ValueError: If scoring_method is not one of the supported values.
    """
    if scoring_method not in ("binary", "count", "frequency"):
        raise ValueError("Invalid scoring method. Please choose 'binary', 'count', or 'frequency'.")

    # An empty sentence has a zero vector; returning early also avoids the
    # division by the sentence length in 'frequency' mode.
    if not words1 or not words2:
        return 0

    # Count once instead of calling list.count for every vocabulary word
    count1 = Counter(words1)
    count2 = Counter(words2)
    combined_words = set(count1) | set(count2)

    if scoring_method == "binary":
        vector1 = [1 if word in count1 else 0 for word in combined_words]
        vector2 = [1 if word in count2 else 0 for word in combined_words]
    elif scoring_method == "count":
        vector1 = [count1[word] for word in combined_words]
        vector2 = [count2[word] for word in combined_words]
    else:  # "frequency"
        total_words1 = len(words1)
        total_words2 = len(words2)
        vector1 = [count1[word] / total_words1 for word in combined_words]
        vector2 = [count2[word] / total_words2 for word in combined_words]

    dot_product = sum(i * j for i, j in zip(vector1, vector2))
    magnitude1 = sum(i ** 2 for i in vector1) ** 0.5
    magnitude2 = sum(j ** 2 for j in vector2) ** 0.5

    denominator = magnitude1 * magnitude2
    return dot_product / denominator if denominator != 0 else 0

In [None]:
def unigram_similarity(words1, words2):
    """
    Compute the unigram similarity between two lists of words.

    For every distinct word of the first list, the smaller of its occurrence
    counts in both lists is added up; twice that sum is divided by the total
    number of words in both lists. When both lists are empty, 0 is returned.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - unigram_similarity (float): Unigram similarity between the two lists of words.
    """
    combined_length = len(words1) + len(words2)
    if combined_length <= 0:
        return 0

    shared_occurrences = 0
    for word, occurrences in Counter(words1).items():
        shared_occurrences += min(occurrences, words2.count(word))

    return 2 * shared_occurrences / combined_length

In [None]:
def get_word_importance(words):
    """
    Compute an importance weight for every word of a collection of sentences.

    Words are lowercased before counting. The importance of a word is the
    natural logarithm of the total number of words divided by the frequency of
    the word. Each element of 'words' is iterated to obtain its words, so it
    must be a list of tokens rather than a bare string (a string would be
    iterated character by character).

    Parameters:
    - words (list of lists): List of lists of words.

    Returns:
    - importance (dict): Dictionary with the importance of each word.
    """
    frequencies = Counter(token.lower() for sentence in words for token in sentence)
    total_tokens = sum(frequencies.values())

    return {
        token: math.log(float(total_tokens) / float(count))
        for token, count in frequencies.items()
    }

In [None]:
def unigram_similarity_importance(words1, words2):
    """
    Given two lists of words, the importance-weighted unigram similarity between the two lists is computed.

    Every word shared by both lists contributes the smaller of its two
    occurrence counts multiplied by its importance (see get_word_importance),
    computed over the words of both lists. Twice the weighted sum is divided by
    the total number of words. When both lists are empty, 0 is returned.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - unigram_similarity (float): Importance-weighted unigram similarity between the two lists of words.
    """
    total_words = len(words1) + len(words2)
    # Nothing to compare; also avoids max() over an empty importance dict
    if total_words == 0:
        return 0

    # get_word_importance expects a list of sentences (lists of words). Passing
    # the flat concatenation words1 + words2 made it iterate every word
    # character by character, producing per-character weights.
    word_importance = get_word_importance([words1, words2])
    max_importance = max(word_importance.values())

    counts1 = Counter(words1)
    counts2 = Counter(words2)
    count_same = sum(
        min(count, counts2[w]) * word_importance.get(w, max_importance)
        for w, count in counts1.items()
    )

    return 2 * count_same / total_words

#### Syntactic dimensions

In [None]:
def get_most_frequent_synset(word, pos):
    """
    Pick, among the WordNet synsets of a word for a given part of speech, the
    one whose first lemma has the highest count.

    Parameters:
    - word (str): Word.
    - pos (str): Part-of-speech tag.

    Returns:
    - most_frequent_synset (Synset): Most frequent synset, or None when the
      word has no synsets for that part of speech.
    """
    candidates = wordnet.synsets(word, pos=pos)
    if not candidates:
        return None

    def first_lemma_count(synset):
        # Frequency proxy: count attached to the synset's first lemma
        return synset.lemmas()[0].count()

    return max(candidates, key=first_lemma_count)

In [None]:
def synset_similarities(lemmas1, lemmas2):
    """
    Calculate the average path, LCH, WUP and Lin synset similarities between two sentences.

    Each word is mapped to its most frequent synset for its part of speech and
    every synset of the first sentence is compared with every synset of the
    second one. LCH and Lin are only computed for synsets with the same part of
    speech (0 otherwise). A similarity the measure cannot provide (None) is
    counted as 0.

    Parameters:
    - lemmas1 (list): (word, pos_tag) tuples of the first sentence. The
      parameter name is kept for backward compatibility; the body needs the
      part-of-speech tags.
    - lemmas2 (list): (word, pos_tag) tuples of the second sentence.

    Returns:
    - (tuple of float): Average path, LCH, WUP and Lin similarities (0 for a
      measure when no synset pair could be compared).
    """
    d = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    # NOTE(review): NLTK's WordNet information-content files are understood to
    # cover nouns and verbs only, so Lin is limited to those - confirm.
    ic_pos = {'n', 'v'}

    def most_frequent_synsets(tagged_words):
        # Keep only the words that have a known POS and at least one synset
        found = [
            get_most_frequent_synset(word, d[pos[0]])
            for word, pos in tagged_words
            if pos and pos[0] in d
        ]
        return [synset for synset in found if synset is not None]

    def average(values):
        return sum(values) / len(values) if values else 0

    synsets1 = most_frequent_synsets(lemmas1)
    synsets2 = most_frequent_synsets(lemmas2)

    path_similarity, lch_similarity, wup_similarity, lin_similarity = [], [], [], []
    for synset1 in synsets1:
        for synset2 in synsets2:
            # pos is a method on Synset: it has to be called to get the tag
            same_pos = synset1.pos() == synset2.pos()
            path_similarity.append(synset1.path_similarity(synset2) or 0)
            lch_similarity.append((synset1.lch_similarity(synset2) or 0) if same_pos else 0)
            wup_similarity.append(synset1.wup_similarity(synset2) or 0)
            lin_similarity.append(
                (synset1.lin_similarity(synset2, brown_ic) or 0)
                if same_pos and synset1.pos() in ic_pos
                else 0
            )

    return (
        average(path_similarity),
        average(lch_similarity),
        average(wup_similarity),
        average(lin_similarity),
    )

In [None]:
# Cache of the similarities already computed between pairs of lemmas, keyed by
# (lemma1, lemma2, method), so repeated pairs are not recomputed (this reduces
# the computational cost).
computed_synsets_sim = {}

def wordnet_similarity(s1, s2, method):
    """
    Compute the WordNet similarity between two synsets with the given method.

    Parameters:
    - s1 (Synset): First synset (may be None).
    - s2 (Synset): Second synset (may be None).
    - method (str): 'path', 'lch', 'wup' or 'lin'.

    Returns:
    - (float or None): The similarity, or None when a synset is missing, the
      method is unknown, or the method does not apply to the pair ('lch' and
      'lin' need both synsets to share the part of speech; 'lin' is further
      limited to nouns and verbs). The underlying measure may itself return None.
    """
    if s1 is None or s2 is None:
        return None

    if method == "path":
        return s1.path_similarity(s2)

    if method == "wup":
        return s1.wup_similarity(s2)

    if method in ("lch", "lin"):
        # pos is a method: comparing s1.pos with s2.pos compared bound methods,
        # not the part-of-speech tags, so it has to be called.
        pos = s1.pos()
        if pos != s2.pos():
            return None
        if method == "lch":
            return s1.lch_similarity(s2)
        # lin_similarity requires the information-content dictionary.
        # NOTE(review): NLTK's IC files are understood to cover nouns and
        # verbs only, hence the restriction - confirm.
        if pos in {'n', 'v'}:
            return s1.lin_similarity(s2, brown_ic)

    return None

def max_similarity_synsets(l1, l2, method):
    """
    Return the highest similarity between any synset of l1 and any synset of l2.

    Identical lemmas get the fixed top value (3 for 'lch', 1 otherwise) without
    any lookup. Other results are stored in computed_synsets_sim under
    (l1, l2, method) and reused on later calls; 0 is stored and returned when
    no synset pair yields a similarity.

    Parameters:
    - l1 (str): First lemma.
    - l2 (str): Second lemma.
    - method (str): Similarity method passed on to wordnet_similarity.

    Returns:
    - (float): Maximum similarity found, or 0 if none could be computed.
    """
    # Same lemma: return the maximum value directly
    if l1 == l2:
        return 3 if method == "lch" else 1

    # Reuse a previously computed result for this exact request
    cache_key = (l1, l2, method)
    if cache_key in computed_synsets_sim:
        return computed_synsets_sim[cache_key]

    candidates1 = wordnet.synsets(l1)
    candidates2 = wordnet.synsets(l2)

    # Compare every pair of synsets and drop the pairs without a similarity
    scores = [
        wordnet_similarity(first, second, method)
        for first in candidates1
        for second in candidates2
    ]
    scores = [score for score in scores if score is not None]

    best = max(scores) if len(scores) > 0 else 0
    computed_synsets_sim[cache_key] = best
    return best

def synsets_similarity(lemmas1, lemmas2, method):
    """
    Compute a symmetric WordNet similarity between two lists of lemmas.

    For each lemma of one list the best similarity against the lemmas of the
    other list is taken (max_similarity_synsets) and averaged; this is done in
    both directions and the harmonic mean of the two averages is returned.

    Parameters:
    - lemmas1 (list): Lemmas of the first sentence.
    - lemmas2 (list): Lemmas of the second sentence.
    - method (str): Similarity method ('path', 'lch', 'wup' or 'lin').

    Returns:
    - (float): Harmonic mean of the two directed averages; 0 when either list
      is empty or when both averages are 0.
    """
    # An empty sentence has nothing to align: avoids max() over an empty list
    # and the division by the sentence length.
    if not lemmas1 or not lemmas2:
        return 0

    def directed_mean(source, target):
        # Average, over the source lemmas, of their best match in target
        best_matches = (
            max(max_similarity_synsets(s, t, method) for t in target)
            for s in source
        )
        return sum(best_matches) / len(source)

    mean_sim1 = directed_mean(lemmas1, lemmas2)
    mean_sim2 = directed_mean(lemmas2, lemmas1)

    if mean_sim1 > 0 or mean_sim2 > 0:
        return (2 * mean_sim1 * mean_sim2) / (mean_sim1 + mean_sim2)
    return 0

In [None]:
def bigram_similarity(words1, words2):
    """
    Compute the bigram similarity between two lists of words.

    The bigrams (pairs of consecutive words) of each list are counted; for the
    bigrams present in both lists the smaller count is added up. Twice that sum
    is divided by the total number of words in both lists. When both lists are
    empty, 0 is returned.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - similarity (float): Bigram similarity between the two lists of words.
    """
    total_words = len(words1) + len(words2)

    counts1 = Counter(zip(words1, words1[1:]))
    counts2 = Counter(zip(words2, words2[1:]))

    # Counter intersection keeps, for each shared bigram, the smaller count
    shared = sum((counts1 & counts2).values())

    if total_words > 0:
        return 2 * shared / total_words
    return 0

In [None]:
def trigram_similarity(words1, words2):
    """
    Compute the trigram similarity between two lists of words.

    The trigrams (triples of consecutive words) of each list are counted; for
    the trigrams present in both lists the smaller count is added up. Twice
    that sum is divided by the total number of words in both lists. When both
    lists are empty, 0 is returned.

    Parameters:
    - words1 (list): List of words.
    - words2 (list): List of words.

    Returns:
    - similarity (float): Trigram similarity between the two lists of words.
    """
    def trigram_counts(tokens):
        return Counter(zip(tokens, tokens[1:], tokens[2:]))

    first_counts = trigram_counts(words1)
    second_counts = trigram_counts(words2)

    shared = 0
    for trigram, occurrences in first_counts.items():
        # A Counter yields 0 for trigrams missing from the second list
        shared += min(occurrences, second_counts[trigram])

    total_words = len(words1) + len(words2)
    if total_words > 0:
        return 2 * shared / total_words
    return 0

#### Extract all features

In [None]:
def get_features(sentences1, sentences2, type='both'):
    """
    Build the feature vector of every pair of sentences.

    Both lists are preprocessed and, for each pair, features are computed on
    three representations: words without stopwords, words with stopwords and
    lemmas. The three length differences are always included. 'lexical' adds
    the Jaccard, overlap, Dice, unigram and importance-weighted unigram
    similarities; 'syntactic' adds the path, LCH, WUP and Lin synset
    similarities on lemmas plus the bigram and trigram similarities; 'both'
    adds all of them. Any other value yields only the length differences.

    Parameters:
    - sentences1 (list): List of sentences.
    - sentences2 (list): List of sentences.
    - type(str): 'lexical', 'syntactic' or 'both' to select the features to be used.

    Returns:
    - features (list): List of lists with the features of each pair of sentences.
    """
    # Preprocess sentences (the POS tags are not used by the enabled features)
    words1, words_sw1, _pos_tags1, lemmas1 = preprocess_sentences(sentences1)
    words2, words_sw2, _pos_tags2, lemmas2 = preprocess_sentences(sentences2)

    use_lexical = type in ('lexical', 'both')
    use_syntactic = type in ('syntactic', 'both')

    # Each measure is applied to the three representations, in this order.
    # NOTE: the bag-of-words cosine similarities and the synset_similarities
    # features are currently disabled.
    lexical_measures = (
        jaccard_similarity,
        overlap_similarity,
        dice_similarity,
        unigram_similarity,
        unigram_similarity_importance,
    )
    ngram_measures = (bigram_similarity, trigram_similarity)
    synset_methods = ('path', 'lch', 'wup', 'lin')

    feature_matrix = []
    for i in range(len(words1)):
        representations = (
            (words1[i], words2[i]),        # words
            (words_sw1[i], words_sw2[i]),  # words with stopwords
            (lemmas1[i], lemmas2[i]),      # lemmas
        )

        features = [length_difference(a, b) for a, b in representations]

        if use_lexical:
            for measure in lexical_measures:
                features.extend(measure(a, b) for a, b in representations)

        if use_syntactic:
            for method in synset_methods:
                features.append(synsets_similarity(lemmas1[i], lemmas2[i], method))
            for measure in ngram_measures:
                features.extend(measure(a, b) for a, b in representations)

        feature_matrix.append(features)

    return feature_matrix

### MAIN

#### Load data, preprocess it and extract features

In [None]:
# Locations of the train and test data
train_path = '/Users/yaiza/Downloads/train' # '/content/drive/MyDrive/MAI/MAI/IHLT/project/train/'
test_path = '/Users/yaiza/Downloads/test-gold' # '/content/drive/MyDrive/MAI/MAI/IHLT/project/test-gold/'

# Load the sentence pairs and their gold standard scores
s1_train, s2_train = read_sentences(train_path)
gs_train = read_gs(train_path)
s1_test, s2_test = read_sentences(test_path)
gs_test = read_gs(test_path)

# Extract the lexical and syntactic features of every pair
features_train = get_features(s1_train, s2_train, type='both')
features_test = get_features(s1_test, s2_test, type='both')

# Standardize the features; the scaler is fitted on the training set only
scaler = StandardScaler()
scaler.fit(features_train)
scaled_features_train = scaler.transform(features_train)
scaled_features_test = scaler.transform(features_test)

# Save scaled features: one comma-separated row per sentence pair
for output_path, rows in (
    ('/Users/yaiza/Downloads/scaled_features_train.txt', scaled_features_train),
    ('/Users/yaiza/Downloads/scaled_features_test.txt', scaled_features_test),
):
    with open(output_path, 'w') as file:
        file.writelines(f"{','.join(map(str, row))}\n" for row in rows)

# Save gold standard scores: one score per line
for output_path, scores in (
    ('/Users/yaiza/Downloads/gs_train.txt', gs_train),
    ('/Users/yaiza/Downloads/gs_test.txt', gs_test),
):
    with open(output_path, 'w') as file:
        file.writelines(f"{score}\n" for score in scores)

# # Read scaled features
# scaled_features_train = []
# with open('/Users/yaiza/Downloads/github/features_train2.txt', 'r') as file:
#     for line in file:
#         scaled_features_train.append(list(map(float, line.strip().split(','))))

# scaled_features_test = []
# with open('/Users/yaiza/Downloads/github/features_test2.txt', 'r') as file:
#     for line in file:
#         scaled_features_test.append(list(map(float, line.strip().split(','))))



KeyboardInterrupt: 

#### Train model

In [None]:
# Fit a multi-layer perceptron regressor on the scaled training features;
# early stopping holds out part of the training data for validation
mlp = MLPRegressor(max_iter=2000, early_stopping=True)
mlp = mlp.fit(scaled_features_train, gs_train)
best_validation_score = mlp.best_validation_score_
print(f'Best validation score: {best_validation_score}')

Best validation score: 0.602944749582186


In [None]:
# Score the test pairs with the trained model
predictions = mlp.predict(scaled_features_test)

# Evaluate: Pearson correlation between predictions and gold standard scores
pearson_correlation = pearsonr(predictions, gs_test)[0]
print(f'Pearson correlation coefficient: {pearson_correlation}')

Pearson correlation coefficient: 0.6657581160950876


### Things to change
- StandardScaler
- More distances from sklearn