In [5]:
# This section prints out the runtime configuration.
# Please don't edit this code block and keep this in your code.
# When submitting your notebook, please ensure this block is executed as well
# Please ignore all error messages in this block.
# For course work 1, please only use CPU for calculation. You can change it to CPU only by selecting:
# Runtime -> Change runtime type -> Hardware accelerator: None (or select GPU/TPU as required).
!cat /proc/cpuinfo | grep "model name" | uniq
!nvidia-smi

model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
/bin/bash: line 1: nvidia-smi: command not found


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
with open('./data/WikiText-103.txt', 'r') as file:
  corpus = file.read()

# Regex pattern to split the corpus
pattern = r'(?<!\=)\= [^\=]+ \= (?!\=)'
articles = re.split(pattern, corpus)

In [9]:
# Defining a fucntion to clean the text
def clean_text(text):
  # Convert to lower case
  text = text.lower()

  # Remove punctuation
  text = re.sub(r'[^\w\s]', '', text)

  # Remove numbers
  text = re.sub(r'\d+', '', text)

  # Remove newline characters
  text = re.sub('\n', '', text)

  # Remove square brackets
  text = re.sub('\[.*?\]', '', text)

  # Remove HTML tags
  text = re.sub('https?://\S+|www\.\S+', '', text)

  # Remove urls
  text = re.sub('<.*?>+', '', text)

  # Tokenization
  tokens = word_tokenize(text)

  text = ' '.join(tokens)

  return text

In [10]:
# Clean the article
articles = [clean_text(article) for article in articles]
len(articles)

10728

In [11]:
# Define the stop words list
stop_words = stopwords.words("english")

# Using TFIDF to get the vectorizer
tfidf_vectorizer = TfidfVectorizer(analyzer="word", stop_words=stop_words, sublinear_tf=True, ngram_range=(1, 3), min_df=5, max_df=0.3)
features = tfidf_vectorizer.fit_transform(articles)

# Size of vocabulary
print("Size of vocabulary:" , len(tfidf_vectorizer.get_feature_names_out()))

# Dimensions of feature matrix
print("Dimensions of feature matrix:" , features.shape)

Size of vocabulary: 428434
Dimensions of feature matrix: (10728, 428434)


In [12]:
# Function to retrieve the vector representation of a token
def get_vector(token):
  # If the token is in the vocabulary, return its vector
  if token in tfidf_vectorizer.vocabulary_:
    index = tfidf_vectorizer.vocabulary_[token]
    return features[:, index].toarray()
  # Handling Out-of-Vocabulary (OOV) words by returning None
  else:
    return None

In [13]:
# Function to calculate the cosine similarity matrix between two lists of tokens
def calculate_similarity_matrix(tokens_1, tokens_2):
  similarities = []
  for token1 in tokens_1:
    for token2 in tokens_2:
      # Retrieve vectors for each pair of tokens
      vector1 = get_vector(token1)
      vector2 = get_vector(token2)

      # Handling OOV words by assigning a similarity of 0.5
      if vector1 is None or vector2 is None:
        similarities.append(0.5)
      else:
        # Reshape the vectors for cosine similarity calculation
        vector1 = vector1.reshape(1, -1)
        vector2 = vector2.reshape(1, -1)

        # Calculate cosine similarity and extract the scalar value from the matrix
        cosine_similarity_matrix_value = cosine_similarity(vector1, vector2)[0, 0]

        similarities.append(cosine_similarity_matrix_value)

  return similarities

In [14]:
# Function to get the consine similarity matrix between two terms
def get_cosine_similarity(term_1, term_2):
  # Check if either of the term contains spaces, indicating multi-word phrases
  if (' ' in term_1 or ' ' in term_2):
      # Calculate the similarity for every pair of words in term_1 and term_2
      term_1_tokens = term_1.split()
      term_2_tokens = term_2.split()

      similarities = calculate_similarity_matrix(term_1_tokens, term_2_tokens)

      # All tokens are OOV, handling by returning a default similarity of 0.5
      if not similarities:
        return 0.5

      # Return the average similarity for multi-word phrases
      return np.mean(similarities)

  else:
    # Calculate similarity between single words
    vector1 = get_vector(term_1)
    vector2 = get_vector(term_2)

    # Handling OOV words by returning a default similarity of 0.5
    if vector1 is None or vector2 is None:
      return 0.5

    # Reshape the vectors for cosine similarity calculation
    vector1 = vector1.reshape(1, -1)
    vector2 = vector2.reshape(1, -1)

    # Calculate cosine similarity and extract the scalar value from the matrix
    cosine_similarity_matrix_value = cosine_similarity(vector1, vector2)[0, 0]

    # Return the similarity between single words
    return cosine_similarity_matrix_value

In [23]:
# Load the CSV file into a DataFrame
validation_df = pd.read_csv("./data/CW-1-testdata.csv", header=None)

validation_df.columns = ["id","term1", "term2"]

# Clean the input values
validation_df[["term1", "term2"]] = validation_df[["term1", "term2"]].map(clean_text)

# Produce the output
validation_df["cosine_similarity"] = validation_df.apply(lambda x: get_cosine_similarity(x["term1"], x["term2"]), axis=1)

# Create a .csv file
validation_df.to_csv("./data/10927437_task1_results.csv", header=False, index=False)