In [31]:
# This section prints out the runtime configuration.
# Please don't edit this code block and keep this in your code.
# When submitting your notebook, please ensure this block is executed as well
# Please ignore all error messages in this block.
# For course work 1, please only use CPU for calculation. You can change it to CPU only by selecting:
# Runtime -> Change runtime type -> Hardware accelerator: None (or select GPU/TPU as required).
!cat /proc/cpuinfo | grep "model name" | uniq
!nvidia-smi

model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
/bin/bash: line 1: nvidia-smi: command not found


In [19]:
pip install gensim



In [20]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [21]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
with open('./data/WikiText-103.txt', 'r') as file:
  corpus = file.read()

# Regex pattern to split the corpus
pattern = r'(?<!\=)\= [^\=]+ \= (?!\=)'
articles = re.split(pattern, corpus)
len(articles)

10728

In [23]:
# Defining a fucntion to clean the text
def clean_text(text):
  # Convert to lower case
  text = text.lower()

  # Remove punctuation
  text = re.sub(r'[^\w\s]', '', text)

  # Remove numbers
  text = re.sub(r'\d+', '', text)

  # Remove newline characters
  text = re.sub('\n', '', text)

  # Remove square brackets
  text = re.sub('\[.*?\]', ' ', text)

  # Remove HTML tags
  text = re.sub('https?://\S+|www\.\S+', ' ', text)

  # Remove urls
  text = re.sub('<.*?>+', ' ', text)

  return text

articles = [clean_text(article) for article in articles]
len(articles)

10728

In [24]:
# Initalize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define the stop words set
stop_words = set(stopwords.words('english'))

# Funtion to tokenize, lemmatize and remove stop words
def process_text(list_of_strings):

  processed_strings = []

  for text in list_of_strings:
    # Tokenize the text
    tokens = word_tokenize(text)

    # Lemmatize each token and remove stop words
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Append the processed sentence to the result list
    processed_strings.append(lemmatized_tokens)

  return processed_strings

# Process the articles
tokenized_articles = process_text(articles)

In [25]:
# Train the Word2Vec model using Skip-gram
word2vec_model = Word2Vec(sentences=tokenized_articles, vector_size=100, window=5, min_count=1, workers=4, sg=1)
word2vec_model.save("word2vec_model.model")

# Size of vocabulary
print("Size of vocabulary:", len(word2vec_model.wv))

Size of vocabulary: 154891


In [26]:
# Function to retrieve the vector representation of a token
def get_vector(word):
  # If the word is in the vocabulary, return its vector
  if word in word2vec_model.wv.key_to_index:
    word_array = word.split()
    vectors = [word2vec_model.wv[w] for w in word_array]

    # Handling Out-of-Vocabulary (OOV) words by returning None
    if len(vectors) == 0:
      return None

    return np.mean(vectors, axis=0).reshape(1, -1)

  # Handling Out-of-Vocabulary (OOV) words by returning None
  else:
    return None

In [27]:
# Function to calculate the cosine similarity matrix between two lists of tokens
def calculate_similarity_matrix(tokens1, tokens2):
  similarities = []
  for token1 in tokens1:
    for token2 in tokens2:
      # Retrieve vectors for each pair of tokens
      vector1 = get_vector(token1)
      vector2 = get_vector(token2)

      # Handling OOV words by assigning a similarity of 0.5
      if vector1 is None or vector2 is None:
        similarities.append(0.5)
      else:
        # Reshape the vectors for cosine similarity calculation
        vector1 = vector1.reshape(1, -1)
        vector2 = vector2.reshape(1, -1)

        # Calculate cosine similarity and extract the scalar value from the matrix
        cosine_similarity_matrix = cosine_similarity(vector1, vector2)[0, 0]

        similarities.append(cosine_similarity_matrix)

  return similarities

In [28]:
# Function to get the consine similarity matrix between two terms
def get_cosine_similarity(term1, term2):
  # Check if either term contains spaces, indicating multi-word phrases
  if (' ' in term1 or ' ' in term2):
    # Calculate the similarity for every pair of words in term1 and term2
    term1_tokens = term1.split()
    term2_tokens = term2.split()

    similarities = calculate_similarity_matrix(term1_tokens, term2_tokens)

    # All tokens are OOV, handling by returning a default similarity of 0.5
    if not similarities:
        return 0.5

    # Return the average similarity for multi-word phrases
    return np.mean(similarities)

  else:
    # Calculate similarity between single words
    vector1 = get_vector(term1)
    vector2 = get_vector(term2)

    if vector1 is None or vector2 is None:
        # Handling OOV words by returning a default similarity of 0.5
        return 0.5

    # Calculate cosine similarity
    cosine_similarity_matrix = cosine_similarity(vector1, vector2)

    # Extract the scalar value from the matrix
    cosine_similarity_value = cosine_similarity_matrix[0, 0]

    # Return the similarity between single words
    return cosine_similarity_value

In [30]:
# Load the CSV file into a DataFrame with specified column names
validation_df = pd.read_csv("./data/CW-1-testdata.csv", header=None)

validation_df.columns = ["id","term1", "term2"]

# Clean the input values
validation_df[["term1", "term2"]] = validation_df[["term1", "term2"]].map(clean_text)

# Produce the output
validation_df["cosine_similarity"] = validation_df.apply(lambda x: get_cosine_similarity(x["term1"], x["term2"]), axis=1)

# Create a .csv file
validation_df.to_csv("./data/10927437_task2_results.csv", header=False, index=False)