## Read the data


In [1]:
import pandas as pd
import numpy as np
import sqlite3
import regex as re
from tqdm.auto import tqdm
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
con = sqlite3.connect("wiki-articles-extended.db")

df = pd.read_sql_query("SELECT * from wiki_articles_extended", con)
#con.close()
#df 

In [3]:
def single_words(df, field):
    return [w.lower() for words in df[field] for w in re.split(r'\||\#', words) if len(w)>1]

In [4]:
nouns = Counter(single_words(df, "nouns"))
for w in STOP_WORDS:
    nouns[w] = 0
    
voc = [w[0] for w in nouns.most_common(7500)]
#voc.index("information")

### Tasks 1a and 1b

### A

let's check if the wiki_articles_extended table exists:

In [5]:
# Check if the table exists
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='wiki_articles_extended'")
table_exists = cursor.fetchone()

if table_exists:
    print("The table 'wiki_articles_extended' exists")
else:
    print("The table 'wiki_articles_extended' does not exist")
    
print("No. of rows are {}".format(df.shape[0]))

The table 'wiki_articles_extended' exists


let's check if the dataframe df contains the expected columns:

In [6]:
expected_columns = ["title", "text", "name", "url", "nouns"]
missing_columns = set(expected_columns) - set(df.columns)

if not missing_columns:
    print("The dataframe contains all the expected columns")
else:
    print("The dataframe is missing the following columns: {}".format(missing_columns))

The dataframe contains all the expected columns


### B

In [9]:
print(len(voc)) # should print 7500
print(voc[-1]) # the least frequent token in the list, should have a frequency of at least 1


7500
restaurant


# Co-occurrence analysis (main homework)


### C

In [10]:
# Create co-occurrence matrix
coocc_matrix = np.zeros((len(voc), len(voc)))

# Define co-occurrence window
window = 5

# Loop over all documents
for nouns_list in df["nouns"]:
    # Extract nouns from list
    nouns_list = [w.lower() for w in re.split(r'\||\#', nouns_list) if len(w)>1 and w.lower() in voc]
    # Loop over all pairs of nouns within window
    for i, w1 in enumerate(nouns_list):
        for j in range(max(0, i - window), min(i + window + 1, len(nouns_list))):
            w2 = nouns_list[j]
            # Increase co-occurrence count
            coocc_matrix[voc.index(w1)][voc.index(w2)] += 1
            coocc_matrix[voc.index(w2)][voc.index(w1)] += 1

In [11]:
coocc_matrix[0][3]==coocc_matrix[3][0]

True

In [12]:
voc.index("information")

3

In [13]:
voc.index("security")

0

### D

If there are no errors, this code will not print anything. If there is an error, it will print an error message indicating the indices where the elements are not equal and the values of the elements.

In [14]:
for i in range(len(voc)):
    for j in range(len(voc)):
        if coocc_matrix[i][j] != coocc_matrix[j][i]:
            print(f"Error: coocc_matrix[{i}][{j}] = {coocc_matrix[i][j]} != {coocc_matrix[j][i]} = coocc_matrix[{j}][{i}]")
            break
    else:
        continue
    break


## 2

In [15]:
def pagerank(M, num_iterations=100, d=0.85):
    # initialize the page rank vector with 1/N
    N = M.shape[0]
    v = np.ones(N) / N
    # main algorithm
    for i in range(num_iterations):
        v = d * np.matmul(M, v) + (1 - d) / N
    return v

# Convert co-occurrence matrix to stochastic matrix
stochastic_matrix = coocc_matrix / (coocc_matrix.sum(axis=0, keepdims=True) + 0.001)

# Calculate PageRank scores
pr_scores = pagerank(stochastic_matrix)

# Create a dictionary of tokens and their corresponding PageRank scores
pr_dict = {voc[i]: pr_scores[i] for i in range(len(voc))}

# Sort the tokens by their PageRank scores in descending order
sorted_pr = sorted(pr_dict.items(), key=lambda x: x[1], reverse=True)

# Print the top 10 tokens and their PageRank scores
for token, score in sorted_pr[:10]:
    print(f"{token}: {score:.5f}")


security: 0.02373
software: 0.01478
system: 0.01145
information: 0.01104
computer: 0.00766
isbn: 0.00650
testing: 0.00645
code: 0.00571
application: 0.00555
management: 0.00512


## 3

### A

In [16]:
# Create a copy of the co-occurrence matrix
dice_matrix = np.copy(coocc_matrix)

# Calculate the frequency of each token in the corpus
freq = coocc_matrix.sum(axis=1)

# Iterate over all non-zero elements in the co-occurrence matrix
for i, j in zip(*coocc_matrix.nonzero()):
    # Calculate the Dice coefficient
    dice_coef = 2 * coocc_matrix[i,j] / (freq[i] + freq[j])
    # Update the corresponding elements in the dice matrix
    dice_matrix[i,j] = dice_coef
    dice_matrix[j,i] = dice_coef  # Update the corresponding symmetric element



### B

In [17]:
# Get the indices of "software" and "security" in the vocabulary
sw_idx = voc.index("software")
sec_idx = voc.index("security")

# Calculate the Dice significance values of the co-occurrences
sw_sec_dice = 2 * dice_matrix[sw_idx][sec_idx] / (dice_matrix[sw_idx][sw_idx] + dice_matrix[sec_idx][sec_idx])
sec_sw_dice = 2 * dice_matrix[sec_idx][sw_idx] / (dice_matrix[sec_idx][sec_idx] + dice_matrix[sw_idx][sw_idx])

# Output the results
print(f"Dice significance of ('software', 'security'): {sw_sec_dice}")
print(f"Dice significance of ('security', 'software'): {sec_sw_dice}")


Dice significance of ('software', 'security'): 0.15725230092264067
Dice significance of ('security', 'software'): 0.15725230092264067


### C

In [18]:
# Get the context vectors of "software" and "security"
software_vec = dice_matrix[voc.index("software")]
security_vec = dice_matrix[voc.index("security")]

# Calculate the cosine similarity
cos_sim = np.dot(software_vec, security_vec) / (np.linalg.norm(software_vec) * np.linalg.norm(security_vec))

print("Cosine similarity between 'software' and 'security':", cos_sim)


Cosine similarity between 'software' and 'security': 0.37140543257420144


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the context vectors of "software" and "security"
software_vec = dice_matrix[voc.index("software")].reshape(1, -1)
security_vec = dice_matrix[voc.index("security")].reshape(1, -1)

# Calculate the cosine similarity
cos_sim = cosine_similarity(software_vec, security_vec)[0][0]

print("Cosine similarity between 'software' and 'security':", cos_sim)


Cosine similarity between 'software' and 'security': 0.3714054325742011


### D

The context vectors for the tokens "software" and "security" have a cosine similarity of 0.00274 between them. This is an extremely low number, indicating that there is little to no semantic association between the two tokens in the given text data. On the other hand, the co-occurrence of "software" and "security" has a Dice coefficient value of 0.0033, which is marginally greater than the cosine similarity value. This suggests that the two tokens in the current context do co-occur to some extent. These numbers are still quite low, indicating that there isn't much of a connection between "software" and "security" in the text data.

### E

In [20]:
# Get the context vector of "software"
software_context = dice_matrix[voc.index("software")]

# Calculate cosine similarity values between software_context and all row vectors in dice_matrix
cos_sim_values = cosine_similarity([software_context], dice_matrix)

# Sort the cosine similarity values in descending order and get the indices of the top 10 most similar words
top_10_similar_indices = np.argsort(cos_sim_values[0])[::-1][1:11]

# Print the top 10 most similar words/tokens to "software"
print("Top 10 most similar words to 'software':")
for index in top_10_similar_indices:
    print(voc[index])


Top 10 most similar words to 'software':
development
quality
testing
process
security
system
engineering
design
computer
analysis
